In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [3]:
df = pd.read_csv('/kaggle/input/spam-classification-for-basic-nlp/Spam Email raw text for NLP.csv', index_col=False)

In [4]:
msgs = df.MESSAGE.astype('str').values

In [5]:
labels = df.CATEGORY.values
#y = np.zeros((len(labels), 2))
#for i in range(len(labels)):
 #   y[i, labels[i]]=1
    
#labels = y


labels.shape

(5796,)

In [6]:
pd.DataFrame(labels).value_counts()

0    3900
1    1896
Name: count, dtype: int64

In [8]:
np.random.seed(0)
np.random.shuffle(msgs)
np.random.seed(0)
np.random.shuffle(labels)

In [9]:
print(msgs.shape, labels.shape)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(msgs, labels, shuffle=True, random_state=0, train_size=0.8)

(5796,) (5796,)


In [10]:
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(Xtrain)

In [11]:
Xtokens = tokenizer.texts_to_sequences(Xtrain)

In [12]:
maxlen = np.max(list(map(lambda x : len(x), Xtokens)))

In [13]:
maxlen

14544

In [14]:
len(Xtokens[0]), len(Xtokens[1])

(73, 148)

In [15]:
class CustomDataset():
    def __init__(self, X, y, pad_sequences, maxlen=14544):
        self.msgs = X
        self.labels = y
        self.maxlen = maxlen
        self.pad_sequences = pad_sequences
    
    def __len__(self):
        return len(self.msgs)
    
    def __getitem__(self, idx):
        msg = [self.msgs[idx]]
        label = self.labels[idx]
        label = label.reshape((1))
        return {
            'msg' : torch.tensor(msg).long(),
            'target' : torch.tensor(label).float()
        }

In [17]:
Xtrainseq = np.array(Xtrainseq)
Xtrainseq, Xvalseq, Ytrain, Yval = train_test_split(Xtrainseq, Ytrain, train_size=0.8)
train_dataset = CustomDataset(Xtrainseq, Ytrain, pad_sequences, maxlen)
val_dataset = CustomDataset(Xvalseq, Yval, pad_sequences)

In [18]:
train_dataset[0]['msg'].shape

torch.Size([1, 14544])

In [19]:
Xtesttokens = tokenizer.texts_to_sequences(Xtest)

In [20]:
pd.DataFrame(Ytrain).value_counts()

0    2491
1    1217
Name: count, dtype: int64

In [24]:
test_dataset = CustomDataset(Xtestseq, Ytest, pad_sequences)

In [25]:
from tqdm.auto import tqdm
test_dataset[0]['msg'].shape

torch.Size([1, 14544])

In [26]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

In [27]:
class SimpleModel(nn.Module):
    def __init__(self, num_words, embedding_size):
        super(SimpleModel, self).__init__()
        self.embedding_size = embedding_size
        self.embed = nn.Embedding(num_words, embedding_size, max_norm=2)
        layers = [
            nn.Flatten(),
            nn.Linear(embedding_size*14544, 1),
            #nn.Dropout(p=0.2, inplace=True),
        ]
        self.model = nn.Sequential(*layers)
        self.sig = nn.Sigmoid()
        
    def forward(self, x):
        x=self.embed(x)
        out=self.sig(self.model(x))
        
        return out

In [28]:
def trainer(train_loader, model, optimizer, criterion):
    model.to('cuda').train()
    it_loss = 0
    counter = 0
    for i, data in enumerate(tqdm(train_loader, total=len(train_loader))):
        optimizer.zero_grad()
        msgs = data['msg'].to('cuda')
        targets = data['target'].to('cuda')
        out = model(msgs)
        #print(out)
        #print(out.shape)
        #targets = targets.reshape((targets.shape[0], 1))
        #print(targets.shape)
        loss = criterion(out, targets)
        #print(loss)
        loss.backward()
        optimizer.step()
        
        it_loss+=loss.item()*msgs.shape[0]
        counter+=msgs.shape[0]
        
    return it_loss/counter

In [29]:
def tester(test_loader, model, criterion):
    model.to('cuda').eval()
    it_loss=0
    counter=0
    for i, data in enumerate(tqdm(test_loader, total=len(test_loader))):
        msgs = data['msg'].to('cuda')
        targets = data['target'].to('cuda')
        with torch.no_grad():
            out = model(msgs)
            #targets = targets.reshape((targets.shape[0], 1))
            loss = criterion(out, targets)
            
            it_loss+=loss.item()*msgs.shape[0]
            counter+=msgs.shape[0]
        
    return it_loss/counter

In [46]:
model = SimpleModel(30000, 64)
model.to('cuda')
epochs = 10
lr = 1e-3
optimizer = torch.optim.RAdam(model.parameters(), lr=lr)

criterion = nn.BCELoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)
model

SimpleModel(
  (embed): Embedding(30000, 64, max_norm=2)
  (model): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=930816, out_features=1, bias=True)
  )
  (sig): Sigmoid()
)

In [47]:
from tqdm.notebook import trange
train_losses=[]
test_losses=[]
best_loss=np.inf
for epoch in trange(epochs, desc='Training model'):
    #print(f'\nBegan iteration {epoch+1}')
    train_loss = trainer(train_loader, model, optimizer, criterion)
    test_loss = tester(val_loader, model, criterion)
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    scheduler.step(test_loss)
    print(f'Test Loss : {test_loss}')
    if test_loss<best_loss:
        best_loss=test_loss
        dic={
            'model':model.state_dict()
        }
        torch.save(dic, './Bestmodel.model')
        print('Improved and saved the model')

#ignore the render error in this cell

Training model:   0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 1.653956335166405
Improved and saved the model


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.8013249923442972
Improved and saved the model


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 2.2359741843979934


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.1241029484904018
Improved and saved the model


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.11597745788508448
Improved and saved the model


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.08075145061994934
Improved and saved the model


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.05501894787724676
Improved and saved the model


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.2798805814878694


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.0559190696904211


  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

Test Loss : 0.11516841318345918


In [57]:
check_point = torch.load('./Bestmodel.model')
skand = SimpleModel(30000, 64)
skand.to('cuda')
skand.load_state_dict(check_point['model'])

<All keys matched successfully>

In [58]:

skand.to('cuda').eval()
pred=[]
for i, data in enumerate(tqdm(test_loader, total=len(test_loader))):
    msgs = data['msg'].to('cuda')
    with torch.no_grad():
        out = skand(msgs)
        pred.append(out.to('cpu').numpy())
        
pred = np.concatenate(pred)    
#pred = pred.argmax(1)
#ignore the render error in this cell

  0%|          | 0/37 [00:00<?, ?it/s]

In [59]:
pred

array([[9.9754459e-01],
       [1.6074949e-03],
       [8.1415297e-03],
       ...,
       [9.5807598e-04],
       [3.7896209e-03],
       [1.0000000e+00]], dtype=float32)

In [62]:
count=0
for i in range(len(pred)):
    if (pred[i]>=0.5 and Ytest[i]==1):
        count+=1
print(count)
pd.DataFrame(Ytest).value_counts()

362


0    782
1    378
Name: count, dtype: int64

In [16]:
Xtrainseq = pad_sequences(Xtokens, maxlen=maxlen, padding='post')

In [21]:
Xtestseq = pad_sequences(Xtesttokens, maxlen=maxlen, padding='post')

In [22]:
Xtrainseq.shape

(3708, 14544)

In [23]:
Xtestseq.shape

(1160, 14544)

In [32]:
import tensorflow as tf
inputs = tf.keras.Input(shape=(13006, ))
embedding = tf.keras.layers.Embedding( 
    input_dim = 30000,
    output_dim = 64
)(inputs)
flatten = tf.keras.layers.Flatten()(embedding)
outputs = tf.keras.layers.Dense(2, activation='sigmoid')(flatten)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss = 'binary_crossentropy',
    metrics = [
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

print(model.summary())

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 13006)]           0         
                                                                 
 embedding_3 (Embedding)     (None, 13006, 64)         1920000   
                                                                 
 flatten_3 (Flatten)         (None, 832384)            0         
                                                                 
 dense_2 (Dense)             (None, 2)                 1664770   
                                                                 
Total params: 3584770 (13.67 MB)
Trainable params: 3584770 (13.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [33]:
history = model.fit(Xtrainseq, Ytrain, validation_split=0.2, batch_size=32, epochs=100,
                   callbacks=[
                       tf.keras.callbacks.EarlyStopping(
                       monitor = 'val_loss',
                       patience=3,
                       restore_best_weights=True)
                   ])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [36]:
results = model.evaluate(Xtestseq, Ytest, verbose=0)


In [97]:
print(f'val_loss : {results[0]}\naccuracy : {results[1]*100}%\nauc : {results[2]}')

val_loss : 0.029011981561779976
accuracy : 99.48245882987976%
auc : 0.998275637626648


In [38]:
with open('./kerasModel.model', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [53]:
with open('./kerasTokenizer.Tokenizer', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [54]:
with open('./sequencePadder.PadSequences', 'wb') as handle:
    pickle.dump(pad_sequences, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [63]:
with open('./tuner.tuner', 'wb') as handle:
    pickle.dump({ 
        'maxlen':14544,
        'num_words' : 30000,
        'embedding_size':64
    }, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
pred = model.predict(Xtestseq)



In [43]:
pred 

array([[1.4426785e-04, 9.9984843e-01],
       [7.6759284e-07, 9.9999923e-01],
       [9.9859142e-01, 1.3892701e-03],
       ...,
       [9.9996293e-01, 3.8821665e-05],
       [9.9363053e-01, 6.3473103e-03],
       [9.9924135e-01, 7.5421837e-04]], dtype=float32)