In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'google.protobuf'

In [2]:
df = pd.read_csv('/kaggle/input/spam-classification-for-basic-nlp/Spam Email raw text for NLP.csv', index_col=False)

In [3]:
msgs = df.MESSAGE.astype('str').values

In [4]:
labels = df.CATEGORY.values
y = np.zeros((len(labels), 2))
for i in range(len(labels)):
    y[i, labels[i]]=1
    
labels = y


labels.shape

(5796, 2)

In [5]:
print(msgs.shape, labels.shape)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(msgs, labels, shuffle=True, random_state=0, train_size=0.7)

(5796,) (5796, 2)


In [17]:
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(Xtrain)

In [18]:
Xtokens = tokenizer.texts_to_sequences(Xtrain)

In [19]:
maxlen = np.max(list(map(lambda x : len(x), Xtokens)))

In [67]:
maxlen

13006

In [21]:
len(Xtokens[0]), len(Xtokens[1])

(68, 262)

In [155]:
class CustomDataset():
    def __init__(self, X, y, pad_sequences, maxlen=13006):
        self.msgs = X
        self.labels = y
        self.maxlen = maxlen
        self.pad_sequences = pad_sequences
    
    def __len__(self):
        return len(self.msgs)
    
    def __getitem__(self, idx):
        msg = [self.msgs[idx]]
        label = self.labels[idx]
        
        return {
            'msg' : torch.tensor(msg).long(),
            'target' : torch.tensor(label).float()
        }

In [158]:
Xtrainseq = np.array(Xtrainseq)
train_dataset = CustomDataset(Xtrainseq, Ytrain, pad_sequences, maxlen)

In [159]:
train_dataset[0]['msg'].shape

torch.Size([1, 13006])

In [75]:
Xtesttokens = tokenizer.texts_to_sequences(Xtest)

In [162]:
pd.DataFrame(Ytrain).value_counts()

0    1  
1.0  0.0    2723
0.0  1.0    1334
Name: count, dtype: int64

In [163]:
test_dataset = CustomDataset(Xtestseq, Ytest, pad_sequences)

In [164]:
test_dataset[0]['msg'].shape

torch.Size([1, 13006])

In [166]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

In [338]:
class SimpleModel(nn.Module):
    def __init__(self, num_words, embedding_size):
        super(SimpleModel, self).__init__()
        self.embedding_size = embedding_size
        self.embed = nn.Embedding(num_words, embedding_size, max_norm=True).requires_grad_(True)
        self.init_embed()
        layers = [
            nn.Flatten(),
            nn.Linear(embedding_size*13006, 2),
            nn.Sigmoid()
        ]
        self.model = nn.Sequential(*layers)
        
    def init_embed(self):
        initrange = 0.75 / self.embedding_size
        self.embed.weight.data.uniform_(-initrange, initrange)
    
    def forward(self, x):
        x = self.embed(x)
        out=self.model(x)
        
        return out

In [339]:
def trainer(train_loader, model, optimizer, criterion):
    model.train()
    it_loss = 0
    counter = 0
    for data in train_loader:
        msgs = data['msg']
        targets = data['target']
        out = model(msgs)
        print(out)
        #print(out)
        #targets = targets.reshape((targets.shape[0], 1))
        #print(targets)
        loss = criterion(out, targets)
        #print(loss)
        loss.backward()
        optimizer.step()
        
        it_loss+=loss.item()*msgs.shape[0]
        counter+=msgs.shape[0]
        
    return it_loss/counter

In [340]:
def tester(test_loader, model, criterion):
    model.eval()
    it_loss=0
    counter=0
    for data in test_loader:
        msgs = data['msg']
        targets = data['target']
        with torch.no_grad():
            out = model(msgs)
            #targets = targets.reshape((targets.shape[0], 1))
            loss = criterion(out, targets)
            
            it_loss+=loss.item()*msgs.shape[0]
            counter+=msgs.shape[0]
        
    return it_loss/counter

In [344]:
!pip install torchsummary

[0m^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [346]:
model = SimpleModel(30000, 64)
epochs = 10
lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.9)


In [342]:
train_losses=[]
test_losses=[]
best_loss=np.inf
for epoch in range(epochs):
    print(f'\nBegan iteration {epoch+1}')
    train_loss = trainer(train_loader, model, optimizer, criterion)
    test_loss = tester(test_loader, model, criterion)
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    #scheduler.step()
    print(f'Test Loss : {test_loss}')
    if test_loss<best_loss:
        best_loss=test_loss
        dic={
            'model':model.state_dict()
        }
        torch.save(dic, './Bestmodel.model')
        print('Improved and saved the model')


Began iteration 1
tensor([[0.5007, 0.5000],
        [0.5005, 0.4998],
        [0.5007, 0.5000],
        [0.5009, 0.4999],
        [0.5006, 0.4999],
        [0.5007, 0.4996],
        [0.4999, 0.4992],
        [0.5006, 0.4998],
        [0.5006, 0.4998],
        [0.5005, 0.4998],
        [0.5008, 0.4998],
        [0.5007, 0.4998],
        [0.5005, 0.4998],
        [0.5006, 0.5000],
        [0.5005, 0.4997],
        [0.4999, 0.5001],
        [0.5006, 0.4999],
        [0.5004, 0.4997],
        [0.4995, 0.4994],
        [0.5007, 0.4998],
        [0.5007, 0.4999],
        [0.5007, 0.5000],
        [0.5001, 0.4992],
        [0.5005, 0.4998],
        [0.5005, 0.4999],
        [0.5012, 0.4996],
        [0.5006, 0.4999],
        [0.5006, 0.4999],
        [0.5006, 0.4998],
        [0.5006, 0.4997],
        [0.5006, 0.4998],
        [0.5006, 0.4997]], grad_fn=<SigmoidBackward0>)
tensor([[0.9910, 0.0090],
        [0.9911, 0.0089],
        [0.9890, 0.0110],
        [0.9913, 0.0088],
        [0.9883,

KeyboardInterrupt: 

In [93]:
model.eval()
pred=[]
for data in test_loader:
    msgs = data['msg']
    with torch.no_grad():
        out = model(msgs)
        pred.append(out.to('cpu').numpy())
        
pred = np.concatenate(pred)    
#pred = pred.argmax(1)

In [94]:
pred

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [95]:
pd.DataFrame(pred).value_counts()

0    1  
1.0  0.0    1739
Name: count, dtype: int64

In [23]:
Xtrainseq = pad_sequences(Xtokens, maxlen=maxlen, padding='post')

In [24]:
Xtestseq = pad_sequences(Xtesttokens, maxlen=maxlen, padding='post')

In [25]:
Xtrainseq.shape

(4057, 13006)

In [26]:
Xtestseq.shape

(1739, 13006)

In [32]:
import tensorflow as tf
inputs = tf.keras.Input(shape=(13006, ))
embedding = tf.keras.layers.Embedding( 
    input_dim = 30000,
    output_dim = 64
)(inputs)
flatten = tf.keras.layers.Flatten()(embedding)
outputs = tf.keras.layers.Dense(2, activation='sigmoid')(flatten)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss = 'binary_crossentropy',
    metrics = [
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

print(model.summary())

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 13006)]           0         
                                                                 
 embedding_3 (Embedding)     (None, 13006, 64)         1920000   
                                                                 
 flatten_3 (Flatten)         (None, 832384)            0         
                                                                 
 dense_2 (Dense)             (None, 2)                 1664770   
                                                                 
Total params: 3584770 (13.67 MB)
Trainable params: 3584770 (13.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [33]:
history = model.fit(Xtrainseq, Ytrain, validation_split=0.2, batch_size=32, epochs=100,
                   callbacks=[
                       tf.keras.callbacks.EarlyStopping(
                       monitor = 'val_loss',
                       patience=3,
                       restore_best_weights=True)
                   ])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [36]:
results = model.evaluate(Xtestseq, Ytest, verbose=0)


In [97]:
print(f'val_loss : {results[0]}\naccuracy : {results[1]*100}%\nauc : {results[2]}')

val_loss : 0.029011981561779976
accuracy : 99.48245882987976%
auc : 0.998275637626648


In [38]:
with open('./kerasModel.model', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
with open('./kerasModel.model', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
with open('./tuner.tuner', 'wb') as handle:
    pickle.dump({ 
        'maxlen':13006,
        'num_words' : 30000
    }, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
pred = model.predict(Xtestseq)



In [43]:
pred 

array([[1.4426785e-04, 9.9984843e-01],
       [7.6759284e-07, 9.9999923e-01],
       [9.9859142e-01, 1.3892701e-03],
       ...,
       [9.9996293e-01, 3.8821665e-05],
       [9.9363053e-01, 6.3473103e-03],
       [9.9924135e-01, 7.5421837e-04]], dtype=float32)

In [44]:
?nn.Linear

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mLinear[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0min_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mout_features[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

This module supports :ref:`TensorFloat32<tf32_on_ampere>`.

On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

Args:
    in_features: size of each input sample
    out_features: size of each output sample
    bias: If set t

In [108]:
embed = nn.Embedding(30000, 100)

In [286]:
?embed

[0;31mSignature:[0m      [0membed[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           Embedding
[0;31mString form:[0m    Embedding(30000, 100)
[0;31mFile:[0m           /opt/conda/lib/python3.10/site-packages/torch/nn/modules/sparse.py
[0;31mDocstring:[0m     
A simple lookup table that stores embeddings of a fixed dictionary and size.

This module is often used to store word embeddings and retrieve them using indices.
The input to the module is a list of indices, and the output is the corresponding
word embeddings.

Args:
    num_embeddings (int): size of the dictionary of embeddings
    embedding_dim (int): the size of each embedding vector
    padding_idx (int, optional): If specified, the entries at :attr:`padding_idx` do not contribute to the gradient;
                                 therefore, the embedding vector at :attr:`padding_idx` is not updated during training,
           

In [109]:
embed = embed.requires_grad_(True)

In [137]:
Xtrainseq.shape

(4057, 13006)

In [187]:
out1 = embed(torch.tensor(Xtrainseq[33:65, :]))

In [285]:
out1[1,...]

tensor([[ 1.7669, -0.7957,  1.1804,  ...,  1.8686,  0.0808, -0.3324],
        [ 1.0841,  1.7009,  0.5163,  ...,  0.8654, -0.3148, -0.7324],
        [-1.6359, -1.1111, -0.7735,  ..., -1.7328,  1.7643, -0.2569],
        ...,
        [-0.0657, -0.5851, -1.7279,  ..., -0.8922,  1.3751,  0.3412],
        [-0.0657, -0.5851, -1.7279,  ..., -0.8922,  1.3751,  0.3412],
        [-0.0657, -0.5851, -1.7279,  ..., -0.8922,  1.3751,  0.3412]],
       grad_fn=<SelectBackward0>)

In [141]:
ff = nn.Flatten()

In [190]:
out2 = ff(out1)

In [193]:
out2

tensor([[-0.4139,  0.6069,  0.4648,  ..., -0.8922,  1.3751,  0.3412],
        [ 1.7669, -0.7957,  1.1804,  ..., -0.8922,  1.3751,  0.3412],
        [-0.2385, -0.4048, -0.3093,  ..., -0.8922,  1.3751,  0.3412],
        ...,
        [ 0.5774,  0.0971, -0.7322,  ..., -0.8922,  1.3751,  0.3412],
        [-1.2598, -0.5892, -0.5497,  ..., -0.8922,  1.3751,  0.3412],
        [-1.0076,  1.1179,  0.0841,  ..., -0.8922,  1.3751,  0.3412]],
       grad_fn=<ReshapeAliasBackward0>)

In [146]:
out2.shape

torch.Size([10, 1300600])

In [147]:
fc=nn.Linear(1300600, 2)

In [194]:
out3=fc(out2)

In [195]:
out3

tensor([[-0.6469, -0.6326],
        [-0.4534, -0.6711],
        [-0.5873, -0.6424],
        [-0.4347, -0.9044],
        [-0.6438, -0.6197],
        [-0.5014, -0.6910],
        [-0.1932, -0.3577],
        [-0.5487, -0.5274],
        [-0.5428, -0.6081],
        [-0.6678, -0.6045],
        [-0.5750, -0.6395],
        [-0.4702, -0.7442],
        [-0.4611, -0.4308],
        [-0.4572, -0.6268],
        [-0.6093, -0.7042],
        [-0.3012, -0.6205],
        [-0.4687, -0.6614],
        [-0.5496, -0.6504],
        [-0.4975, -0.6318],
        [-0.6055, -0.6654],
        [-0.5533, -0.6290],
        [-0.6643, -0.5860],
        [-0.5819, -0.6686],
        [-0.0346, -0.3917],
        [-0.4988, -0.8273],
        [-0.6047, -0.5693],
        [-0.5663, -0.6921],
        [-0.5819, -0.6199],
        [-0.3686, -0.6514],
        [-0.5357, -0.6136],
        [-0.4437, -0.6501],
        [-0.4342, -0.7104]], grad_fn=<AddmmBackward0>)

In [196]:
out3.shape

torch.Size([32, 2])

In [197]:
out4 = nn.Softmax(dim=1)(out3)

In [198]:
out4

tensor([[0.4964, 0.5036],
        [0.5542, 0.4458],
        [0.5137, 0.4863],
        [0.6153, 0.3847],
        [0.4940, 0.5060],
        [0.5473, 0.4527],
        [0.5410, 0.4590],
        [0.4947, 0.5053],
        [0.5163, 0.4837],
        [0.4842, 0.5158],
        [0.5161, 0.4839],
        [0.5681, 0.4319],
        [0.4924, 0.5076],
        [0.5423, 0.4577],
        [0.5237, 0.4763],
        [0.5791, 0.4209],
        [0.5480, 0.4520],
        [0.5252, 0.4748],
        [0.5335, 0.4665],
        [0.5150, 0.4850],
        [0.5189, 0.4811],
        [0.4804, 0.5196],
        [0.5217, 0.4783],
        [0.5883, 0.4117],
        [0.5814, 0.4186],
        [0.4911, 0.5089],
        [0.5314, 0.4686],
        [0.5095, 0.4905],
        [0.5703, 0.4297],
        [0.5195, 0.4805],
        [0.5514, 0.4486],
        [0.5686, 0.4314]], grad_fn=<SoftmaxBackward0>)

In [311]:
trial = nn.Sequential(*[
    nn.Embedding(30000, 100, max_norm=True).requires_grad_(True),
    nn.Flatten(),
    nn.Linear(1300600, 2)
])

In [312]:
ada = torch.optim.SGD(trial.parameters(), lr=0.001)
cri = nn.CrossEntropyLoss()

In [313]:
#1st mini batch
out = trial(torch.tensor(Xtrainseq[:32, :]))
out = nn.Sigmoid()(out)

In [314]:
out

tensor([[0.5019, 0.5001],
        [0.5066, 0.4969],
        [0.5035, 0.4990],
        [0.5022, 0.4967],
        [0.5047, 0.4988],
        [0.4991, 0.4994],
        [0.4980, 0.4944],
        [0.5063, 0.4988],
        [0.5029, 0.5068],
        [0.5039, 0.4971],
        [0.5049, 0.4983],
        [0.5025, 0.4972],
        [0.5035, 0.4961],
        [0.5023, 0.4959],
        [0.4973, 0.4985],
        [0.4931, 0.4929],
        [0.5046, 0.4965],
        [0.5019, 0.5012],
        [0.4962, 0.4939],
        [0.5030, 0.4984],
        [0.5046, 0.4979],
        [0.5044, 0.4981],
        [0.5024, 0.5017],
        [0.5008, 0.4988],
        [0.5015, 0.5000],
        [0.5022, 0.5099],
        [0.5042, 0.4985],
        [0.5034, 0.4976],
        [0.5039, 0.4975],
        [0.5038, 0.5002],
        [0.5052, 0.4992],
        [0.5101, 0.4997]], grad_fn=<SigmoidBackward0>)

In [315]:
l = cri(out, torch.tensor(Ytest[:32,:]))

In [316]:
l

tensor(0.6922, dtype=torch.float64, grad_fn=<DivBackward1>)

In [317]:
l.backward()

In [318]:
ada.step()

In [319]:
#2nd mini batch
out = trial(torch.tensor(Xtrainseq[32:64, :]))

out = nn.Sigmoid()(out)

In [320]:
out #the fuck

tensor([[0.7306, 0.2755],
        [0.7291, 0.2734],
        [0.7177, 0.2826],
        [0.7274, 0.2728],
        [0.7142, 0.2821],
        [0.7257, 0.2756],
        [0.7264, 0.2728],
        [0.7131, 0.2881],
        [0.7178, 0.2835],
        [0.7283, 0.2735],
        [0.7284, 0.2717],
        [0.7292, 0.2723],
        [0.7271, 0.2736],
        [0.7218, 0.2812],
        [0.7264, 0.2783],
        [0.7270, 0.2708],
        [0.7259, 0.2769],
        [0.7277, 0.2732],
        [0.7282, 0.2717],
        [0.7250, 0.2776],
        [0.7298, 0.2723],
        [0.7266, 0.2739],
        [0.7120, 0.2922],
        [0.7292, 0.2722],
        [0.7186, 0.2889],
        [0.7233, 0.2784],
        [0.7274, 0.2750],
        [0.7279, 0.2723],
        [0.7290, 0.2721],
        [0.7104, 0.2831],
        [0.7302, 0.2715],
        [0.7285, 0.2748]], grad_fn=<SigmoidBackward0>)

In [321]:
l=cri(out, torch.tensor(Ytest[32:64,:]))

In [322]:
l

tensor(0.6182, dtype=torch.float64, grad_fn=<DivBackward1>)

In [323]:
l.backward()

In [324]:
ada.step()

In [325]:
#3rd mini batch
out = trial(torch.tensor(Xtrainseq[64:96, :]))

out = nn.Sigmoid()(out)

In [326]:
out

tensor([[0.8997, 0.1000],
        [0.8736, 0.1220],
        [0.9037, 0.0972],
        [0.9026, 0.0978],
        [0.9031, 0.0969],
        [0.9035, 0.0972],
        [0.8986, 0.1021],
        [0.9027, 0.0972],
        [0.9036, 0.0972],
        [0.9029, 0.0960],
        [0.9035, 0.0964],
        [0.9032, 0.0965],
        [0.9032, 0.0973],
        [0.9034, 0.0973],
        [0.8977, 0.1015],
        [0.8833, 0.1182],
        [0.9040, 0.0962],
        [0.9038, 0.0969],
        [0.9036, 0.0971],
        [0.9035, 0.0979],
        [0.9025, 0.0987],
        [0.9034, 0.0965],
        [0.8957, 0.1044],
        [0.9037, 0.0970],
        [0.9036, 0.0969],
        [0.9038, 0.0963],
        [0.9028, 0.0965],
        [0.8973, 0.1045],
        [0.9012, 0.1012],
        [0.9025, 0.0979],
        [0.9040, 0.0966],
        [0.8940, 0.1040]], grad_fn=<SigmoidBackward0>)

In [327]:
l = cri(out, torch.tensor(Ytest[64:96,:]))

In [328]:
l

tensor(0.7729, dtype=torch.float64, grad_fn=<DivBackward1>)

In [329]:
l.backward()

In [330]:
ada.step()

In [331]:
#4th
out = trial(torch.tensor(Xtrainseq[96:128, :]))

out = nn.Sigmoid()(out)

In [309]:
out

tensor([[0.9631, 0.0343],
        [0.9631, 0.0344],
        [0.9621, 0.0343],
        [0.9621, 0.0348],
        [0.9623, 0.0349],
        [0.9627, 0.0348],
        [0.9617, 0.0353],
        [0.9630, 0.0345],
        [0.9610, 0.0358],
        [0.9617, 0.0354],
        [0.9627, 0.0343],
        [0.9619, 0.0350],
        [0.9629, 0.0342],
        [0.9622, 0.0353],
        [0.9588, 0.0385],
        [0.9617, 0.0349],
        [0.9624, 0.0342],
        [0.9624, 0.0351],
        [0.9628, 0.0349],
        [0.9625, 0.0346],
        [0.9629, 0.0344],
        [0.9542, 0.0433],
        [0.9617, 0.0356],
        [0.9574, 0.0403],
        [0.9621, 0.0346],
        [0.9619, 0.0349],
        [0.9604, 0.0359],
        [0.9617, 0.0352],
        [0.9630, 0.0343],
        [0.9618, 0.0360],
        [0.9625, 0.0345],
        [0.9624, 0.0348]], grad_fn=<SigmoidBackward0>)