In [2]:
import pandas as pd
import numpy as np
import seaborn as sn

import torch, torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, Dataset

import torchtext
from torchtext import vocab
from torchtext.vocab import build_vocab_from_iterator

import itertools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from gensim.models import Word2Vec
import gensim

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_rows=100
pd.options.display.max_columns=100


In [None]:
SEED = 100

torch.manual_seed(SEED)

In [3]:
data = pd.read_csv('spam.csv',delimiter=',',encoding='latin-1')

In [4]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [6]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [8]:
X = data.v2
Y = data.v1
#Y = np.where(Y=='spam',1,0)
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [10]:
Y

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,stratify=Y, test_size=.15)

In [12]:
X_train.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)

In [13]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((4736,), (836,), (4736, 1), (836, 1))

## Torch DataLoader

In [14]:
X_train_np= X_train.to_numpy()
X_test_np = X_test.to_numpy()

In [15]:
type(X_train_np), type(X_test_np)

(numpy.ndarray, numpy.ndarray)

In [16]:
train_dataset = list(zip(X_train_np,Y_train))
test_dataset = list(zip(X_test_np,Y_test))

In [17]:
train_dataset[0:1]

[('I wan but too early lei... Me outside now wun b home so early... Neva mind then...',
  array([0]))]

In [19]:
embed_dim=300

In [20]:
## you can load vocab from existing models like Glove,etc.
glove_vocab = torchtext.vocab.GloVe(name='6B', dim=embed_dim) ## source: https://pytorch.org/text/stable/_modules/torchtext/vocab/vectors.html#GloVe

# and the rest is same

.vector_cache/glove.6B.zip: 862MB [03:08, 4.57MB/s]                           
100%|█████████▉| 399999/400000 [01:00<00:00, 6641.59it/s]


In [21]:
glove_vocab

<torchtext.vocab.vectors.GloVe at 0x7fb8cefd5d20>

In [22]:
glove_vocab.get_vecs_by_tokens('the')

tensor([ 4.6560e-02,  2.1318e-01, -7.4364e-03, -4.5854e-01, -3.5639e-02,
         2.3643e-01, -2.8836e-01,  2.1521e-01, -1.3486e-01, -1.6413e+00,
        -2.6091e-01,  3.2434e-02,  5.6621e-02, -4.3296e-02, -2.1672e-02,
         2.2476e-01, -7.5129e-02, -6.7018e-02, -1.4247e-01,  3.8825e-02,
        -1.8951e-01,  2.9977e-01,  3.9305e-01,  1.7887e-01, -1.7343e-01,
        -2.1178e-01,  2.3617e-01, -6.3681e-02, -4.2318e-01, -1.1661e-01,
         9.3754e-02,  1.7296e-01, -3.3073e-01,  4.9112e-01, -6.8995e-01,
        -9.2462e-02,  2.4742e-01, -1.7991e-01,  9.7908e-02,  8.3118e-02,
         1.5299e-01, -2.7276e-01, -3.8934e-02,  5.4453e-01,  5.3737e-01,
         2.9105e-01, -7.3514e-03,  4.7880e-02, -4.0760e-01, -2.6759e-02,
         1.7919e-01,  1.0977e-02, -1.0963e-01, -2.6395e-01,  7.3990e-02,
         2.6236e-01, -1.5080e-01,  3.4623e-01,  2.5758e-01,  1.1971e-01,
        -3.7135e-02, -7.1593e-02,  4.3898e-01, -4.0764e-02,  1.6425e-02,
        -4.4640e-01,  1.7197e-01,  4.6246e-02,  5.8

In [23]:


def simple_prep_process( data): ## lowercases, tokenizes, and de-accents the text
        return data.apply(lambda x:  gensim.utils.simple_preprocess(x, deacc=True)) ## deacc is true to de-accent


In [296]:
X_train_processed = simple_prep_process(X_train)

In [25]:

model = Word2Vec(sentences=X_train_processed, window=5, min_count=1, sg=0, vector_size =300)

In [26]:
len(model.wv.key_to_index) ##

7085

In [136]:
model.wv.key_to_index

{'to': 0,
 'you': 1,
 'the': 2,
 'and': 3,
 'in': 4,
 'is': 5,
 'me': 6,
 'my': 7,
 'it': 8,
 'for': 9,
 'your': 10,
 'call': 11,
 'of': 12,
 'have': 13,
 'that': 14,
 'on': 15,
 'are': 16,
 'now': 17,
 'can': 18,
 'so': 19,
 'but': 20,
 'not': 21,
 'or': 22,
 'we': 23,
 'do': 24,
 'get': 25,
 'at': 26,
 'be': 27,
 'will': 28,
 'ur': 29,
 'no': 30,
 'if': 31,
 'just': 32,
 'with': 33,
 'this': 34,
 'gt': 35,
 'lt': 36,
 'how': 37,
 'up': 38,
 'what': 39,
 'when': 40,
 'free': 41,
 'ok': 42,
 'from': 43,
 'go': 44,
 'out': 45,
 'all': 46,
 'll': 47,
 'know': 48,
 'then': 49,
 'good': 50,
 'am': 51,
 'got': 52,
 'like': 53,
 'he': 54,
 'come': 55,
 'day': 56,
 'time': 57,
 'was': 58,
 'only': 59,
 'its': 60,
 'love': 61,
 'send': 62,
 'there': 63,
 'text': 64,
 'want': 65,
 'as': 66,
 'txt': 67,
 'going': 68,
 'need': 69,
 'sorry': 70,
 'see': 71,
 'by': 72,
 'stop': 73,
 'about': 74,
 'lor': 75,
 'one': 76,
 'home': 77,
 'today': 78,
 'da': 79,
 'don': 80,
 'back': 81,
 'she': 82,
 'sti

In [27]:
tok = torchtext.data.utils.get_tokenizer("basic_english")

In [28]:
tok(X_train[0])

['i',
 'wan',
 'but',
 'too',
 'early',
 'lei',
 '.',
 '.',
 '.',
 'me',
 'outside',
 'now',
 'wun',
 'b',
 'home',
 'so',
 'early',
 '.',
 '.',
 '.',
 'neva',
 'mind',
 'then',
 '.',
 '.',
 '.']

In [238]:
train_vocab = model.wv.key_to_index

In [239]:
train_vocab

{'to': 0,
 'you': 1,
 'the': 2,
 'and': 3,
 'in': 4,
 'is': 5,
 'me': 6,
 'my': 7,
 'it': 8,
 'for': 9,
 'your': 10,
 'call': 11,
 'of': 12,
 'have': 13,
 'that': 14,
 'on': 15,
 'are': 16,
 'now': 17,
 'can': 18,
 'so': 19,
 'but': 20,
 'not': 21,
 'or': 22,
 'we': 23,
 'do': 24,
 'get': 25,
 'at': 26,
 'be': 27,
 'will': 28,
 'ur': 29,
 'no': 30,
 'if': 31,
 'just': 32,
 'with': 33,
 'this': 34,
 'gt': 35,
 'lt': 36,
 'how': 37,
 'up': 38,
 'what': 39,
 'when': 40,
 'free': 41,
 'ok': 42,
 'from': 43,
 'go': 44,
 'out': 45,
 'all': 46,
 'll': 47,
 'know': 48,
 'then': 49,
 'good': 50,
 'am': 51,
 'got': 52,
 'like': 53,
 'he': 54,
 'come': 55,
 'day': 56,
 'time': 57,
 'was': 58,
 'only': 59,
 'its': 60,
 'love': 61,
 'send': 62,
 'there': 63,
 'text': 64,
 'want': 65,
 'as': 66,
 'txt': 67,
 'going': 68,
 'need': 69,
 'sorry': 70,
 'see': 71,
 'by': 72,
 'stop': 73,
 'about': 74,
 'lor': 75,
 'one': 76,
 'home': 77,
 'today': 78,
 'da': 79,
 'don': 80,
 'back': 81,
 'she': 82,
 'sti

In [240]:
max(train_vocab.values())

7084

In [241]:
list(train_vocab.values())[7084]

7084

In [248]:
matching_key = next((key for key, value in train_vocab.items() if value == 7084), None)
matching_key

'conected'

In [246]:
glove_vocab.get_vecs_by_tokens("conected")

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [50]:
#### Getting the embedding of each word in the vocabulary and also taking care of unknown words and padding

In [138]:
matrix_len
# it starts from 0 to 7084

7085

In [250]:
matrix_len = len(train_vocab)
weight_matrix = np.zeros((matrix_len+2,embed_dim))
# Now, here I have put 2 extra tokens: 7085--> unknown and 7086----> padding
words_found = 0

for i, word in enumerate(train_vocab):
        # print(i)
        try:
            weight_matrix[i]=glove_vocab.get_vecs_by_tokens(word)
            words_found+=1
        except KeyError:
            weight_matrix[i]= np.random.normal(scale=0.6, size=(embed_dim,))


In [251]:
weight_matrix.shape
# 2 extra tokens are unknwn and padding

(7087, 300)

In [252]:
weight_matrix[7086],weight_matrix[7085]
# Last 2 weights are 0 ---> for 7085 and 7086

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 

In [253]:
list(train_vocab.keys())[12]

'of'

In [254]:
glove_vocab.get_vecs_by_tokens(list(train_vocab.keys())[12]).shape,glove_vocab.get_vecs_by_tokens(list(train_vocab.keys())[12])[0:10]

(torch.Size([300]),
 tensor([-0.0769, -0.0212,  0.2127, -0.7223, -0.1399, -0.1223, -0.1752,  0.1214,
         -0.0709, -1.5721]))

In [255]:
weight_matrix[12][0:10]

array([-0.076947  , -0.021211  ,  0.21270999, -0.72232002, -0.13988   ,
       -0.12234   , -0.17521   ,  0.12137   , -0.070866  , -1.57210004])

In [256]:
#### Getting token of each word in each sentence

In [295]:
def word_tokens(sentence):
    sent_tok=[]
    for word in sentence:
        try:
            sent_tok.append(train_vocab[word])

        except:
            sent_tok.append(7085) # for Unkown words
    return sent_tok

In [259]:

sentence_tok=[]
sentence_tok=[word_tokens(sentence) for sentence in X_train_processed];

In [260]:
word_tokens(['dfjkdbv'])
# Now, this return 7086, because mapped any unknown word with token number 7086

[7085]

In [261]:
sentence_tok[0], sentence_tok[2]

([204, 20, 118, 360, 466, 6, 714, 17, 1183, 77, 19, 360, 596, 354, 49],
 [30, 459, 1, 1, 139, 102, 872])

In [262]:
#### Padding and converting token data to torch

In [263]:
text_list=[]
for sent_tok in sentence_tok:
    processed_text = torch.tensor(sent_tok, dtype=torch.int64)
    text_list.append(processed_text)

#padding
text_list = pad_sequence(text_list, batch_first=True, padding_value=7086)

In [264]:
text_list.shape

torch.Size([4736, 176])

In [265]:
text_list[0].shape
# 176 is the lenght of the sentences

torch.Size([176])

In [266]:
text_list.shape

torch.Size([4736, 176])

In [267]:
#### Creating DataLoader

In [268]:
label_list = torch.tensor(Y_train, dtype=torch.float32)

In [269]:
train_embedding_dataset = list(zip(text_list,label_list))

In [270]:
train_embedding_dataloader = DataLoader(train_embedding_dataset, batch_size=64)

In [271]:
len(train_embedding_dataloader)

74

In [272]:
weights_matrix_tensor = torch.tensor(weight_matrix, dtype=torch.float32)

In [273]:
weights_matrix_tensor.shape

torch.Size([7087, 300])

In [274]:
# Before, we move forward, we need to understand the concept of embedding bag
#  EmbeddingBag = Embedding + mean/mode operation to reduce the dimentionality
# or
# we can say to convert the embeddings of the words, in sentence, into single embedding representing the entire sentence

In [275]:
# train_vocab

In [279]:
embx = nn.EmbeddingBag(weight_matrix.shape[0], embed_dim, sparse=False, mode='mean')

In [280]:
embx

EmbeddingBag(7087, 300, mode='mean')

In [281]:
embx_mean = nn.EmbeddingBag.from_pretrained(embx.weight, mode='mean')

In [282]:
embx_mean(text_list)[0].shape

torch.Size([300])

In [283]:
hidden_size = 256
learning_rate = .01
num_epochs=10
n_total_steps = len(train_embedding_dataloader)

In [284]:
class TextClassificationEmbeddingModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super(TextClassificationEmbeddingModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False, mode='mean') ## this will create random values initially and then update the weights through back propagation
        self.embedding.from_pretrained(weights_matrix_tensor, freeze=False) # Freeze= False means these weights will be trained during the training phase
        self.l1 = nn.Linear(embed_dim, hidden_size)
        self.relu1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_size,1)

    def forward(self,x):
        embedded = self.embedding(x)
        out = self.l1(embedded)
        out = self.relu1(out)
        out = self.l2(out)
        y_pred = torch.sigmoid(out)
        return y_pred

In [285]:
mymodel_embedding = TextClassificationEmbeddingModel(weight_matrix.shape[0], embed_dim,hidden_size)

In [286]:
mymodel_embedding

TextClassificationEmbeddingModel(
  (embedding): EmbeddingBag(7087, 300, mode='mean')
  (l1): Linear(in_features=300, out_features=256, bias=True)
  (relu1): ReLU()
  (l2): Linear(in_features=256, out_features=1, bias=True)
)

In [287]:
criterion = nn.BCELoss() ## Binary Cross Entropy Loss
optimizer = torch.optim.Adam(mymodel_embedding.parameters(),lr = learning_rate)

In [288]:
count=0
#writer = SummaryWriter()
for epoch in range(num_epochs):
    myloss = []
    for i,(x_data, labels) in enumerate(train_embedding_dataloader):
        count+=1
        y_predicted = mymodel_embedding(x_data) ## forward propagation
        #print(y_predicted.shape)
        #print(labels.shape)
        loss = criterion(y_predicted,labels) ## loss calculation

        optimizer.zero_grad() ## Empty the gradients

        loss.backward() ## backward propagation

        optimizer.step() ## update the weights

        myloss.append(round(loss.item(),5))
        #if (i+1)%10 == 0:
        #    print(f'{epoch+1} of {num_epochs} ; step: {i+1} of {n_total_steps} ; loss = {loss.item():.4f}')
    print(f'Epoch {epoch+1} of {num_epochs}; Mean_Loss =  {round(np.mean(myloss),8)}')
    #writer.add_scalar("Loss/train", np.mean(myloss), epoch)
#writer.flush()
#writer.close()

Epoch 1 of 10; Mean_Loss =  0.44118054
Epoch 2 of 10; Mean_Loss =  0.23299324
Epoch 3 of 10; Mean_Loss =  0.05224716
Epoch 4 of 10; Mean_Loss =  0.03080811
Epoch 5 of 10; Mean_Loss =  0.02038514
Epoch 6 of 10; Mean_Loss =  0.01253838
Epoch 7 of 10; Mean_Loss =  0.01275216
Epoch 8 of 10; Mean_Loss =  0.01057135
Epoch 9 of 10; Mean_Loss =  0.01474311
Epoch 10 of 10; Mean_Loss =  0.02863311


In [289]:
mymodel_embedding.embedding.weight.shape

torch.Size([7087, 300])

In [290]:
weights_matrix_tensor.shape

torch.Size([7087, 300])

In [293]:
# Embedding of the word connected has been updated / trained
mymodel_embedding.embedding.weight[7084][0:10]

tensor([ 1.5408, -0.0513, -0.8642, -0.5764,  0.2069,  0.6759, -2.5197, -0.8774,
        -1.0227, -1.8037], grad_fn=<SliceBackward0>)

In [294]:
weights_matrix_tensor[7084][0:10]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# Let's see how to use for test data

In [330]:
X_test_processed = simple_prep_process(X_test)

In [331]:
def word_tokens(sentence):
    sent_tok=[]
    for word in sentence:
        try:
            sent_tok.append(train_vocab[word])

        except:
            sent_tok.append(7085) # for Unkown words
    return sent_tok

sentence_test_tok=[]
sentence_test_tok=[word_tokens(sentence) for sentence in X_test_processed];

In [332]:
sentence_test_tok[0]

[1, 52, 398, 4101]

In [333]:
text_test_list=[]
for sent_tok in sentence_test_tok:
    processed_text = torch.tensor(sent_tok, dtype=torch.int64)
    text_test_list.append(processed_text)

#padding
text_test_list = pad_sequence(text_test_list, batch_first=True, padding_value=7086)

In [334]:
text_test_list[45]

tensor([7085, 1224, 7085,  335,  567,   69,  205,  597, 5257,  224,  169,   22,
          11,    1,  483,   27,  567, 1095, 7085,   26,  134, 5257,  224,  169,
         275, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086, 7086,
        7086, 7086, 7086, 7086, 7086, 70

In [335]:

mymodel_embedding(torch.unsqueeze(text_test_list[45],0))

tensor([[0.9999]], grad_fn=<SigmoidBackward0>)

In [339]:
Y_test[45]

array([1])

In [338]:
mymodel_embedding(torch.unsqueeze(text_test_list[0],0))

tensor([[0.0206]], grad_fn=<SigmoidBackward0>)

In [340]:
Y_test[0]

array([0])