In [2]:
# !pip install torchtext==0.6.0
# !pip install torch


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy  as np
import torch
import torchtext as td
import nltk
from torchtext.data import Field , BucketIterator, TabularDataset , LabelField
from nltk import word_tokenize
import torch.nn as nn

In [4]:
class RNN(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):

        embedded = self.embedding(text)

        output, hidden = self.rnn(embedded)

        hidden_1D = hidden.squeeze(0)

        assert torch.equal(output[-1, :, :], hidden_1D)

        return self.fc(hidden_1D)

In [5]:
class RNN1(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.LSTM(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):

        embedded = self.embedding(text)

        output, (hidden, _) = self.rnn(embedded)

        hidden_1D = hidden.squeeze(0)

        assert torch.equal(output[-1, :, :], hidden_1D)

        return self.fc(hidden_1D)

In [6]:
class RNN2(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.LSTM(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(0.3)

    def forward(self, text):

        embedded = self.embedding(text)

        embedded_dropout = self.dropout(embedded)

        output, (hidden, _) = self.rnn(embedded_dropout)

        hidden_1D = hidden.squeeze(0)

        assert torch.equal(output[-1, :, :], hidden_1D)

        return self.fc(hidden_1D)

In [7]:
data = pd.read_csv(r'/content/spam.csv' , encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
data = data.drop(columns = ['Unnamed: 2' , 'Unnamed: 3','Unnamed: 4'] , axis = 1)

In [9]:
data = data.rename(index = str , columns = {'v1' : 'labels' , 'v2' : 'text'})
data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
train , test = train_test_split(data , test_size = .2 , random_state = 42)

In [11]:
train.reset_index(drop=True) , test.reset_index(drop=True)

(     labels                                               text
 0       ham  No I'm in the same boat. Still here at my moms...
 1      spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
 2       ham     They r giving a second chance to rahul dengra.
 3       ham     O i played smash bros  &lt;#&gt;  religiously.
 4      spam  PRIVATE! Your 2003 Account Statement for 07973...
 ...     ...                                                ...
 4452    ham  I came hostel. I m going to sleep. Plz call me...
 4453    ham                             Sorry, I'll call later
 4454    ham      Prabha..i'm soryda..realy..frm heart i'm sory
 4455    ham                         Nt joking seriously i told
 4456    ham                In work now. Going have in few min.
 
 [4457 rows x 2 columns],
      labels                                               text
 0       ham  Funny fact Nobody teaches volcanoes 2 erupt, t...
 1       ham  I sent my scores to sophas and i had to do sec...
 2      spam

In [12]:
train.shape , test.shape

((4457, 2), (1115, 2))

In [13]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
TEXT = Field(tokenize = word_tokenize)

In [15]:
LABEL = LabelField(dtype = torch.float)

In [16]:
datafields = [('labels' , LABEL) , ('text' , TEXT)]

In [17]:
trn , tst = TabularDataset.splits(path ='/content/spam.csv',
                                  format='csv',
                                  train = '/content/train.csv' ,
                                  test = '/content/test.csv' ,
                                  skip_header = True ,
                                  fields = datafields)

In [18]:
print(trn.fields , tst.fields)

{'labels': <torchtext.data.field.LabelField object at 0x793be52468f0>, 'text': <torchtext.data.field.Field object at 0x793be5246230>} {'labels': <torchtext.data.field.LabelField object at 0x793be52468f0>, 'text': <torchtext.data.field.Field object at 0x793be5246230>}


In [19]:
trn[:5]

[<torchtext.data.example.Example at 0x793be52479a0>,
 <torchtext.data.example.Example at 0x793be5247970>,
 <torchtext.data.example.Example at 0x793be5247940>,
 <torchtext.data.example.Example at 0x793be52478e0>,
 <torchtext.data.example.Example at 0x793be52469e0>]

In [20]:
print(f'number of trianing : {len(trn)}')
print(f'number of trianing : {len(tst)}')

number of trianing : 4457
number of trianing : 1115


In [21]:
trn[5].__dict__.keys()

dict_keys(['labels', 'text'])

In [22]:
trn[5].text

['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']

In [23]:
trn[5].labels

'ham'

In [24]:
# all the features that content the example
print(vars(trn.examples[5]))

{'labels': 'ham', 'text': ['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']}


In [25]:
# Limit the size of our features vectors to 10500 - this will represent
# every word in one-hot encoded from using just the top 10500 words of our vocabulary
TEXT.build_vocab(trn , max_size = 10500)

In [26]:
LABEL.build_vocab(trn)

In [27]:
print(f"Unique tokens in TEXT vocabulary : {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary : {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary : 10206
Unique tokens in LABEL vocabulary : 2


In [28]:
# most common words freq
# the word with index in my dataset .
print(TEXT.vocab.freqs.most_common(50))

[('.', 3862), ('to', 1750), ('I', 1574), (',', 1468), ('you', 1462), ('?', 1256), ('!', 1134), ('a', 1068), ('the', 946), ('...', 923), ('&', 772), ('i', 760), ('and', 673), ('in', 663), ('is', 647), (';', 641), ('u', 636), ('me', 600), (':', 570), ('..', 544), ('for', 527), ('my', 494), ('of', 471), ('it', 470), ('your', 461), ('have', 395), ('on', 394), (')', 393), ('2', 390), ('that', 385), ("'s", 384), ('now', 321), ("'m", 320), ('are', 316), ('do', 312), ('call', 307), ('at', 301), ('U', 300), ('or', 298), ('not', 295), ("n't", 281), ('be', 275), ('*', 270), ('lt', 267), ('gt', 267), ('with', 267), ('get', 265), ('will', 264), ('so', 257), ('#', 245)]


In [29]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']


In [30]:
print(LABEL.vocab.stoi)

defaultdict(None, {'ham': 0, 'spam': 1})


In [31]:
#Returns a batch of example , where each example is of a similar lenght , thus minimizing the padding per example
batch_size = 64
train_iter , test_iter = BucketIterator.splits(
  (trn , tst) ,
  batch_size = batch_size ,
  sort_key = lambda x : len(x.text) ,
  sort_within_batch = False
)

In [48]:
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

model = RNN2(input_dim , embedding_dim,hidden_dim,output_dim)

In [49]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters() , lr = 1e-6)

In [50]:
criterion = nn.BCEWithLogitsLoss()

In [51]:
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.labels)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float()

        acc = correct.sum() / len(correct)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [52]:
num_epochs = 8

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')

| Epoch: 01 | Train Loss: 0.697 | Train Acc: 39.45% 
| Epoch: 02 | Train Loss: 0.686 | Train Acc: 69.37% 
| Epoch: 03 | Train Loss: 0.675 | Train Acc: 83.73% 
| Epoch: 04 | Train Loss: 0.663 | Train Acc: 85.84% 
| Epoch: 05 | Train Loss: 0.653 | Train Acc: 86.04% 
| Epoch: 06 | Train Loss: 0.642 | Train Acc: 86.01% 
| Epoch: 07 | Train Loss: 0.632 | Train Acc: 85.91% 
| Epoch: 08 | Train Loss: 0.622 | Train Acc: 85.95% 


In [53]:
epoch_loss = 0
epoch_acc = 0
model.eval()

RNN2(
  (embedding): Embedding(10206, 100)
  (rnn): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [54]:
with torch.no_grad():

    for batch in test_iter:

        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.labels)

        rounded_preds = torch.round(torch.sigmoid(predictions))

        correct = (rounded_preds == batch.labels).float()
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iter)
test_acc  = epoch_acc / len(test_iter)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.664 | Test Acc: 75.17% |
