<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/text_classification/spam_vs_ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the Kaggle API key from your local machine.
# The following cell copies a file named kaggle.json, so the uploaded
# file is expected to carry that name.
from google.colab import files
files.upload()

In [None]:
# make a kaggle dir, copy the API key to it
# and make sure the file in only readable by yourself (chmod 600)
!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Use the Kaggle CLI to download the uciml/sms-spam-collection-dataset
# dataset; the next cell expects it as sms-spam-collection-dataset.zip.
!kaggle datasets download -d uciml/sms-spam-collection-dataset

In [None]:
# uncompress the dataset
!unzip -qq sms-spam-collection-dataset.zip

In [None]:
!pip install torchtext==0.9.1
!pip install torch==1.8.1

In [None]:
import torch
import torchtext
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# The archive is extracted into the working directory, so read spam.csv with a
# relative path instead of a Colab-specific absolute one.
data = pd.read_csv("spam.csv", encoding="latin-1")

data.head()

In [None]:
# Keep only the label and message columns and give them descriptive names.
# errors="ignore" turns the drop into a no-op when the unnamed columns are
# already gone, so the cell can be re-run without a KeyError.
data = (
    data
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "labels", "v2": "text"})
)

data.head()

In [None]:
# Hold out 20% of the messages for testing; random_state pins the shuffle.
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

# reset_index returns a new frame, so the result has to be assigned back
# for the renumbering to take effect.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train.head()

In [None]:
# Sanity check: (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# Write both splits to CSV without the index column; these files are
# loaded back through TabularDataset further down.
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

In [None]:
# List the working directory to check that train.csv and test.csv exist.
!ls

In [None]:
import nltk
# Download the "punkt" resource; presumably required by word_tokenize,
# which is used as the text tokenizer below - TODO confirm.
nltk.download("punkt")

from nltk import word_tokenize

In [None]:
# Field describing the message text: each message is split into tokens
# with NLTK's word_tokenize.
TEXT = Field(tokenize = word_tokenize)

In [None]:
# Field for the ham/spam target.
# With Field's default "<unk>" and "<pad>" specials the two classes would get
# vocabulary indices 2 and 3, which are not valid targets for a binary
# cross-entropy loss. Disabling both specials leaves only the two class names
# in the vocabulary, i.e. indices 0 and 1.
# dtype is float because the labels are fed to BCEWithLogitsLoss as targets.
LABEL = Field(dtype = torch.float, unk_token = None, pad_token = None)

In [None]:
# (column name, Field) pairs handed to TabularDataset.
# NOTE(review): assumes the CSV columns appear in this order (labels, then
# text) - confirm against train.csv.
datafields = [("labels", LABEL), ("text", TEXT)]

In [None]:
# Load train.csv and test.csv from the working directory as torchtext
# datasets, parsing each column with the Field given in datafields.
# skip_header = True skips the column-name row written by to_csv.
trn, tst = TabularDataset.splits(path = './',
                                        train = "train.csv",
                                        test = "test.csv",
                                        format = "csv",
                                        skip_header = True,
                                        fields = datafields)

In [None]:
# Peek at the first five training examples.
trn[:5]

In [None]:
# Report how many examples ended up in each split.
print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

In [None]:
# Show which attributes a single example carries.
trn[5].__dict__.keys()

In [None]:
# Text attribute of the training example at index 5.
trn[5].text

In [None]:
# Labels attribute of the same example.
trn[5].labels

In [None]:
# The same example again, printed as a plain dict of its attributes.
print(vars(trn.examples[5]))

In [None]:
# Build the token vocabulary from the training split only (the test split is
# not passed in); max_size = 10500 bounds the vocabulary size.
TEXT.build_vocab(trn, max_size = 10500)

In [None]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(trn)

In [None]:
# Report the size of both vocabularies.
print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

In [None]:
# The 50 most common entries of the token frequency counter.
print(TEXT.vocab.freqs.most_common(50))

In [None]:
# First ten entries of the index-to-string table.
print(TEXT.vocab.itos[:10])

In [None]:
# String-to-index mapping of the labels; these indices are the values the
# model is trained against.
print(LABEL.vocab.stoi)

In [None]:
# Number of examples per batch for both iterators.
batch_size = 64

# Build one iterator per split. sort_key measures an example by its number of
# text tokens; sorting inside each batch is switched off.
# NOTE(review): no device argument is passed - confirm the default device is
# the intended one if a GPU is expected.
train_iterator, test_iterator = BucketIterator.splits(
    (trn, tst),
    batch_size = batch_size,
    sort_key = lambda x: len(x.text),
    sort_within_batch = False)

In [None]:
 import torch.nn as nn

 class RNN(nn.Module):
   """Vanilla RNN classifier: embedding -> nn.RNN -> linear layer.

   The final hidden state of the recurrent layer is projected to
   `output_dim` values by the linear layer.
   """

   def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
     """Create the three layers.

     input_dim: number of rows of the embedding table.
     embedding_dim: size of each embedding vector (input size of the RNN).
     hidden_dim: size of the RNN hidden state.
     output_dim: number of values produced per sequence.
     """
     super().__init__()
     self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
     self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
     self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

   def forward(self, text):
     """Embed `text`, run the RNN and project the final hidden state."""
     token_vectors = self.embedding(text)
     all_states, last_state = self.rnn(token_vectors)
     # Drop the leading dimension of the final hidden state.
     final_hidden = last_state.squeeze(0)
     # Sanity check: the last row of the per-step outputs must be identical
     # to the final hidden state returned by the RNN.
     assert torch.equal(all_states[-1], final_hidden)
     return self.fc(final_hidden)

In [None]:
# Model hyperparameters.
input_dim = len(TEXT.vocab)  # one embedding row per vocabulary entry
embedding_dim = 100
hidden_dim = 256
output_dim = 1  # a single output value per message

model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [None]:
import torch.optim as optim

# Adam over all parameters of the current model.
# NOTE(review): lr = 1e-6 is far below Adam's default of 1e-3; over a handful
# of epochs the weights will barely move - confirm this value is intentional.
optimizer = optim.Adam(model.parameters(), lr = 1e-6)
# Binary cross-entropy computed directly on the raw model outputs (logits).
criterion = nn.BCEWithLogitsLoss()

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one training epoch and return the mean batch loss and accuracy.

  Args:
    model: module called as ``model(batch.text)``; returns one logit per example.
    iterator: sized iterable of batches exposing ``.text`` and ``.labels``.
    optimizer: optimizer stepping the parameters of ``model``.
    criterion: loss called as ``criterion(logits, targets)``.

  Returns:
    Tuple ``(mean loss per batch, mean accuracy per batch)`` of Python floats.
  """
  epoch_loss = 0.0
  epoch_acc = 0.0

  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    predictions = model(batch.text)
    # Labels may arrive as [batch] or [1, batch] depending on how the label
    # field is configured. Give them exactly the shape of the logits so that
    # both the loss and the comparison below are element-wise (no broadcasting).
    labels = batch.labels.reshape_as(predictions)
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()

    # Accuracy: round the sigmoid of each logit to 0/1 and compare to the label.
    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == labels).float()

    epoch_loss += loss.item()
    epoch_acc += correct.mean().item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# Number of full passes over the training iterator.
num_epochs = 5

for epoch in range(num_epochs):

  # train() returns a (loss, accuracy) pair, each averaged over the batches.
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

  print(f"| Epoch: {epoch+1:02}  |Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:0.2f}%")

In [None]:
# Evaluate the current model on the test iterator (mean loss per batch).
epoch_loss = 0

model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
  for batch in test_iterator:
    predictions = model(batch.text)
    # Give the labels exactly the shape of the logits, whichever layout
    # ([batch] or [1, batch]) the label field produces.
    loss = criterion(predictions, batch.labels.reshape_as(predictions))
    epoch_loss += loss.item()

test_loss = epoch_loss / len(test_iterator)

print(f'Test Loss: {test_loss:.3f}')

In [None]:
class LSTM(nn.Module):
  """LSTM classifier: embedding -> nn.LSTM -> dropout -> linear layer.

  The final hidden state of the LSTM is passed through dropout and then
  projected to `output_dim` values by the linear layer.
  """

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout=0.3):
    """Create the layers.

    input_dim: number of rows of the embedding table.
    embedding_dim: size of each embedding vector (input size of the LSTM).
    hidden_dim: size of the LSTM hidden state.
    output_dim: number of values produced per sequence.
    dropout: drop probability applied to the final hidden state.
    """
    super().__init__()
    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    """Embed `text`, run the LSTM and project its final hidden state."""
    embedded = self.embedding(text)
    # Only the final hidden state is used; per-step outputs and the cell
    # state are discarded.
    _, (hidden, _) = self.rnn(embedded)
    hidden_1D = hidden.squeeze(0)
    # Dropout is active only in training mode; in eval mode it is the identity.
    return self.fc(self.dropout(hidden_1D))

In [None]:
model = LSTM(input_dim, embedding_dim, hidden_dim, output_dim)

# The existing optimizer still holds the parameters of the previous model.
# Build a new one bound to this model's parameters, otherwise optimizer.step()
# would never update the LSTM.
optimizer = optim.Adam(model.parameters(), lr = 1e-6)

In [None]:
# Number of full passes over the training iterator.
num_epochs = 5

for epoch in range(num_epochs):

  # train() returns a (loss, accuracy) pair; unpack it so each value can be
  # formatted as a float.
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

  print(f'Epoch: {epoch+1:02}  Train Loss: {train_loss:.3f}  Train Acc: {train_acc*100:0.2f}%')

In [None]:
# Evaluate the current model on the test iterator (mean loss per batch).
epoch_loss = 0

model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
  for batch in test_iterator:
    predictions = model(batch.text)
    # Give the labels exactly the shape of the logits, whichever layout
    # ([batch] or [1, batch]) the label field produces.
    loss = criterion(predictions, batch.labels.reshape_as(predictions))
    epoch_loss += loss.item()

test_loss = epoch_loss / len(test_iterator)

print(f'Test Loss: {test_loss:.3f}')