<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/text_classification/spam_vs_ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the Kaggle API key (kaggle.json) from your local machine.
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# a bare `files.upload()` has its return value echoed as the cell output, which
# would store the uploaded key material in the saved notebook.
uploaded = files.upload()

In [None]:
# make a kaggle dir, copy the API key to it
# and make sure the file in only readable by yourself (chmod 600)
!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the uciml/sms-spam-collection-dataset archive with the Kaggle CLI
# (uses the API key copied to ~/.kaggle/ in the previous cell).
!kaggle datasets download -d uciml/sms-spam-collection-dataset

In [None]:
# uncompress the dataset
!unzip -qq sms-spam-collection-dataset.zip

In [None]:
!pip install torchtext==0.9.1
!pip install torch==1.8.1

In [None]:
# import required libraries
import torch
import torchtext
from torchtext.legacy import data

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
train_on_gpu = torch.cuda.is_available()

print("CUDA is available" if train_on_gpu else "CUDA is not available")

device = torch.device('cuda' if train_on_gpu else 'cpu')

### Load and Visualize the data

In [None]:
# Load the raw SMS CSV (decoded as latin-1) into a DataFrame
spam_csv_path = "/content/spam.csv"
sms_df = pd.read_csv(spam_csv_path, encoding="latin-1")

# preview the first rows
sms_df.head()

### Data pre-processing

In [None]:
# Drop the three "Unnamed" columns and rename v1 -> "labels", v2 -> "text".
# `errors="ignore"` on drop (together with rename skipping absent keys by
# default) keeps the cell re-runnable: once `sms_df` has been cleaned, a second
# run used to raise KeyError because the "Unnamed" columns were already gone.
sms_df = (
    sms_df
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    # index=str is kept from the original: it maps the row labels through str()
    .rename(index=str, columns={"v1": "labels", "v2": "text"})
)

sms_df.head()

In [None]:
# Split into train/test sets (20% held out, fixed random_state for repeatability).
train_data, test_data = train_test_split(sms_df, test_size = 0.2, random_state = 42)

# reset_index returns a new DataFrame, so the result must be assigned back;
# the original calls discarded it and both splits kept their shuffled index.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# preview the re-indexed test split
test_data.head()

In [None]:
# report the number of rows in each DataFrame split
print("Num of training data :", len(train_data))
print("Num of test data: ", len(test_data))

In [None]:
# Persist both splits as CSV files (without the index column) in the working directory
for frame, filename in ((train_data, "train.csv"), (test_data, "test.csv")):
  frame.to_csv(filename, index=False)

In [None]:
# list the working directory to confirm train.csv and test.csv were written
!ls

In [None]:
import nltk
from nltk import word_tokenize

# fetch the "punkt" resource from NLTK
nltk.download("punkt")

In [None]:
# Declare how each column is processed:
#   TEXT  - a Field tokenized with NLTK's word_tokenize
#   LABEL - a LabelField created with dtype torch.float
TEXT = data.Field(tokenize = word_tokenize)
LABEL = data.LabelField(dtype = torch.float)

In [None]:
# pair each dataset attribute name ("labels", "text") with the Field that processes it
fields = [("labels", LABEL), ("text", TEXT)]

In [None]:
# Read the saved CSV splits back as torchtext datasets, skipping the header
# row and parsing the columns according to `fields`
train_data, test_data = data.TabularDataset.splits(
    path='/content',
    train="train.csv",
    test="test.csv",
    format="csv",
    skip_header=True,
    fields=fields,
)

In [None]:
# report the number of examples in each torchtext dataset
print(f'Num of training data: {len(train_data)}')
print(f'Number of testing data: {len(test_data)}')

In [None]:
# attribute names stored on the first training example
vars(train_data[0]).keys()

In [None]:
# `text` attribute of the first test example
test_data[0].text

In [None]:
# `labels` attribute of the first training example
train_data[0].labels

In [None]:
# print every attribute stored on one training example (index 5)
print(vars(train_data.examples[5]))

In [None]:
# Upper bound passed as `max_size` when building the TEXT vocabulary
vocab_size = 10_000

# Build both vocabularies from the training split only
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size=vocab_size)

In [None]:
# report the size of each vocabulary built above
print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

In [None]:
# the 20 most frequent entries in the TEXT vocabulary's frequency table
TEXT.vocab.freqs.most_common(20)

In [None]:
# index-to-token table: the first 10 entries of the TEXT vocabulary
TEXT.vocab.itos[:10]

In [None]:
# string-to-index mapping of the LABEL vocabulary
print(LABEL.vocab.stoi)

In [None]:
# Build batch iterators for both splits; `sort_key` lets the iterator group
# examples of similar token count into the same batch.
batch_size = 64


def _text_length(example):
  """Sort key: number of tokens in an example's text."""
  return len(example.text)


train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    device=device,
    sort_key=_text_length,
    sort_within_batch=False,
)

### Define a Model

In [None]:
import torch.nn as nn
import torch.optim as optim

class RNN(nn.Module):
  """Vanilla RNN classifier: embedding -> nn.RNN -> linear layer on the final hidden state."""

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    """
    Args:
      input_dim: number of rows in the embedding table.
      embedding_dim: size of each embedding vector (input size of the RNN).
      hidden_dim: size of the RNN hidden state.
      output_dim: number of values produced by the final linear layer.
    """
    super().__init__()

    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, text):
    """Embed `text`, run it through the RNN and project the final hidden state.

    `text` is presumably a (seq_len, batch) tensor of token indices - confirm
    against the iterator that feeds the model.
    """
    _, final_hidden = self.rnn(self.embedding(text))
    # the RNN is built with its default layer count, so the leading axis of
    # the hidden state has size 1 and squeeze(0) removes it
    return self.fc(final_hidden.squeeze(0))

class LSTM(nn.Module):
  """LSTM classifier: embedding -> nn.LSTM -> linear layer on the last layer's final hidden state."""

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers):
    """
    Args:
      input_dim: number of rows in the embedding table.
      embedding_dim: size of each embedding vector (input size of the LSTM).
      hidden_dim: size of the LSTM hidden state.
      output_dim: number of values produced by the final linear layer.
      num_layers: number of stacked LSTM layers.
    """
    super(LSTM, self).__init__()

    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, text):
    """Embed `text`, run it through the LSTM and project the top layer's final hidden state.

    `text` is presumably a (seq_len, batch) tensor of token indices - confirm
    against the iterator that feeds the model.
    """
    embedded = self.embedding(text)
    _, (hidden, _) = self.lstm(embedded)
    # `hidden` stacks one final state per layer along axis 0. Take the last
    # layer explicitly: squeeze(0) only removes that axis when num_layers == 1
    # and otherwise leaves a (num_layers, batch, hidden) tensor, which produced
    # one prediction per layer instead of one per sequence.
    top_hidden = hidden[-1]
    return self.fc(top_hidden)

In [None]:
# Hyperparameters shared by both models
num_embeddings = len(TEXT.vocab)  # one embedding row per vocabulary entry
embedding_dim = 100
hidden_dim = 256
output_dim = 1
num_layers = 1

# Build each model and move it to the selected device
RNN_model = RNN(num_embeddings, embedding_dim, hidden_dim, output_dim).to(device)
LSTM_model = LSTM(num_embeddings, embedding_dim, hidden_dim, output_dim, num_layers).to(device)

# display the LSTM architecture as the cell output
LSTM_model

### Train the model

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one training epoch over `iterator` and return the mean batch loss.

  Args:
    model: module called as `model(batch.text)`.
    iterator: sized iterable of batches exposing `.text` and `.labels`.
    optimizer: optimizer whose gradients are zeroed and stepped once per batch.
    criterion: loss called as `criterion(predictions, targets)`.

  Returns:
    float: sum of the per-batch losses divided by `len(iterator)`.
  """
  train_loss = 0.0

  model.train()
  # Use the iterator that was passed in; the original looped over the global
  # `train_iterator`, silently ignoring this parameter.
  for batch in iterator:
    optimizer.zero_grad()
    predict = model(batch.text)
    # add a trailing axis to the labels before they are compared with `predict`
    targets = (batch.labels).unsqueeze(1)
    loss = criterion(predict, targets)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()

  train_loss /= len(iterator)

  return train_loss

In [None]:
# Train the vanilla RNN: Adam optimizer, binary cross-entropy on raw logits
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(RNN_model.parameters(), lr = 1e-6)

num_epochs = 5
train_losses = []  # mean training loss of each epoch

for epoch in range(num_epochs):
  train_loss = train(RNN_model, train_iterator, optimizer, criterion)
  train_losses.append(train_loss)

  print('Epoch: {} | Training Loss: {:.2f}'.format(epoch, train_loss))

In [None]:
# Train the LSTM: Adam optimizer, binary cross-entropy on raw logits.
# The optimizer must own the LSTM's parameters; it was previously built from
# RNN_model.parameters(), so optimizer.step() never changed the LSTM weights.
optimizer = optim.Adam(LSTM_model.parameters(), lr = 1e-6)
criterion = nn.BCEWithLogitsLoss()

num_epochs = 5
train_losses = []  # mean training loss of each epoch (replaces the RNN run's list)

for epoch in range(num_epochs):
  train_loss = train(LSTM_model, train_iterator, optimizer, criterion)

  print('Epoch: {} | Training Loss: {:.2f}'.format(epoch, train_loss))

  train_losses.append(train_loss)

### Test the model