<a href="https://colab.research.google.com/github/pooriaazami/deep_learning_class_notebooks/blob/main/11_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files

In [None]:
files.upload()

In [4]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d yasserh/imdb-movie-ratings-sentiment-analysis

Downloading imdb-movie-ratings-sentiment-analysis.zip to /content
 92% 19.0M/20.6M [00:01<00:00, 18.2MB/s]
100% 20.6M/20.6M [00:01<00:00, 10.9MB/s]


In [6]:
!unzip imdb-movie-ratings-sentiment-analysis.zip
!rm imdb-movie-ratings-sentiment-analysis.zip

Archive:  imdb-movie-ratings-sentiment-analysis.zip
  inflating: movie.csv               


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_value_

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

In [28]:
DATASET_PATH = 'movie.csv'
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'

BATCH_SIZE = 8
SEQ_LENGTH = 512
LR = 1e-3
CLIP_VALUE = 100
LATENT_DIM = 64
EPOCHS = 10

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
class IMDBDataset(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y

    self.__iterator_idx = -1

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self, idx):
    return self.x.iloc[idx], self.y.iloc[idx]

  def __iter__(self):
    self.__iterator_idx = -1
    return self

  def __next__(self):
    self.__iterator_idx += 1
    if self.__iterator_idx >= self.x.shape[0]:
      raise StopIteration
    else:
      # print(self.__iterator_idx)
      return self.x.iloc[self.__iterator_idx], self.y.iloc[self.__iterator_idx]

In [10]:
class TextClassifier(nn.Module):
  def __init__(self, latent_dim, vocab_size, num_classes):
    super().__init__()

    self.__latent_dim = latent_dim
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=latent_dim)
    self.rnn = nn.LSTM(input_size=latent_dim, hidden_size=latent_dim, batch_first=True)
    self.fc = nn.Linear(in_features=latent_dim, out_features=num_classes)

  def forward(self, x):
    x = self.embedding(x)

    batch_size = x.size()[0]
    h_0 = torch.zeros(1, batch_size, self.__latent_dim).to(DEVICE)
    c_0 = torch.zeros(1, batch_size, self.__latent_dim).to(DEVICE)
    x, _ = self.rnn(x, (h_0, c_0))
    x = x[:, -1, :]

    x = self.fc(x)
    return x

In [11]:
tokenizer = get_tokenizer('basic_english')

In [12]:
def text_iterator(dataset):
  for x, _ in dataset:
    yield tokenizer(x)

In [13]:
def transform_text(x):
  x = tokenizer(x)
  x = vocabulary(x)

  pad_length = SEQ_LENGTH - len(x)
  x = torch.tensor(x, dtype=torch.int64)
  x = F.pad(x, (0, pad_length), value=vocabulary[PAD_TOKEN])

  return x

def collate_fn(batch):
  text_list, label_list = [], []

  for text, label in batch:
    text = transform_text(text)

    text_list.append(text)
    label_list.append(label)

  text_list = torch.vstack(text_list)
  label_list = torch.tensor(label_list, dtype=torch.float32).unsqueeze(-1)

  return text_list, label_list

In [14]:
df = pd.read_csv(DATASET_PATH)

In [15]:
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [16]:
df = df.sample(frac=1) # Shuffling the dataset

X = df['text']
Y = df['label']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.2)

In [17]:
train_dataset = IMDBDataset(x_train, y_train)
test_dataset = IMDBDataset(x_test, y_test)

In [18]:
vocabulary = build_vocab_from_iterator(text_iterator(train_dataset), specials=[UNK_TOKEN, PAD_TOKEN])
vocabulary.set_default_index(vocabulary[UNK_TOKEN])

In [19]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
validation_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [20]:
loss_function = nn.BCEWithLogitsLoss()

In [29]:
model = TextClassifier(latent_dim=LATENT_DIM, vocab_size=len(vocabulary), num_classes=1).to(DEVICE)

In [30]:
optimizer = optim.Adam(model.parameters(), lr=LR)

In [25]:
def test_model():
  total_loss = .0
  accuracy = .0
  with torch.no_grad():
    for x, y in validation_dataloader:
      x = x.to(DEVICE)
      y = y.to(DEVICE)

      preds = model(x)

      total_loss += loss.detach().cpu().item()
      accuracy += torch.sum((preds > 0) == y)

  accuracy = accuracy / len(test_dataset) * 100
  return total_loss, accuracy

In [31]:
for i in range(1, EPOCHS + 1):
  print(f'Epoch {i} / {EPOCHS}')
  total_loss = .0
  accuracy = .0
  for x, y in tqdm(train_dataloader):
    optimizer.zero_grad()

    x = x.to(DEVICE)
    y = y.to(DEVICE)

    preds = model(x)
    loss = loss_function(preds, y)
    loss.backward()

    clip_grad_value_(model.parameters(), CLIP_VALUE)
    optimizer.step()

    total_loss += loss.detach().cpu().item()
    accuracy += torch.sum((preds > 0) == y)

  accuracy = accuracy / len(train_dataset) * 100
  val_loss, val_accuracy = test_model()
  print(f'loss: {total_loss:.2f}, accuracy: {accuracy:.2f} % | val_loss: {val_loss:.2f}, val_accuracy: {val_accuracy:.2f} %')

Epoch 1 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 2773.21, accuracy: 50.48 % | val_loss: 708.84, val_accuracy: 50.65 %
Epoch 2 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 2742.66, accuracy: 51.96 % | val_loss: 713.52, val_accuracy: 51.06 %
Epoch 3 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 2668.50, accuracy: 53.40 % | val_loss: 665.32, val_accuracy: 51.33 %
Epoch 4 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 2364.23, accuracy: 66.08 % | val_loss: 609.14, val_accuracy: 77.11 %
Epoch 5 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 1650.68, accuracy: 82.69 % | val_loss: 170.96, val_accuracy: 82.03 %
Epoch 6 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 1244.40, accuracy: 87.88 % | val_loss: 190.48, val_accuracy: 82.84 %
Epoch 7 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 914.82, accuracy: 91.50 % | val_loss: 72.06, val_accuracy: 84.74 %
Epoch 8 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 745.38, accuracy: 93.56 % | val_loss: 85.57, val_accuracy: 85.81 %
Epoch 9 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 541.25, accuracy: 95.64 % | val_loss: 120.46, val_accuracy: 85.99 %
Epoch 10 / 10


  0%|          | 0/4000 [00:00<?, ?it/s]

loss: 396.95, accuracy: 96.86 % | val_loss: 13.37, val_accuracy: 85.95 %


In [45]:
sample_comment = "It was the best movie I have ever seen in my whole life"
text = transform_text(sample_comment).unsqueeze(0).to(DEVICE)
pred = model(text)

In [46]:
pred

tensor([[2.0903]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [47]:
1 / (1 + torch.exp(-pred))

tensor([[0.8900]], device='cuda:0', grad_fn=<MulBackward0>)

In [51]:
embedding_matrix = model.embedding._parameters['weight'].detach().cpu().numpy()
tokens = vocabulary.get_itos()

In [53]:
with open('embeddings.tsv', 'w') as file:
  for row in embedding_matrix:
    row = '\t'.join([str(x) for x in row])
    file.write(row + '\n')

In [54]:
with open('tokens.tsv', 'w') as file:
  for token in tokens:
    file.write(token + '\n')

In [55]:
!zip embeddings.zip embeddings.tsv tokens.tsv

  adding: embeddings.tsv (deflated 57%)
  adding: tokens.tsv (deflated 58%)


In [None]:
files.download('embeddings.zip') # You can visualize with: projector.tensorflow.org