# "Imdb Classification"
> "Classification of IMDB dataset reviews to either positive or negative"

- toc: false
- branch: master
- badges: true
- comments: true
- categories: [jupyter, pytorch, pytorch-lightning]
- hide: false
- search_exclude: true

In [1]:
%%capture
!pip install -U transformers[torch] pytorch-lightning

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import BertModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
path = '/content/drive/MyDrive/my-datasets/text-classification/IMDB.csv'
df = pd.read_csv(path)
df['sentiment'] = df['sentiment'].map(lambda x: float(x=='positive'))
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1.0
1,A wonderful little production. <br /><br />The...,1.0
2,I thought this was a wonderful way to spend ti...,1.0
3,Basically there's a family where a little boy ...,0.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0


In [6]:
PRETRAINED_MODEL = 'bert-base-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained(PRETRAINED_MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
class ImdbDS(Dataset):
    def __init__(self, df, tokenizer, max_len):
        super().__init__()
        self.df = df
        self.review = df.review.to_numpy()
        self.sentiment = df.sentiment.to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, ix):
        text = str(self.review[ix])
        encoding = self.tokenizer.encode_plus(
            text,
            max_length = self.max_len,
            add_special_tokens = True,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        return {
            'review_text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'target': torch.tensor(self.sentiment[ix], dtype=torch.long)
        }
        
    def __len__(self):
        return len(self.review)
     

class ImdbDM(pl.LightningDataModule):
    def __init__(self, df, tokenizer, max_len):
        super().__init__()
        self.train_df, self.val_df = train_test_split(df)
        self.val_df, self.test_df = train_test_split(
            self.val_df, 
            test_size=0.2
        )
        self.train_ds = ImdbDS(self.train_df, tokenizer, max_len)
        self.val_ds = ImdbDS(self.val_df, tokenizer, max_len)    
        self.test_ds = ImdbDS(self.test_df, tokenizer, max_len)
    
    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=16, shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=16)
    
    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=16)

In [8]:
class textClassifier(pl.LightningModule):
    def __init__(self):
      super().__init__()
      self.bert = BertModel.from_pretrained(PRETRAINED_MODEL)
      for param in self.bert.parameters():
        param.requires_grad = False
      self.drop = nn.Dropout(0.2)
      self.linear = nn.Linear(self.bert.config.hidden_size, 2)
      self.softmax = nn.Softmax(dim=1)
      self.loss_fn = nn.CrossEntropyLoss()
        
    def forward(self, input_ids, attention_mask):
      output = self.bert(
          input_ids = input_ids,
          attention_mask = attention_mask
      )
      output = self.drop(output.pooler_output)
      output = self.linear(output)
      return self.softmax(output)
    
    def unpack_batch(self, batch):
      input_ids = batch['input_ids']
      attention_mask = batch['attention_mask']
      target = batch['target']
      return input_ids, attention_mask, target

    def training_step(self, batch, batch_idx):
      input_ids, attention_mask, y = self.unpack_batch(batch)
      y_hat = self(input_ids, attention_mask)
      loss = self.loss_fn(y_hat, y)
      matches = [torch.argmax(i) == j for i,j in zip(y_hat,y)]
      acc = sum(matches)/len(matches)
      self.log('acc', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
      return {'loss':loss, 'acc': acc}

    def validation_step(self, batch, batch_idx):
      input_ids, attention_mask, y = self.unpack_batch(batch)
      y_hat = self(input_ids, attention_mask)
      loss = self.loss_fn(y_hat, y)
      matches = [torch.argmax(i) == j for i,j in zip(y_hat,y)]
      val_acc = sum(matches)/len(matches)
      self.log('val_acc', val_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
      return {'loss':loss, 'val_acc': val_acc}
        
    def configure_optimizers(self):
      return optim.Adam(self.parameters(), lr=1e-3)

    def predict(self, batch):
      input_ids, attention_mask, y = self.unpack_batch(batch)
      preds = self(input_ids, attention_mask)
      return torch.argmax(preds, dim=1)
    
    def evaluate(self, batch):
      input_ids, attention_mask, y = self.unpack_batch(batch)
      preds = self.predict(batch)
      matches = (preds == y)
      acc = sum(matches)/len(matches)
      return acc.item()

    def get_progress_bar_dict(self):
      tqdm_dict = super().get_progress_bar_dict()
      if 'v_num' in tqdm_dict:
          del tqdm_dict['v_num']
      return tqdm_dict

In [9]:
if __name__ == '__main__':
  net = textClassifier()
  MAX_LEN = 160
  dm = ImdbDM(df, tokenizer, MAX_LEN)
  trainer = pl.Trainer(max_epochs=5, gpus=[0])
  trainer.fit(net, dm)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | bert    | BertModel        | 109 M 
1 | drop    | Dropout          | 0     
2 | linear  | Linear           | 1.5 K 
3 | softmax | Softmax          | 0     
4 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
1.5 K     Trainable params
109 M     Non-trainable params
109 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [11]:
data = next(iter(dm.test_dataloader()))
ys = data['target']
preds = net.predict(data)
print(preds, ys)

acc = net.evaluate(data)
print(acc)



tensor([1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1]) tensor([1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1])
0.8125
