In [1]:
import torch
import transformers
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from torch.utils.data import Dataset

In [2]:
import os
test = "E:/Academics/IIT Bombay/aclImdb/test/"
train = "E:/Academics/IIT Bombay/aclImdb/train/"

Test_list = []
Train_list = []

for i in ['pos','neg']:
    
    with os.scandir(test+i) as entries:
        files = []
        for entry in entries:
            if entry.is_file():
                files += [entry.name]
        for j in files:
            ID = j.split('_')[0]
            Label = (j.split('_')[1]).split('.')[0]
            try:
                with open(test+i+'/'+j,'r', encoding="utf8") as t:
                    Text = t.read()
                if(i=='pos'):
                    Test_list += [[ID, Text, Label, 1]]
                else:
                    Test_list += [[ID, Text, Label, 0]]
            except:
                continue
    
    with os.scandir(train+i) as entries:
        files = []
        for entry in entries:
            if entry.is_file():
                files += [entry.name]
        for j in files:
            ID = j.split('_')[0]
            Label = (j.split('_')[1]).split('.')[0]
            try:
                with open(train+i+'/'+j,'r', encoding="utf8") as t:
                    Text = t.read()
                if(i=='pos'):
                    Train_list += [[ID, Text, Label, 1]]
                else:
                    Train_list += [[ID, Text, Label, 0]] 
            except:
                continue

Train = pd.DataFrame(columns=['ID','Text','Label','Pos/Neg'],data = Train_list)
Test  = pd.DataFrame(columns=['ID','Text','Label','Pos/Neg'],data = Test_list)

In [3]:
from sklearn.model_selection import train_test_split   

x_train, x_val, y_train, y_val = train_test_split(Train[['Text']], Train[['Label','Pos/Neg']], train_size = 0.8)
x_test = Test[['Text']]
y_test = Test[['Label','Pos/Neg']]

x_train = list(x_train['Text'])
x_test  = list(x_test['Text'])
x_val   = list(x_val['Text'])

In [4]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained( 'distilbert-base-uncased')

In [5]:
class MyDatatype(Dataset):
    
    def __init__(self, X_d, X_p, y_data):
        self.X_d = X_d
        self.X_p = X_p
        self.y_data = y_data
        
    def __getitem__(self, index):
        #(torch.Tensor(self.X_data[index]['input_ids']), torch.Tensor(self.X_data[index]['attention_mask']))
        return torch.Tensor(self.X_d[index]).to(torch.int64), torch.Tensor(self.X_p[index]).to(torch.int64), self.y_data[index]
        
    def __len__ (self):
        return len(self.X_d)


In [6]:
x_train = tokenizer(x_train,return_tensors='np', truncation=True, add_special_tokens=True, padding='max_length')
x_test  = tokenizer(x_test, return_tensors='np', truncation=True, add_special_tokens=True, padding='max_length')
x_val   = tokenizer(x_val, return_tensors='np', truncation=True, add_special_tokens=True, padding='max_length')

In [7]:
Train = MyDatatype(x_train['input_ids'], x_train['attention_mask'], torch.Tensor(np.float32(y_train.to_numpy())))
Test  = MyDatatype(x_test['input_ids'], x_test['attention_mask'], torch.Tensor(np.float32(y_train.to_numpy())))
Valid = MyDatatype(x_val['input_ids'], x_val['attention_mask'], torch.Tensor(np.float32(y_train.to_numpy())))

In [8]:
from transformers import DistilBertModel

In [9]:
from torch.utils.data import DataLoader

batch_size = 16
train_loader = DataLoader(dataset=Train, batch_size = batch_size,shuffle=True,drop_last=True)
val_loader   = DataLoader(dataset=Valid, batch_size = batch_size,shuffle=True,drop_last=True)

In [10]:
class Pooler(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = torch.nn.Tanh()

    def forward(self, hidden_states):
        first_token_tensor = hidden_states[:, 0]
        out = self.dense(first_token_tensor)
        out = self.activation(out)
        return out

In [11]:
from torch import nn

class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.base_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.bertpooler = Pooler(self.base_model.config)
        self.dropout = nn.Dropout2d(0.5)
        self.linear = nn.Linear(self.base_model.config.hidden_size,2)
        self.app1 = nn.Sigmoid()

    def forward(self, xids, xmask):
        x = self.base_model(input_ids = xids, attention_mask = xmask)
        x = self.bertpooler(x.last_hidden_state)
        x = self.dropout(x)
        x = self.linear(x)
        x = self.app1(x)
        return x

In [12]:
device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu')
model = NeuralNet().to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [14]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(y_pred)

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [15]:
from tqdm import tqdm
EPOCHS = 2
model = NeuralNet()
model.to(device)
print(model)
stats = []
for e in range(1, EPOCHS+1):
    model.train()
    stat = []
    epoch_loss = 0
    epoch_acc = 0
    counter = 0
    for X_inputs,X_attention, y_batch in tqdm(train_loader):
        X_inputs,X_attention, y_batch = X_inputs.to(device), X_attention.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_inputs, X_attention)
        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        #if(counter%50)==0:
            #print(f"Done {counter} steps in epoch {e}")
    print(f'Epoch {e}: | Train_Loss: {epoch_loss/len(train_loader):.5f} | Train_Acc: {epoch_acc/len(train_loader):.3f}')
    stat += [e,epoch_loss/len(train_loader),epoch_acc/len(train_loader)]    
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    for X_inputs,X_attention, y_batch in train_loader:
        X_inputs,X_attention, y_batch = X_inputs.to(device), X_attention.to(device), y_batch.to(device)
        with torch.no_grad():
            y_pred = model(X_inputs, X_attention)
        loss = criterion(y_pred, y_batch)
        acc = binary_acc(y_pred, y_batch)
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    print(f'          | Valid_Loss: {epoch_loss/len(train_loader):.5f} | Valid_Acc: {epoch_acc/len(train_loader):.3f}')
    stat += [epoch_loss/len(train_loader),epoch_acc/len(train_loader)]
    stats += [stat]

Stats = pd.DataFrame(columns=['Epoch','Train_Loss','Train_Accuracy', 'Validation_Loss','Validation_Accuracy'],data=stats)
plt.plot(Stats['Epoch'], Stats['Train_Loss'],'r')
plt.plot(Stats['Epoch'], Stats['Validation_Loss'],'g')
plt.xlabel('Epochs')
plt.ylabel(f"Losses")
plt.legend(['Train Loss','Validation Loss'])
plt.show()
plt.plot(Stats['Epoch'], Stats['Train_Accuracy'],'r')
plt.plot(Stats['Epoch'], Stats['Validation_Accuracy'],'b')
plt.legend(['Train Accuracy','Validation Accuracy'])
plt.xlabel('Epochs')
plt.ylabel(f"Accuracy")
plt.show()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NeuralNet(
  (base_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_fea

  4%|▍         | 52/1250 [11:30<4:23:15, 13.18s/it]