In [2]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification,AdamW
import torch
from torch.utils.data import DataLoader
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
import tqdm
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [77]:
!nvidia-smi

Mon Jan  4 12:46:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.130                Driver Version: 384.130                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Graphics Device     Off  | 00000000:41:00.0 Off |                    0 |
| N/A   36C    P0    36W / 250W |  16522MiB / 32502MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Graphics Device     Off  | 00000000:C1:00.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |     10MiB / 32502MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [3]:
def accuracy(out_logits,labels):
    predicted = out_logits.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    predicted = np.argmax(predicted,axis=1).reshape(labels.shape)
    return np.mean(labels == predicted)

def val_params(model,val_loader):
    temp = model.eval()
    num_batches = 0
    loss_sum = 0
    accuracy_sum =0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss_sum += outputs.loss.item()
        accuracy_sum += accuracy(outputs.logits,labels)
        num_batches +=1
    return loss_sum/num_batches,accuracy_sum/num_batches

class EmpathyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [4]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
x = pickle.load(open("../data/x_train.p","rb"))
y = pickle.load(open("../data/y_train.p","rb"))

In [9]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15, random_state=42)
x_train, x_val1, y_train, y_val1 = train_test_split(x, y, test_size=0.6, random_state=42)

In [11]:
len(x_train)

15737

In [12]:
pd.set_option( "display.max_columns", None)
df=pd.DataFrame({
    'x':x_val,
    'y':y_val
})
df

Unnamed: 0,x,y
0,"Last week , my coworkers all got together and ...",1
1,He's out of John's hands . Nothing he can do a...,0
2,"Oh , I see . Then do you have anything less ex...",0
3,"I don't , but I couldn't find anything to eat ...",0
4,Have you anything to declare ?,0
...,...,...
5897,My dog pooped on the rug yesterday. I was so mad!,1
5898,I can see five other guys eying her up .,0
5899,I would have never dreamed of ever becoming a ...,0
5900,I asked a girl on a date and got turned down l...,1


In [13]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [13]:
tokenizer.save_pretrained("./model/BERT_empathy_fine_tuned/")

('./model/BERT_empathy_fine_tuned/tokenizer_config.json',
 './model/BERT_empathy_fine_tuned/special_tokens_map.json',
 './model/BERT_empathy_fine_tuned/vocab.txt',
 './model/BERT_empathy_fine_tuned/added_tokens.json')

In [14]:
train_encodings = tokenizer(x_train,truncation=True, padding='longest', return_tensors="pt")
val_encodings = tokenizer(x_val,truncation=True, padding='longest', return_tensors="pt")

In [15]:
train_dataset = EmpathyDataset(train_encodings,y_train)
val_dataset = EmpathyDataset(val_encodings,y_val)

In [16]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [17]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
x=model.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [18]:
optim = AdamW(model.parameters(), lr=5e-5)
weight = torch.Tensor([0.6,0.4]).to(device)
criterion = nn.CrossEntropyLoss(weight=weight)

In [19]:
EPOCHS=5
prev_val_acc = -1
for epoch in range(EPOCHS):
    temp = model.train()
    loss_sum = 0
    accuracy_sum = 0
    num_batch = 0
    pbar = tqdm.tqdm(train_loader)
    for batch in pbar:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits,labels)
        loss.backward()
        optim.step()
        loss_sum += loss.item()
        accuracy_sum += accuracy(outputs.logits,labels)
        num_batch+=1
        pbar.set_description("Epoch: %s, Train loss: %f, Train accuracy: %f"%(epoch,loss_sum/num_batch,accuracy_sum/num_batch))
    
    val_metric = val_params(model,val_loader)
    sys.stdout.write("         Val loss: %f, Val accuracy: %f"%val_metric)
    sys.stdout.flush()
    
    #Breaking criteria
    if prev_val_acc > val_metric[1]:
        break
    
    prev_val_acc = val_metric[1]
    
    #saving model checkpoint
    model.save_pretrained("../model/BERT_empathy_fine_tuned/")
    


Epoch: 0, Train loss: 0.106260, Train accuracy: 0.960493: 100%|██████████| 492/492 [02:40<00:00,  3.06it/s]


         Val loss: 0.066019, Val accuracy: 0.975676

Epoch: 1, Train loss: 0.034300, Train accuracy: 0.988694: 100%|██████████| 492/492 [02:39<00:00,  3.08it/s]


         Val loss: 0.083655, Val accuracy: 0.975507

In [63]:
temp = model.eval()
data = iter(val_loader).next()
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
labels = data['labels'].to(device)
output = model(input_ids, attention_mask=attention_mask)

In [64]:
pred_labels = torch.argmax(output.logits,dim=1).detach().cpu().numpy()
labels_numpy = labels.detach().cpu().numpy()

In [65]:
torch.argmax(output.logits,dim=1)

tensor([1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
        0, 0, 1, 1, 0, 0, 1, 1], device='cuda:0')

In [70]:
tokenizer.decode(input_ids[3],skip_special_tokens=True)

'well, i guess i exercise about two hours a day. i do aerobics three times a week, and the other days i play badminton 1 with my husband. i always feel good afterward.'

In [67]:
pd.DataFrame({
    'Predicted':pred_labels,
    'Target':labels_numpy
})

Unnamed: 0,Predicted,Target
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,1,1
6,1,1
7,1,1
8,1,1
9,0,0


## Checking custom sentences

In [20]:
texts = [
    'How are you',
    "I ate fish",
    "I am a boy",
    "Do you know me",
    "I went to market yesterday to purchase food",
    "Do I have a better chance to get this job"
]
tokens = tokenizer(texts,truncation=True, padding='longest', return_tensors="pt")
input_ids = tokens['input_ids'].to(device)
attention_mask = tokens["attention_mask"].to(device)
output = model(input_ids, attention_mask=attention_mask)
pred_labels = torch.argmax(output.logits,dim=1).detach().cpu().numpy()
df = pd.DataFrame({
    "text":texts,
    "labels":pred_labels
})
df

Unnamed: 0,text,labels
0,How are you,0
1,I ate fish,1
2,I am a boy,1
3,Do you know me,0
4,I went to market yesterday to purchase food,1
5,Do I have a better chance to get this job,1


## Simple model training

In [None]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_val = 

In [None]:
lr = LogisticRegression(max_iter=10000)
lr.fit(x_train,y_train)


In [7]:
model = DistilBertForSequenceClassification.from_pretrained("../model/BERT_empathy_fine_tuned/").to(device)

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained("../model/BERT_empathy_fine_tuned/")

In [26]:
x = pickle.load(open("../data/x_test.p","rb"))
y = pickle.load(open("../data/y_test.p","rb"))

In [12]:
encodings = tokenizer(x,truncation=True, padding='longest', return_tensors="pt")
empathy_dataset_test = EmpathyDataset(encodings,y)
test_data_loader = DataLoader(empathy_dataset_test,batch_size=BATCH_SIZE, shuffle=True)
x = model.eval()

In [13]:
x = val_params(model,test_data_loader)

In [14]:
x

(0.0536944548801137, 0.9830069124423964)

In [43]:
print("Test accuracy is ",x[1])

Test accuracy is  0.9830069124423964


In [29]:
l = [
    "I am a boy",
    "How are you?",
    "Do I have a better chance to get this job"
]
encodings = tokenizer(x[0:100],truncation=True, padding='longest', return_tensors="pt")

In [22]:
encodings

{'input_ids': tensor([[ 101, 1045, 2572, 1037, 2879,  102,    0,    0,    0,    0,    0,    0],
        [ 101, 2129, 2024, 2017, 1029,  102,    0,    0,    0,    0,    0,    0],
        [ 101, 2079, 1045, 2031, 1037, 2488, 3382, 2000, 2131, 2023, 3105,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
out = model(input_ids = encodings['input_ids'].to(device),attention_mask=encodings['attention_mask'].to(device))

RuntimeError: CUDA out of memory. Tried to allocate 94.00 MiB (GPU 0; 31.74 GiB total capacity; 30.68 GiB already allocated; 10.25 MiB free; 66.08 MiB cached)

In [24]:
out.logits

tensor([[-3.4387,  3.3785],
        [ 3.6995, -3.9435],
        [-2.8641,  2.8001]], device='cuda:0', grad_fn=<AddmmBackward>)

In [25]:
from sklearn.metrics import f1_score

In [None]:
def val_par(model,val_loader):
    temp = model.eval()
    num_batches = 0
    loss_sum = 0
    accuracy_sum =0
    predicted = []
    target = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        predicted = out_logits.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        predicted = np.argmax(predicted,axis=1).reshape(labels.shape)
    return loss_sum/num_batches,accuracy_sum/num_batches