In [1]:
import os

import pandas as pd
import numpy as np
import torch
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments, get_scheduler)

from torch.utils.data import DataLoader
from torch.optim import AdamW

from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
os.getcwd()

'/Users/raunakanand/Documents/Work_R/HF_transformers/Sentiment_Analysis/notebooks'

In [3]:
train = pd.read_csv('../data_source/train.csv', encoding='unicode_escape')
test = pd.read_csv('../data_source/test.csv', encoding='unicode_escape')

display(train.head(2))
display(test.head(2))

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105


Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0


In [4]:
sentiment_label = {
    'neutral': 1, 
    'negative': 0, 
    'positive': 2
}

In [5]:
# train has one missing item in text and selected text
# test has 1281 rows with all missing values
print(train.isnull().sum())
print(test.isnull().sum())

textID              0
text                1
selected_text       1
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64
textID              1281
text                1281
sentiment           1281
Time of Tweet       1281
Age of User         1281
Country             1281
Population -2020    1281
Land Area (Km²)     1281
Density (P/Km²)     1281
dtype: int64


In [6]:
train = train[['text', 'sentiment']].dropna()
train['sentiment'] = train['sentiment'].apply(lambda x: sentiment_label[x])

test = test[['text', 'sentiment']].dropna()
test['sentiment'] = test['sentiment'].apply(lambda x: sentiment_label[x])

In [7]:
## distribution of classes to predict for classification is uniform in train and test set
print(train['sentiment'].value_counts(normalize=True))
print(test['sentiment'].value_counts(normalize=True))

sentiment
1    0.404549
2    0.312300
0    0.283151
Name: proportion, dtype: float64
sentiment
1    0.404641
2    0.312111
0    0.283248
Name: proportion, dtype: float64


In [40]:
model_ckpt = 'distilbert/distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt, use_fast=True)


In [41]:
train_text, val_text, train_labels, val_labels = train_test_split(train['text'], train['sentiment'], test_size=0.2, random_state=42, shuffle=True)

In [42]:
def make_dataset(data_encodings):
    return [{k:v[i] for k,v in data_encodings.items()} for i in range(len(data_encodings['input_ids']))]

In [43]:
tokenized_train = tokenizer(train_text.tolist(), padding=True, truncation=True, return_tensors='pt')
tokenized_train['labels'] = torch.tensor(train_labels.to_list())
tokenized_train = make_dataset(tokenized_train)

tokenized_val = tokenizer(val_text.to_list(), padding=True, truncation=True, return_tensors='pt')
tokenized_val['labels'] = torch.tensor(val_labels.to_list())
tokenized_val = make_dataset(tokenized_val)

tokenized_test = tokenizer(test['text'].to_list(), padding=True, truncation=True, return_tensors='pt')
tokenized_test['labels'] = torch.tensor(test['sentiment'].to_list())
tokenized_test = make_dataset(tokenized_test)


In [44]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred,axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {'accuracy': accuracy, 'f1_score': f1}

In [45]:
def lod_to_dol(ls):
    out = {}
    for dict in ls:
        for k,v in dict.items():
            if(k in out):
                out[k].append(v)
            else:
                out[k] = [v]
            # print(out[k])
    return {k : torch.stack(v) for k,v in out.items()}

# lod_to_dol(batch)

### Initializing model and its parameters

In [66]:
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_ckpt, num_labels=3)
optimizer = AdamW(params=model.parameters(), lr=0.0001)
epochs = 1
batch_size = 16
num_training_steps = math.ceil(epochs * (len(tokenized_train)/batch_size))
print(num_training_steps)

lr_scheduler = get_scheduler(name='linear',
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

device = torch.device('mps')
print(device)

model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1374
mps


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

#### Training Code with evalution of model every 400 steps

In [67]:
progress_bar = tqdm(range(num_training_steps))
for epoch in range(epochs):
    for step in (range(num_training_steps)):
        # print(step)
        if(step % 400 == 0 and step!=0):
            model.eval()
            f1 = []
            accuracy = []
            num_val_steps = math.ceil(len(tokenized_val)/batch_size)

            for eval_step in tqdm(range(num_val_steps)):
                batch = tokenized_val[(batch_size * eval_step) : batch_size * (eval_step + 1)]
                batch = {k: v.to(device) for k, v in lod_to_dol(batch).items()}
                with torch.no_grad():
                    output = model(**batch)
                y_pred = torch.argmax(output.logits, dim=-1).to('cpu')
                accuracy.append(accuracy_score(y_pred=y_pred, y_true=batch['labels'].to('cpu')))
                f1.append(f1_score(y_pred=y_pred, y_true=batch['labels'].to('cpu'), average='weighted'))
            print("Accuracy_score : {}, \n f1score : {}". format(np.array(accuracy).mean(), np.array(f1).mean()))
            print(f"model loss : {output.loss}\n\n")
        
        model.train()
        batch = tokenized_train[(batch_size * step): batch_size * (step + 1)]
        # batch = [{k: v.to(device)} for x in batch for k,v in x.items()]
        # output = model(batch)
        batch = {k: v.to(device) for k,v in lod_to_dol(batch).items()}

        output = model(**batch)
        loss = output.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update()



  0%|          | 0/1374 [00:00<?, ?it/s]

  0%|          | 0/344 [00:00<?, ?it/s]

Accuracy_score : 0.7569040697674418, 
 f1score : 0.7543386108181137
model loss : 0.8792089223861694




  0%|          | 0/344 [00:00<?, ?it/s]

Accuracy_score : 0.7739825581395349, 
 f1score : 0.7731128488639178
model loss : 0.8709303140640259




  0%|          | 0/344 [00:00<?, ?it/s]

Accuracy_score : 0.7868822674418605, 
 f1score : 0.7868892248141676
model loss : 0.5186277627944946




### Performance of model on test data

In [68]:
model.eval()
f1 = []
accuracy = []
num_val_steps = math.ceil(len(tokenized_test)/batch_size)
for eval_step in tqdm(range(num_val_steps)):
    batch = tokenized_val[(batch_size * eval_step) : batch_size * (eval_step + 1)]
    batch = {k: v.to(device) for k, v in lod_to_dol(batch).items()}
    with torch.no_grad():
        output = model(**batch)
    y_pred = torch.argmax(output.logits, dim=-1).to('cpu')
    accuracy.append(accuracy_score(y_pred=y_pred, y_true=batch['labels'].to('cpu')))
    f1.append(f1_score(y_pred=y_pred, y_true=batch['labels'].to('cpu'), average='weighted'))
print("Accuracy_score : {}, \n f1score : {}". format(np.array(accuracy).mean(), np.array(f1).mean()))

  0%|          | 0/221 [00:00<?, ?it/s]

Accuracy_score : 0.7918552036199095, 
 f1score : 0.7917653036178728
