In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under /content/drive/MyDrive. This import only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast, TFBertModel, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Read the training CSV and keep only the text and sentiment columns.
train_csv_path = '/content/drive/MyDrive/archive/train.csv'
train_df = pd.read_csv(train_csv_path, encoding='iso-8859-1')[['text', 'sentiment']]

In [None]:
# Preview the first rows of the loaded frame (text and sentiment columns).
train_df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


# Data Processing

In [None]:
import re
# Anything that is neither a word character nor whitespace, plus the explicit
# emoji ranges. Compiled once because the function is called for every row.
_NON_WORD_RE = re.compile(
    r'[^\w\s]|[\U0001f600-\U0001f64f\U0001f300-\U0001f5ff\U0001f680-\U0001f6ff\U0001f1e0-\U0001f1ff]'
)
# One or more consecutive whitespace characters.
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def preprocessing(text):
    """Strip punctuation/emoji from ``text`` and collapse whitespace runs.

    Every character that is not a word character or whitespace is replaced
    by a space, then each run of whitespace is collapsed to a single space.
    Leading/trailing whitespace is collapsed but not removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The separate pass that replaced ". , / -" was redundant: those
    # characters are already matched by the non-word pattern. The patterns
    # are raw strings, so no invalid-escape warnings are emitted.
    text = _NON_WORD_RE.sub(" ", text)
    return _WHITESPACE_RUN_RE.sub(" ", text)
# Clean every training text.
# Rows with a missing text are dropped FIRST: converting NaN with str() yields
# the literal text "nan", which a later dropna can no longer detect.
# The index is reset so positional and label indexing agree afterwards.
train_df = (
    train_df
    .dropna(subset=["text"])
    .assign(text=lambda frame: frame["text"].astype(str).map(preprocessing))
    .reset_index(drop=True)
)

In [None]:
# Remove rows without text, then encode the sentiment strings as class ids.
train_df.dropna(subset=['text'], inplace=True)
sentiment_to_id = {"neutral":0,"positive":1,"negative":2}
train_df["sentiment"] = train_df["sentiment"].replace(sentiment_to_id)

# Number of training examples per class id
sentiment_counts = train_df['sentiment'].value_counts()
print("train_df ", sentiment_counts)

Train_data  sentiment
0    11118
1     8582
2     7781
Name: count, dtype: int64


In [None]:
# Stratified split of the labelled data: 85% for training, 15% for validation.
split_parts = train_test_split(
    train_df['text'],
    train_df['sentiment'],
    test_size=0.15,
    random_state=2024,
    stratify=train_df['sentiment'],
)
train_text, val_text, train_labels, val_labels = split_parts

# Tokenization

In [None]:
# Longest text in each split, measured in whitespace-separated words.
# NOTE(review): this value is later passed to the tokenizer as max_length,
# which presumably counts word-piece tokens plus special tokens, so a word
# count may be too small and cause truncation -- confirm.
max_seq_train_len = max(len(text.split()) for text in train_text)
max_seq_val_len = max(len(text.split()) for text in val_text)
max_seq_len = max(max_seq_train_len, max_seq_val_len)

In [None]:
# import BERT-base pretrained model
# NOTE(review): this standalone encoder is not used by the tokenization below;
# the classifier built later is a separate BertForSequenceClassification.
bert = AutoModel.from_pretrained('bert-base-uncased')
# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Shared encoding options: pad/truncate every sequence to max_seq_len.
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
encode_kwargs = dict(
    max_length=max_seq_len,
    padding="max_length",
    truncation=True,
)

# tokenize and encode sequences in the training set
tokens_train = tokenizer(train_text.tolist(), **encode_kwargs)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer(val_text.tolist(), **encode_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Convert Integer Sequences to Tensors

In [None]:
# Training split: token ids, attention masks and labels as aligned tensors.
train_input_ids = torch.tensor(tokens_train['input_ids'])
train_attention_mask = torch.tensor(tokens_train['attention_mask'])
train_label_ids = torch.tensor(train_labels.tolist())
train_tensorData = TensorDataset(train_input_ids, train_attention_mask, train_label_ids)

# Validation split, built the same way.
val_input_ids = torch.tensor(tokens_val['input_ids'])
val_attention_mask = torch.tensor(tokens_val['attention_mask'])
val_label_ids = torch.tensor(val_labels.tolist())
val_tensorData = TensorDataset(val_input_ids, val_attention_mask, val_label_ids)


# DataLoaders

In [None]:
batch_size = 32

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_tensorData)
train_dfloader = DataLoader(
    train_tensorData,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation batches are read in their stored order.
val_sampler = SequentialSampler(val_tensorData)
val_dataloader = DataLoader(
    val_tensorData,
    batch_size=batch_size,
    sampler=val_sampler,
)


# Model Architecture

In [None]:
# BERT encoder with a linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3, # The number of output labels--3 for pos/neu/neg classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
model = model.to(device) # move the model to the selected device
# Every parameter of `model` stays trainable (full fine-tuning).
# A loop that used to follow here set requires_grad=False on the separate
# `bert` object, which is not part of `model`, so it never froze anything that
# is trained. It was removed to avoid implying that the encoder is frozen.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr = 1e-5)

# "balanced" class weights computed from the training labels.
class_wts = compute_class_weight(class_weight = "balanced",
                                 classes= np.unique(train_labels), y= train_labels)
weights = torch.tensor(class_wts, dtype=torch.float) # convert class weights to tensor
weights = weights.to(device) # move to the selected device

# Weighted loss for raw logits. CrossEntropyLoss applies log-softmax itself;
# NLLLoss would require log-probabilities, which the classifier does not emit.
# NOTE: the training/evaluation functions in this notebook read the loss
# returned by the model, so this criterion only takes effect if it is called
# explicitly as `cross_entropy(logits, labels)`.
cross_entropy = nn.CrossEntropyLoss(weight=weights)

epochs = 10 # Number of training epochs



# Fine-tuning

In [None]:
# One training epoch over the training DataLoader.
def train():
    """Train the model for one pass over ``train_dfloader``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the summed
        batch loss divided by the number of batches and ``total_preds`` is the
        per-batch logits concatenated along axis 0 as a NumPy array.
    """
    model.train()
    running_loss = 0
    batch_logits = []
    n_batches = len(train_dfloader)

    for step, batch in enumerate(train_dfloader):
        # Progress update every 100 batches (never for the first batch).
        if step != 0 and step % 100 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        # Move the batch to the target device and unpack it.
        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        model.zero_grad()  # discard gradients left over from the previous step
        output = model(
            sent_id,
            token_type_ids=None,
            attention_mask=mask,
            labels=labels,
        )
        running_loss += output.loss.item()

        output.loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_logits.append(output.logits.detach().cpu().numpy())

    avg_loss = running_loss / n_batches
    total_preds = np.concatenate(batch_logits, axis=0)
    return avg_loss, total_preds

In [None]:
# One evaluation pass over the validation DataLoader.
def evaluate():
    """Evaluate the model on ``val_dataloader`` without gradient tracking.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the summed
        batch loss divided by the number of batches and ``total_preds`` is the
        per-batch logits concatenated along axis 0 as a NumPy array.
    """
    model.eval()
    running_loss = 0
    batch_logits = []
    n_batches = len(val_dataloader)

    for step, batch in enumerate(val_dataloader):
        # Progress update every 100 batches (never for the first batch).
        if step != 0 and step % 100 == 0:
            print('Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        with torch.no_grad():
            output = model(
                sent_id,
                token_type_ids=None,
                attention_mask=mask,
                labels=labels,
            )
            running_loss = running_loss + output.loss.item()
            batch_logits.append(output.logits.detach().cpu().numpy())

    avg_loss = running_loss / n_batches
    total_preds = np.concatenate(batch_logits, axis=0)
    return avg_loss, total_preds

# Model Training

In [None]:
# Best validation loss seen so far; starts at infinity so epoch 1 always wins.
best_valid_loss = float('inf')
# Per-epoch loss history for training and validation.
train_losses, valid_losses = [], []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, _ = train()     # one pass over the training data
    valid_loss, _ = evaluate()  # one pass over the validation data

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch   100  of    730.
  Batch   200  of    730.
  Batch   300  of    730.
  Batch   400  of    730.
  Batch   500  of    730.
  Batch   600  of    730.
  Batch   700  of    730.
Batch   100  of    129.

Training Loss: 0.658
Validation Loss: 0.529

 Epoch 2 / 10
  Batch   100  of    730.
  Batch   200  of    730.
  Batch   300  of    730.
  Batch   400  of    730.
  Batch   500  of    730.
  Batch   600  of    730.
  Batch   700  of    730.
Batch   100  of    129.

Training Loss: 0.481
Validation Loss: 0.514

 Epoch 3 / 10
  Batch   100  of    730.
  Batch   200  of    730.
  Batch   300  of    730.
  Batch   400  of    730.
  Batch   500  of    730.
  Batch   600  of    730.
  Batch   700  of    730.
Batch   100  of    129.

Training Loss: 0.394
Validation Loss: 0.585

 Epoch 4 / 10
  Batch   100  of    730.
  Batch   200  of    730.
  Batch   300  of    730.
  Batch   400  of    730.
  Batch   500  of    730.
  Batch   600  of    730.
  Batch   700  of    730.
Batch

In [None]:
# Load the weights of the best checkpoint saved during training.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime can also be restored on a CPU runtime.
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

# Predictions for Test Data

In [None]:
# Read the test CSV and keep only the text and sentiment columns.
test_df = pd.read_csv('/content/drive/MyDrive/archive/test.csv',encoding='iso-8859-1')[['text', 'sentiment']]

# Drop rows with missing text or label BEFORE cleaning. Converting NaN with
# str() would turn it into the literal text "nan", and filling missing labels
# with 0 would score those rows as "neutral", distorting the metrics.
test_df = (
    test_df
    .dropna(subset=['text', 'sentiment'])
    .assign(
        text=lambda frame: frame['text'].astype(str).map(preprocessing),
        # Same class ids as the training labels; an unknown label becomes NaN
        # and makes the integer cast below fail loudly.
        sentiment=lambda frame: frame['sentiment'].map(
            {"neutral":0,"positive":1,"negative":2}
        ),
    )
    .reset_index(drop=True)
)
test_text = test_df['text']
test_labels = test_df['sentiment'].astype(np.int64)

# Tokenize with the same max length used for the training/validation data so
# the test inputs are padded/truncated consistently.
tokens_test = tokenizer(
    test_text.tolist(),
    max_length = max_seq_len,
    padding="max_length",
    truncation=True,
)
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist(), dtype=torch.long)

# Predict in mini-batches (stored order, no shuffling) rather than pushing the
# whole test set through the model in a single forward pass.
test_loader = DataLoader(TensorDataset(test_seq, test_mask), batch_size=batch_size)
model.eval()  # make sure dropout is disabled for inference
logit_chunks = []
with torch.no_grad():
    for seq_batch, mask_batch in test_loader:
        op = model(seq_batch.to(device),
                   token_type_ids=None,
                   attention_mask=mask_batch.to(device))
        logit_chunks.append(op.logits.detach().cpu())
logits = torch.cat(logit_chunks, dim=0)

# Predicted class id = index of the largest logit in each row.
preds = np.argmax(logits.numpy(), axis = 1)



In [None]:
# model's performance
# Display names must follow the label encoding used in this notebook:
# 0 = neutral, 1 = positive, 2 = negative. Passing `labels` pins that order
# explicitly so each name is attached to the intended class id.
target_names = ['neutral', 'positive', 'negative']
print(classification_report(test_y, preds, labels=[0, 1, 2], target_names=target_names))

              precision    recall  f1-score   support

    positive       0.85      0.89      0.87      2711
    negative       0.86      0.80      0.83      1103
     neutral       0.80      0.76      0.78      1001

    accuracy                           0.84      4815
   macro avg       0.84      0.82      0.83      4815
weighted avg       0.84      0.84      0.84      4815

