In [1]:
!pip install transformers -q

In [2]:
import numpy as np
import pandas as pd
import transformers
import torch
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch import nn
import warnings
warnings.filterwarnings("ignore")

In [3]:
from transformers import BertModel, BertTokenizer

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
bert = BertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#### Loading the dataset and pre-processing

In [6]:
data = pd.read_csv('/kaggle/input/emotion-dataset/training.csv')

In [7]:
data

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [8]:
data.columns

Index(['text', 'label'], dtype='object')

In [9]:
data.isnull().sum()

text     0
label    0
dtype: int64

In [10]:
data.shape

(16000, 2)

In [11]:
def NLP_cleaning(text):
    text_corpus = []
    for sent in tqdm(text, desc='Cleaning'):
        sent = re.sub('<[^>]*>', '', sent)
        sent = re.sub('[^a-zA-z0-9]', ' ', sent)
        sent = sent.lower()
        text_corpus.append(sent)

    return text_corpus

In [12]:
text = data.text.values.tolist()

In [13]:
text[0]

'i didnt feel humiliated'

In [14]:
text_corpus = NLP_cleaning(text)

Cleaning: 100%|██████████| 16000/16000 [00:00<00:00, 81841.87it/s]


In [15]:
text_corpus[0]

'i didnt feel humiliated'

In [16]:
data['text'] = text_corpus

In [17]:
data

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [18]:
val_df = pd.read_csv('/kaggle/input/emotion-dataset/test.csv')
val_df

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0
...,...,...
1995,i just keep feeling like someone is being unki...,3
1996,im feeling a little cranky negative after this...,3
1997,i feel that i am useful to my people and that ...,1
1998,im feeling more comfortable with derby i feel ...,1


In [19]:
val_df['text'] = NLP_cleaning(val_df.text.values.tolist())
val_df

Cleaning: 100%|██████████| 2000/2000 [00:00<00:00, 79070.68it/s]


Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0
...,...,...
1995,i just keep feeling like someone is being unki...,3
1996,im feeling a little cranky negative after this...,3
1997,i feel that i am useful to my people and that ...,1
1998,im feeling more comfortable with derby i feel ...,1


In [20]:
train_text, val_text, train_labels, val_labels = train_test_split(data.drop(['label'], axis = 1), data['label'],
                                                                    random_state=2018,
                                                                    test_size=0.2,
                                                                    stratify=data['label'])


test_text = val_df.drop(['label'], axis = 1)
test_labels = val_df['label']

In [21]:
test_text

Unnamed: 0,text
0,im feeling rather rotten so im not very ambiti...
1,im updating my blog because i feel shitty
2,i never make her separate from me because i do...
3,i left with my bouquet of red and yellow tulip...
4,i was feeling a little vain when i did this one
...,...
1995,i just keep feeling like someone is being unki...
1996,im feeling a little cranky negative after this...
1997,i feel that i am useful to my people and that ...
1998,im feeling more comfortable with derby i feel ...


In [22]:
train_text

Unnamed: 0,text
12102,i feel damn agitated during the speech
3382,i feel totally lame but i have no idea what to...
6383,i suppose it s partly my fault for forgetting ...
155,i could feel her whimper to the thought of bei...
15297,i was feeling pretty low and despite it being ...
...,...
15966,i still feel devastated and disconsolate
9347,i do however feel a bit envious of people who ...
6773,i feel angry at him for being so selfish and g...
7756,i feel passionate that students should have ch...


In [23]:
text = ["this is a bert model tutorial", "we will fine-tune a bert model"]
sent_id = tokenizer.batch_encode_plus(text, padding=True)
print(sent_id)

{'input_ids': [[101, 2023, 2003, 1037, 14324, 2944, 14924, 4818, 102, 0], [101, 2057, 2097, 2986, 1011, 8694, 1037, 14324, 2944, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [24]:
tokens_train = tokenizer.batch_encode_plus(
    train_text['text'].tolist(),
    max_length = 512,
    pad_to_max_length=True,
    truncation=True
)

tokens_val = tokenizer.batch_encode_plus(
    val_text['text'].tolist(),
    max_length = 512,
    pad_to_max_length=True,
    truncation=True
)

tokens_test = tokenizer.batch_encode_plus(
    test_text['text'].tolist(),
    max_length = 256,
    pad_to_max_length=True,
    truncation=True
)

In [25]:
len(pd.unique(data['label']))

6

In [26]:
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [27]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [28]:
for param in bert.parameters():
    param.requires_grad = False

In [29]:
import torch
import torch.nn as nn
from transformers import BertModel

class BERT_Arch(nn.Module):

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 6)  # Adjust the number of output neurons for 123 classes
        self.log_softmax = nn.LogSoftmax(dim=1)  # Use dim=1 for LogSoftmax in classification

    def forward(self, sent_id, mask):
        sent_id = torch.tensor(sent_id)

        outputs = self.bert(input_ids=sent_id, attention_mask=mask)
        last_hidden_state_cls = outputs[0][:, 0, :]

        x = self.fc1(last_hidden_state_cls)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.log_softmax(x)

        return x

In [30]:
model = BERT_Arch(bert)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [31]:
device

'cuda'

In [32]:
from transformers import AdamW

optimizer = AdamW(model.parameters(),
                  lr = 1e-5)

In [33]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_labels)
print("Class Weights:", class_weights)

Class Weights: [0.5714796  0.4972805  2.04538191 1.23528276 1.37634409 4.66812546]


In [34]:
weights= torch.tensor(class_weights,dtype=torch.float)
weights = weights.to(device)
cross_entropy  = nn.NLLLoss(weight=weights)

epochs = 10

In [35]:
def train():

    model.train()
    total_loss, total_accuracy = 0, 0
    total_preds=[]

    for step,batch in enumerate(train_dataloader):

        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        batch = [r.to(device) for r in batch]

        sent_id, mask, labels = batch
        sent_id = torch.tensor(sent_id)

        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        preds=preds.detach().cpu().numpy()
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [36]:
import time
def evaluate():

    print("\nEvaluating...")
    model.eval()
    total_loss, total_accuracy = 0, 0
    total_preds = []

    for step,batch in enumerate(val_dataloader):

        if step % 50 == 0 and not step == 0:
#             elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch
    sent_id = torch.tensor(sent_id)

    with torch.no_grad():

        preds = model(sent_id, mask)
        loss = cross_entropy(preds,labels)
        total_loss = total_loss + loss.item()
        preds = preds.detach().cpu().numpy()
        total_preds.append(preds)

    avg_loss = total_loss / len(val_dataloader)

    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [37]:
best_valid_loss = float('inf')

train_losses=[]
valid_losses=[]

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, _ = train()
    valid_loss, _ = evaluate()
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch    50  of    800.
  Batch   100  of    800.
  Batch   150  of    800.
  Batch   200  of    800.
  Batch   250  of    800.
  Batch   300  of    800.
  Batch   350  of    800.
  Batch   400  of    800.
  Batch   450  of    800.
  Batch   500  of    800.
  Batch   550  of    800.
  Batch   600  of    800.
  Batch   650  of    800.
  Batch   700  of    800.
  Batch   750  of    800.

Evaluating...
  Batch    50  of    200.
  Batch   100  of    200.
  Batch   150  of    200.

Training Loss: 0.890
Validation Loss: 0.000

 Epoch 2 / 10
  Batch    50  of    800.
  Batch   100  of    800.
  Batch   150  of    800.
  Batch   200  of    800.
  Batch   250  of    800.
  Batch   300  of    800.
  Batch   350  of    800.
  Batch   400  of    800.
  Batch   450  of    800.
  Batch   500  of    800.
  Batch   550  of    800.
  Batch   600  of    800.
  Batch   650  of    800.
  Batch   700  of    800.
  Batch   750  of    800.

Evaluating...
  Batch    50  of    200.
  Batch   1

In [40]:
print("Hello")

Hello


In [41]:
checkpoint = {'model': model,
              'state_dict': model.state_dict(),
              'optimizer' : optimizer.state_dict()}

torch.save(checkpoint, 'checkpoint.pth')

In [42]:
input_text = input('Give an input text : ')
tokens = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

Give an input text :  i am so happy


In [43]:
encoded_review = tokenizer.encode_plus(
      input_text,
      max_length=256,
      add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)
output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [44]:
prediction

tensor([1], device='cuda:0')

In [45]:
extracted_value = prediction.item()
print(extracted_value)

1


In [46]:
def Predict(text):
    encoded_review = tokenizer.encode_plus(
      text,
      max_length=256,
      add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)
    extracted_value = prediction.item()
    return extracted_value

In [47]:
test_labels

0       0
1       0
2       0
3       1
4       0
       ..
1995    3
1996    3
1997    1
1998    1
1999    4
Name: label, Length: 2000, dtype: int64

In [48]:
test_text

Unnamed: 0,text
0,im feeling rather rotten so im not very ambiti...
1,im updating my blog because i feel shitty
2,i never make her separate from me because i do...
3,i left with my bouquet of red and yellow tulip...
4,i was feeling a little vain when i did this one
...,...
1995,i just keep feeling like someone is being unki...
1996,im feeling a little cranky negative after this...
1997,i feel that i am useful to my people and that ...
1998,im feeling more comfortable with derby i feel ...


In [49]:
y_pred = []
for text in test_text['text']:
    val = Predict(text)
    y_pred.append(val)

In [50]:
len(y_pred)

2000

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef
import numpy as np

true_labels = test_labels
predicted_labels = y_pred

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9860


In [72]:
def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False
    model.eval()
    return model

In [73]:
model = load_checkpoint('checkpoint.pth')
print(model)

BERT_Arch(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=