In [None]:
!pip install -qU torch transformers datasets tqdm sentencepiece accelerate>=0.20.1 evaluate

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer,AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig
from datasets import load_metric
from tqdm import tqdm
from torch.utils.data import Dataset
import numpy as np
import torch.nn as nn

In [None]:
# Mount Google Drive at /content/drive; the fine-tuned model is saved under
# this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Candidate Hugging Face checkpoints; later cells pick one by index.
models = [
    "nlptown/bert-base-multilingual-uncased-sentiment",
    "distilbert-base-uncased",
    "Seethal/sentiment_analysis_generic_dataset",
    "microsoft/deberta-large-mnli",
]

In [None]:
# The tokenizer comes from the base DistilBERT checkpoint, while the weights
# come from the third entry of `models`.
# NOTE(review): the two are loaded from different repositories -- confirm they
# share the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(models[2])

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/769 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Inspect the pretrained classification head before it is replaced.
model.classifier

Linear(in_features=768, out_features=3, bias=True)

In [None]:
# Emotion classes; a label's position in the list is its class id.
id2label = dict(enumerate(['fun', 'hate', 'love', 'neutral', 'sadness', 'worry']))
# Inverse mapping and class count are derived so the three cannot drift apart.
label2id = {label: idx for idx, label in id2label.items()}
num_labels = len(id2label)

In [None]:
# Swap the pretrained head for a fresh one sized for the emotion labels.
# Assign a bare nn.Linear rather than an nn.Sequential wrapper: the wrapper
# renames the parameters to `classifier.0.weight` / `classifier.0.bias`, which
# from_pretrained() does not match against `classifier.weight` /
# `classifier.bias`, so a saved checkpoint would reload with a randomly
# initialised head.
model.classifier = nn.Linear(model.config.hidden_size, num_labels)
# Keep the model's own label count in step with the config.
model.num_labels = num_labels
model.config.num_labels = num_labels
# Set the label maps after num_labels so they are the values that end up in the config.
model.config.id2label = id2label
model.config.label2id = label2id
# Align the embedding matrix with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [None]:
# Confirm the replacement head has one output per emotion label.
model.classifier

Sequential(
  (0): Linear(in_features=768, out_features=6, bias=True)
)

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset over pre-tokenized examples and their labels.

    The three constructor arguments are parallel, index-aligned sequences.
    """

    def __init__(self, input_ids, attention_masks, y_labels):
        """Keep references to the token ids, attention masks and labels."""
        super().__init__()
        self.input_ids, self.attention_masks, self.y_labels = (
            input_ids,
            attention_masks,
            y_labels,
        )

    def __len__(self):
        """Return the number of examples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return example ``index`` as a dict of model inputs plus its label.

        The label is wrapped in a one-element ``torch.long`` tensor.
        """
        return {
            'input_ids': self.input_ids[index],
            'attention_mask': self.attention_masks[index],
            'labels': torch.tensor([self.y_labels[index]], dtype=torch.long),
        }

In [None]:
!wget -O dataset.pt "https://drive.google.com/uc?export=download&id=16XKkPLD7_dakStvcggov7_ASU1oievZR&confirm=NEW_FILE_CONFIRM_CODE"
path = '/content/dataset.pt'
dataset = torch.load(path)

--2023-07-12 21:16:53--  https://drive.google.com/uc?export=download&id=16XKkPLD7_dakStvcggov7_ASU1oievZR&confirm=NEW_FILE_CONFIRM_CODE
Resolving drive.google.com (drive.google.com)... 74.125.200.100, 74.125.200.113, 74.125.200.102, ...
Connecting to drive.google.com (drive.google.com)|74.125.200.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-08-c8-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/t3r9ag90njmaassja75gv1op7542f3i8/1689196575000/12434615743218103933/*/16XKkPLD7_dakStvcggov7_ASU1oievZR?e=download&uuid=9a4d061f-2d65-45cf-a0d9-e6f8110bb648 [following]
--2023-07-12 21:17:02--  https://doc-08-c8-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/t3r9ag90njmaassja75gv1op7542f3i8/1689196575000/12434615743218103933/*/16XKkPLD7_dakStvcggov7_ASU1oievZR?e=download&uuid=9a4d061f-2d65-45cf-a0d9-e6f8110bb648
Resolving doc-08-c8-docs.googleusercontent.com (doc-08-c8-docs.googleusercontent

Let's see how the model works and what its output looks like.

In [None]:
# Take the first example once and wrap its inputs in an outer list so each
# tensor gains a leading batch dimension of size 1.
first_example = dataset[0]
demo_input_id = torch.tensor([first_example['input_ids'].tolist()])
demo_attention_mask = torch.tensor([first_example['attention_mask'].tolist()])
expected_output = first_example['labels']

In [None]:
# Forward pass on the single demo example; `.logits` holds the raw,
# un-normalised score for each class.
output = model(input_ids=demo_input_id, attention_mask=demo_attention_mask).logits
output

tensor([[ 0.0244,  0.0287, -0.3931, -0.0682,  0.3070, -0.2140]],
       grad_fn=<AddmmBackward0>)

In [None]:
# Cross-entropy between the demo logits and the example's true label.
loss_function = nn.CrossEntropyLoss()
loss_function(output, expected_output)

tensor(1.7342, grad_fn=<NllLossBackward0>)

Training model

In [None]:
# Hold out 20% of the examples for validation.
train_size = int(0.8 * len(dataset))

# Seed the split so the train/validation partition is identical on every run;
# unseeded, validation accuracy is not comparable between runs.
train_data, val_data = random_split(
    dataset,
    [train_size, len(dataset) - train_size],
    generator=torch.Generator().manual_seed(42),
)

# Optimisation hyperparameters.
learning_rate = 1e-5
num_epochs = 10
batch_size = 100
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_data, shuffle=False, batch_size=batch_size)

In [None]:
# Use the GPU when one is present; fall back to the CPU so this cell does not
# raise on a CPU-only runtime.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
def validation_accuracy(eval_model=None, loader=None):
    """Return the fraction of examples the model classifies correctly.

    Args:
        eval_model: Model to evaluate. Defaults to the notebook-level ``model``.
        loader: Iterable of batches, each a dict with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors. Defaults to the
            notebook-level ``val_loader``.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when the loader yields no
        examples (instead of raising ``ZeroDivisionError``).

    Side effects:
        Puts the model in eval mode and leaves it there.
    """
    net = model if eval_model is None else eval_model
    batches = val_loader if loader is None else loader
    # Follow the device the model already lives on instead of assuming CUDA.
    device = next(net.parameters()).device

    net.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in batches:
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            # Flatten labels to one entry per example; unlike squeeze(), this
            # keeps the batch axis when a batch holds a single example.
            batch_labels = batch["labels"].to(device).reshape(-1)

            outputs = net(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            predicted_labels = torch.argmax(outputs.logits, dim=1)

            # Count the correct predictions
            correct_predictions += (predicted_labels == batch_labels).sum().item()
            total_predictions += batch_labels.size(0)

    if total_predictions == 0:
        return 0.0
    return correct_predictions / total_predictions

In [None]:
# Fine-tune for `num_epochs`, printing the mean training loss and the
# validation accuracy after every epoch.
# Batches follow the device the model already lives on instead of assuming CUDA.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    # validation_accuracy() switches the model to eval mode, so training mode
    # has to be re-enabled at the start of every epoch.
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask).logits
        # reshape(-1) flattens the labels to one entry per example. squeeze()
        # would also drop the batch axis when a batch holds a single example,
        # giving a 0-d target that CrossEntropyLoss rejects.
        loss = loss_function(outputs, batch_labels.reshape(-1))
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    # This value is an accuracy, not a loss, so it is named accordingly.
    val_accuracy = validation_accuracy()
    print(f"Epoch {epoch+1}: Average training Loss = {avg_loss} Average validation accuracy = {val_accuracy}")

Epoch 1: Average training Loss = 1.7702653259038925 Average validation accuracy = 0.23
Epoch 2: Average training Loss = 1.7075400650501251 Average validation accuracy = 0.215
Epoch 3: Average training Loss = 1.6691537946462631 Average validation accuracy = 0.255
Epoch 4: Average training Loss = 1.6353924125432968 Average validation accuracy = 0.24
Epoch 5: Average training Loss = 1.6086417436599731 Average validation accuracy = 0.245
Epoch 6: Average training Loss = 1.5637040734291077 Average validation accuracy = 0.255
Epoch 7: Average training Loss = 1.5345715284347534 Average validation accuracy = 0.265
Epoch 8: Average training Loss = 1.4984909743070602 Average validation accuracy = 0.255
Epoch 9: Average training Loss = 1.4603412300348282 Average validation accuracy = 0.28
Epoch 10: Average training Loss = 1.4270732253789902 Average validation accuracy = 0.28


In [None]:
from transformers import pipeline

# Run a quick sanity check on the CPU with the freshly fine-tuned model.
model.to('cpu')
text = "I hate you, you are disgusting"
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)
# Show the three highest-scoring labels.
classifier(text, top_k=3)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[{'label': 'hate', 'score': 0.4765914976596832},
 {'label': 'sadness', 'score': 0.2921445965766907},
 {'label': 'worry', 'score': 0.10015334188938141}]

In [None]:
# Persist the fine-tuned model and the tokenizer side by side on Drive.
# NOTE: this rebinds `path`, which earlier held the dataset file location.
path = '/content/drive/MyDrive/TweetData/SecondTryModel'
model.save_pretrained(path)
tokenizer.save_pretrained(path)

In [None]:
# Reload the saved model and tokenizer from Drive to check the round trip.
# Read the load report printed by from_pretrained(): if it lists classifier
# weights as "not used" or "newly initialized", the trained head was not
# restored and predictions will come from a randomly initialised head.
new_path = '/content/drive/MyDrive/TweetData/SecondTryModel'
model2 = AutoModelForSequenceClassification.from_pretrained(new_path)
tokenizer2 = AutoTokenizer.from_pretrained(new_path)

Some weights of the model checkpoint at /content/drive/MyDrive/TweetData/SecondTryModel were not used when initializing DistilBertForSequenceClassification: ['classifier.0.bias', 'classifier.0.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/TweetData/SecondTryModel and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to us

In [None]:
from transformers import pipeline

# Repeat the sanity check with the reloaded model; the scores should match
# those of the in-memory model if the checkpoint was restored completely.
model2.to('cpu')
text = "I hate you, you are disgusting"
classifier = pipeline(task="sentiment-analysis", model=model2, tokenizer=tokenizer2)
# Show the three highest-scoring labels.
classifier(text, top_k=3)

[{'label': 'love', 'score': 0.22705306112766266},
 {'label': 'neutral', 'score': 0.19721852242946625},
 {'label': 'sadness', 'score': 0.1901300549507141}]