# Test the model here: [EmotiNet](https://huggingface.co/Priyanshuchaudhary2425/EmotiNet)

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset archive stored there can be read
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Extract the archive and create the data path

In [None]:
from pathlib import Path
from zipfile import ZipFile

# Archive on the mounted Drive that holds the raw dataset
zip_file_path = "/content/drive/MyDrive/nlp-emotion.zip"

# Extraction target, relative to the current working directory
data_path = Path("data")

# Only skip when the directory actually contains something. Checking bare
# existence would also skip after a failed earlier run that left an empty
# directory behind, and the later read of the CSV would then fail.
if data_path.is_dir() and any(data_path.iterdir()):
  print(f"{data_path} already exist hence skipping!!")

else:
  # Open the archive first: if the zip is missing (e.g. Drive not mounted) this
  # raises before the directory is created, so a re-run will retry extraction.
  with ZipFile(zip_file_path, "r") as file:
    data_path.mkdir(exist_ok=True, parents=True)
    file.extractall(data_path)


## 2. Check what's inside the data

In [None]:
import pandas as pd

# Load the raw CSV that was extracted into the data directory. The output below
# shows an extra "Unnamed: 0" column whose values mirror the row index
# (presumably a previously exported index); it is dropped in the next cell.
df = pd.read_csv("/content/data/text.csv")
df

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...,...
416804,416804,i feel like telling these horny devils to find...,2
416805,416805,i began to realize that when i was feeling agi...,3
416806,416806,i feel very curious be why previous early dawn...,5
416807,416807,i feel that becuase of the tyranical nature of...,3


In [None]:
# Drop the "Unnamed: 0" column from the DataFrame df.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the column is gone is a no-op rather
# than a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display the frame again to confirm only the text and label columns remain
df

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...
416804,i feel like telling these horny devils to find...,2
416805,i began to realize that when i was feeling agi...,3
416806,i feel very curious be why previous early dawn...,5
416807,i feel that becuase of the tyranical nature of...,3


In [None]:
# Report, per column, whether any value is missing
df.isnull().any()

text     False
label    False
dtype: bool

In [None]:
# Count the rows per label id to see how balanced the classes are
df.label.value_counts()

1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Define the size of the training and testing sets (absolute row counts)
train_size = 12_000
test_size = 2400

# Split the data into training and testing sets.
# The label counts are imbalanced, so stratify on the label column to keep
# every class at the same proportion in both subsamples.
train_data, test_data = train_test_split(
    df,
    test_size=test_size,
    train_size=train_size,
    random_state=42,  # fixed seed so the subsample is reproducible
    stratify=df["label"],
)


In [None]:
# Label distribution of the training subsample
train_data.label.value_counts()

1    4140
0    3391
3    1696
4    1376
2     985
5     412
Name: label, dtype: int64

In [None]:
# Label distribution of the testing subsample
test_data.label.value_counts()

1    810
0    679
3    325
4    283
2    219
5     84
Name: label, dtype: int64

In [None]:
# Export both splits to CSV so they can be loaded easily through the
# Hugging Face `load_dataset` function. The index is not written out.
for split_frame, csv_path in (
    (train_data, "/content/data/train.csv"),
    (test_data, "/content/data/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Last check: read the exported training split back to confirm it was written as expected
pd.read_csv("/content/data/train.csv")

Unnamed: 0,text,label
0,i am no longer red it feels weird,5
1,i feel also should be so irritated and annoyed...,3
2,i feel deeply saddened to see kenyans confront...,1
3,i mean mind you two whole rooms have been stri...,0
4,i had to go to paddington station to meet my w...,0
...,...,...
11995,i listen her songs i feel my splendid memories,1
11996,i was feeling pretty stressed out by the time ...,3
11997,i feel cranky and tired,3
11998,i do feel wonderfully accepted whenever i m in...,2


In [None]:
# Emotion names, ordered so that the list position equals the integer label id
class_list = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# Lookup from integer label id to emotion name, built from class_list so the
# two definitions cannot drift apart
class_dict = dict(enumerate(class_list))


## 3. Load the dataset using Huggingface datasets

In [None]:
# !pip install datasets

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [None]:
from datasets import load_dataset

# Map each split name to the CSV file exported earlier
train_test_data = dict(
    train="/content/data/train.csv",
    test="/content/data/test.csv",
)

# Build the train/test dataset collection with the generic "csv" loader
raw_datasets = load_dataset("csv", data_files=train_test_data)
raw_datasets

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2400
    })
})

In [None]:
# Inspect a single training example (a dict with 'text' and 'label')
raw_datasets["train"][100]

{'text': 'i will never do anything to physically harm another person but i feel my complaint has been completely ignored',
 'label': 0}

## 4. Processing the data

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and, later, the classification model.
# NOTE(review): the sample texts shown above are all lowercase while this is a
# cased checkpoint — consider whether an uncased variant fits better; confirm.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Show the loaded tokenizer's configuration (vocab size, max length, special tokens)
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Tokenize the first training sentence and list the resulting tokens,
# including the [CLS] / [SEP] markers the tokenizer adds
inputs = tokenizer(raw_datasets["train"][0]["text"])
inputs.tokens()

['[CLS]', 'i', 'am', 'no', 'longer', 'red', 'it', 'feels', 'weird', '[SEP]']

In [None]:
# Re-display the dataset splits and their columns before tokenizing them
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2400
    })
})

In [None]:
def tokenize_data(examples):
    """Tokenize a batch of examples and carry their labels along.

    Args:
        examples: Batch mapping with a "text" entry (the raw sentences) and a
            "label" entry (the matching class ids).

    Returns:
        Dict with "input_ids", "attention_mask" and "labels" for the batch.
    """
    # Truncate only; do NOT pad here. With a batched map, padding=True would pad
    # every sequence to the longest one in its map batch and store those pad
    # tokens in the dataset. Padding is left to DataCollatorWithPadding, which
    # pads each training batch to its own longest sequence.
    tokenized_inputs = tokenizer(examples["text"], truncation=True)

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        # Renamed from "label" to "labels" on the way out
        "labels": examples["label"],
    }

# Tokenize both splits in batches and drop the original columns, so only the
# keys returned by tokenize_data remain in the result
tokenized_datasets = raw_datasets.map(
    tokenize_data,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

## 5. Data collation

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Smoke-test the collator on the last 200 training examples.
# The range starts at 1 because index -0 is just index 0: starting at 0 would
# pull in the FIRST example followed by only the last 199.
batch = data_collator([tokenized_datasets["train"][-i] for i in range(1, 201)])
batch["labels"]

tensor([5, 3, 2, 3, 3, 1, 0, 1, 1, 1, 0, 0, 2, 3, 1, 1, 1, 1, 1, 1, 3, 3, 4, 0,
        1, 1, 1, 4, 0, 1, 3, 0, 1, 0, 3, 0, 4, 3, 1, 2, 4, 2, 2, 1, 4, 1, 1, 3,
        2, 4, 3, 1, 4, 1, 1, 0, 4, 0, 3, 0, 0, 1, 1, 2, 1, 1, 1, 0, 2, 1, 3, 1,
        0, 1, 1, 1, 1, 0, 1, 2, 1, 0, 0, 3, 0, 3, 0, 1, 1, 2, 0, 0, 0, 5, 1, 5,
        1, 0, 0, 3, 0, 0, 1, 0, 0, 3, 1, 3, 5, 3, 1, 0, 0, 0, 0, 3, 3, 0, 0, 0,
        0, 3, 1, 1, 1, 1, 4, 1, 3, 2, 1, 0, 2, 3, 0, 0, 1, 1, 5, 0, 1, 2, 2, 1,
        1, 0, 1, 0, 3, 3, 0, 0, 3, 1, 4, 4, 1, 0, 0, 1, 4, 4, 1, 2, 0, 2, 2, 1,
        0, 4, 1, 1, 1, 1, 0, 3, 0, 1, 4, 4, 1, 3, 0, 0, 1, 1, 1, 3, 0, 1, 1, 1,
        1, 4, 1, 2, 1, 1, 0, 3])

## 6. Metrics

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(eval_preds):
    """Score one evaluation pass.

    Args:
        eval_preds: Pair that unpacks into ``(logits, labels)``. The predicted
            class of each row is the argmax of the logits over the last axis.

    Returns:
        Dict with "accuracy" plus macro-averaged "precision", "recall" and "f1".
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Translate integer ids into emotion names through the notebook-level class_list
    y_true = [class_list[label_id] for label_id in labels]
    y_pred = [class_list[predicted_id] for predicted_id in predicted_ids]

    # Accuracy first, then the three macro-averaged scores in a fixed order
    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in macro_scorers:
        metrics[metric_name] = scorer(y_true, y_pred, average='macro')

    return metrics


## 7. Defining the model

In [None]:
# Re-display the emotion names that the label mappings below are built from
class_list

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [None]:
# Re-display the id -> emotion lookup for comparison with id2label below
class_dict

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

In [None]:
# Label mappings handed to the model: id -> name comes straight from the
# position in class_list, and name -> id is its inverse
id2label = dict(enumerate(class_list))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Check the id -> emotion mapping
id2label

{0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}

In [None]:
# Check the emotion -> id mapping
label2id

{'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head and attach
# the label mappings. The warning below reports that the classifier weights are
# newly initialized, i.e. the head still has to be trained.
# NOTE(review): presumably the number of output classes is derived from
# id2label — confirm, or pass num_labels explicitly.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 8. Fine-tuning the model

In [None]:
# Heads-up: if the upgrade is needed, run it here; otherwise you will have to restart the notebook
# !pip install transformers -U

In [None]:
# !pip install accelerate -U

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training arguments below set push_to_hub=True
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments

# Training configuration: evaluate and checkpoint once per epoch, and push the
# results to the Hub.
# NOTE(review): newer transformers releases may expect `eval_strategy` in place
# of `evaluation_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="EmotiNet",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # evaluation / checkpoint cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # publishing
    push_to_hub=True,
)

In [None]:
from transformers import Trainer

# Assemble the Trainer from the model, the training arguments, the tokenized
# splits, the padding collator and the metric function defined above
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # the test split is what gets scored during evaluation
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Run fine-tuning; the table below reports loss and metrics per epoch
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1494,0.30357,0.92375,0.878347,0.916626,0.892122
2,0.1481,0.294424,0.924167,0.889259,0.886747,0.887684
3,0.072,0.322397,0.924167,0.882971,0.898994,0.890185


TrainOutput(global_step=4500, training_loss=0.13216405402289497, metrics={'train_runtime': 744.6124, 'train_samples_per_second': 48.347, 'train_steps_per_second': 6.043, 'total_flos': 1634878700484192.0, 'train_loss': 0.13216405402289497, 'epoch': 3.0})

In [None]:
# Push the trained model to the Hugging Face Hub with a closing commit message
trainer.push_to_hub(commit_message="Training complete")

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1708865067.fd5a8dd48fd7.312.2:   0%|          | 0.00/8.49k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Priyanshuchaudhary2425/EmotiNet/commit/a81b6873538e820b15761f048f9db9bb29b8e3c1', commit_message='Training complete', commit_description='', oid='a81b6873538e820b15761f048f9db9bb29b8e3c1', pr_url=None, pr_revision=None, pr_num=None)