2501985451 - Jason Adriel

#0. Prerequisites

In [30]:
# Library Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

# Superglobals
GLOBAL_SEED = 2307
BATCH_SIZE = 128
LR = 1e-5

# Seeding
keras.utils.set_random_seed(GLOBAL_SEED)

# Setup
sns.set_style('whitegrid')

In [31]:
!pip install contractions
!pip install transformers[torch]
!pip install datasets
!pip install evaluate



In [32]:
!wget "https://media.githubusercontent.com/media/subtle64/Datasets/main/Others/Emotion.csv" -O "Emotion.csv"

--2024-01-23 03:14:15--  https://media.githubusercontent.com/media/subtle64/Datasets/main/Others/Emotion.csv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2069627 (2.0M) [text/plain]
Saving to: ‘Emotion.csv’


2024-01-23 03:14:15 (52.6 MB/s) - ‘Emotion.csv’ saved [2069627/2069627]



#1. Dataset Preprocessing

In [33]:
df = pd.read_csv("Emotion.csv")

In [34]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20000 non-null  object
 1   label   20000 non-null  object
dtypes: object(2)
memory usage: 312.6+ KB


In [36]:
df.isna().sum()

text     0
label    0
dtype: int64

In [37]:
df.duplicated().sum()

1

In [38]:
df = df.drop_duplicates()

In [39]:
df['label'].value_counts()

joy         6760
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
Name: label, dtype: int64

In [40]:
import string
import nltk
import re
import contractions
from tqdm import tqdm
from transformers import AutoTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
nltk.download('wordnet')

# Split
def split(text):
  words = re.split('W+', text)
  return words

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-base-en")
def tokenize(text):
  return tokenizer(text, padding = True)

# Fixing contractions
def expand_contractions(words):
  return [contractions.fix(w) for w in words]

# Removing punctuations
def remove_noise(words):
  noise = set(string.punctuation + string.digits)
  return [w for w in words if not w in noise]

# Removing stopwords
def remove_stopwords(words):
  sw = stopwords.words('english')
  return [w for w in words if not w in sw]

# Lemmatization
def lemmatize(words):
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(w) for w in words]

# Label encoding
def label_encode(y):
  enc = LabelEncoder()
  return enc.fit_transform(y), enc.classes_

# Remove nested array
def remove_nested(x):
  x['input_ids'] = x['input_ids'][0]
  x['token_type_ids'] = x['token_type_ids'][0]
  x['attention_mask'] = x['attention_mask'][0]
  return x


# Pipeline
def preprocess(df, x = 'text', y = 'label'):
  df = df.copy()
  tqdm.pandas()
  df[x] = df[x].progress_apply(split)
  df[x] = df[x].progress_apply(expand_contractions)
  df[x] = df[x].progress_apply(remove_noise)
  df[x] = df[x].progress_apply(remove_stopwords)
  df[x] = df[x].progress_apply(lemmatize)
  df[x] = df[x].progress_apply(tokenize)
  df = df[df[x] != ""]

  tokens = df[x].apply(pd.Series)
  df = pd.concat([df.drop([x], axis=1), tokens], axis=1)

  df = df.apply(remove_nested, axis = 1)

  df[y], classes = label_encode(df[y])

  return df, classes

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [41]:
df_clean, classes = preprocess(df)

100%|██████████| 19999/19999 [00:00<00:00, 57871.97it/s]
100%|██████████| 19999/19999 [00:00<00:00, 25464.54it/s]
100%|██████████| 19999/19999 [00:00<00:00, 165776.77it/s]
100%|██████████| 19999/19999 [00:02<00:00, 8179.83it/s]
100%|██████████| 19999/19999 [00:00<00:00, 169847.12it/s]
100%|██████████| 19999/19999 [00:06<00:00, 2919.10it/s]


In [42]:
df_clean

Unnamed: 0,label,input_ids,token_type_ids,attention_mask
0,4,"[101, 1045, 2106, 2025, 2514, 26608, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"
1,4,"[101, 1045, 2064, 2175, 2013, 3110, 2061, 2062...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,"[101, 1045, 2572, 9775, 1037, 3371, 2000, 2695...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,3,"[101, 1045, 2572, 2412, 3110, 16839, 9080, 128...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,0,"[101, 1045, 2572, 3110, 24665, 7140, 11714, 102]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...
19995,4,"[101, 1045, 2572, 2383, 7020, 2050, 7749, 4826...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
19996,2,"[101, 1045, 7887, 4737, 2055, 2037, 2954, 2114...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
19997,2,"[101, 1045, 2514, 2049, 2590, 2000, 3745, 2023...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
19998,2,"[101, 1045, 5621, 2514, 2008, 2065, 2017, 2024...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [43]:
id_to_label = {label : id for (id, label) in enumerate(classes)}
label_to_id = {id : label for label, id in id_to_label.items()}
id_to_label

{'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}

There are a few NLP preprocessing steps required. Firstly, we must remove contractions like "didnt" "cant" and change them to "did not" and "can not" which will provide better information. We then need to remove any noises such as punctuations and digits, and also any stopwords that we don't need. Afterwards, we can lemmatize the words (changing the words to its equivalent base form, simplifying it) for example, werewolves to werewolf, trying to try, and so on. Lastly, we call the tokenization function of ERNIE and use it to tokenize (effectively vectorizing the strings into numerical data able to be used by the ERNIE model). Finally, we can return the df with its label's encoded to be used for the model.

#3. Dataset Preparation

In [44]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_clean[['input_ids', 'token_type_ids', 'attention_mask']],
                                                    df_clean['label'],
                                                    test_size = 0.3,
                                                    stratify = df_clean['label'])

x_val, x_test, y_val, y_test = train_test_split(x_test,
                                                    y_test,
                                                    test_size = 0.5,
                                                    stratify = y_test)

In [45]:
df_train = pd.concat([x_train, y_train], axis=1)
df_val = pd.concat([x_val, y_val], axis=1)
df_test = pd.concat([x_test, y_test], axis=1)

In [46]:
from datasets import Dataset
ds_train = Dataset.from_pandas(df_train, preserve_index = False)
ds_val = Dataset.from_pandas(df_val, preserve_index = False)
ds_test = Dataset.from_pandas(df_test, preserve_index = False)

#4. Transfer Learning

In [53]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average = 'weighted')
    recall = recall_score(labels, preds, average = 'weighted')
    f1 = f1_score(labels, preds, average = 'weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [54]:
from transformers import ErnieForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

ernie = ErnieForSequenceClassification.from_pretrained(
    "nghuyong/ernie-2.0-base-en", num_labels = 6, id2label = id_to_label, label2id = label_to_id
)

training_args = TrainingArguments(
    output_dir="ERNIE",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps = 100
)

trainer = Trainer(
    model=ernie,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-2.0-base-en and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9298,0.381802,0.879,0.878162,0.879,0.874473
2,0.2794,0.201525,0.921667,0.922001,0.921667,0.921552
3,0.1784,0.185912,0.922,0.926371,0.922,0.922877
4,0.1356,0.178689,0.922667,0.926203,0.922667,0.923387
5,0.1177,0.172639,0.924,0.924927,0.924,0.924243


TrainOutput(global_step=550, training_loss=0.3084389747272838, metrics={'train_runtime': 716.6068, 'train_samples_per_second': 97.676, 'train_steps_per_second': 0.768, 'total_flos': 2155242666997296.0, 'train_loss': 0.3084389747272838, 'epoch': 5.0})

#6. Evaluation

In [55]:
preds = trainer.predict(ds_test)

In [56]:
preds.metrics

{'test_loss': 0.14702995121479034,
 'test_accuracy': 0.9356666666666666,
 'test_precision': 0.9377256470851234,
 'test_recall': 0.9356666666666666,
 'test_f1': 0.9363088336080941,
 'test_runtime': 9.7011,
 'test_samples_per_second': 309.244,
 'test_steps_per_second': 2.474}

The results of the model is pretty good. An accuracy of 94% and a lower validation and training loss each epoch shows that the model is goodly fit. Average precision, recall, and f1 is also good, denoting a good result for all classes. Perhaps with more epochs, the model can converge and get even better results.