In [25]:
import pandas as pd
import numpy as np 
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import torch
import tensorflow as tf
import keras_nlp
from sklearn.preprocessing import LabelEncoder
from ordered_set import OrderedSet

import sentiment_analysis_training as sat

In [26]:
print(tf.__version__)
print(keras_nlp.__version__)

2.20.0
0.25.1


### **English**

In [27]:
train_df = pd.read_csv('./Kaggle/Go_Emotions/data/train.tsv', sep='\t', header=None, names=['text','labels', 'code'])
validation_df = pd.read_csv('./Kaggle/Go_Emotions/data/dev.tsv', sep='\t', header=None, names=['text','labels', 'code'])
test_df = pd.read_csv('./Kaggle/Go_Emotions/data/test.tsv', sep='\t', header=None, names=['text','labels', 'code'])

print(train_df.shape, validation_df.shape, test_df.shape)
train_df.head()

(43410, 3) (5426, 3) (5427, 3)


Unnamed: 0,text,labels,code
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


In [28]:
train_df.drop(inplace=True, axis=1, labels=['code'])
validation_df.drop(inplace=True, axis=1, labels=['code'])
test_df.drop(inplace=True, axis=1, labels=['code'])

train_df.head()

Unnamed: 0,text,labels
0,My favourite food is anything I didn't have to...,27
1,"Now if he does off himself, everyone will thin...",27
2,WHY THE FUCK IS BAYLESS ISOING,2
3,To make her feel threatened,14
4,Dirty Southern Wankers,3


In [29]:
decode_labels = {}
i = 0
with open('./Kaggle/Go_Emotions/data/emotions.txt', 'r') as decoding:
    for line in decoding:
        decode_labels[i] = line.strip()
        i += 1
print(decode_labels)

{0: 'admiration', 1: 'amusement', 2: 'anger', 3: 'annoyance', 4: 'approval', 5: 'caring', 6: 'confusion', 7: 'curiosity', 8: 'desire', 9: 'disappointment', 10: 'disapproval', 11: 'disgust', 12: 'embarrassment', 13: 'excitement', 14: 'fear', 15: 'gratitude', 16: 'grief', 17: 'joy', 18: 'love', 19: 'nervousness', 20: 'optimism', 21: 'pride', 22: 'realization', 23: 'relief', 24: 'remorse', 25: 'sadness', 26: 'surprise', 27: 'neutral'}


In [30]:
def decoding(label):
    label = list(map(int, label.split(',')))
    decoded = []
    for x in label:
        decoded.append(decode_labels.get(x))
    return decoded

train_df['labels'] = train_df['labels'].apply(decoding)

train_df.head()

Unnamed: 0,text,labels
0,My favourite food is anything I didn't have to...,[neutral]
1,"Now if he does off himself, everyone will thin...",[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,[anger]
3,To make her feel threatened,[fear]
4,Dirty Southern Wankers,[annoyance]


In [31]:
validation_df['labels'] = validation_df['labels'].apply(decoding)
test_df['labels'] = test_df['labels'].apply(decoding)

In [32]:
validation_df.head()

Unnamed: 0,text,labels
0,Is this in New Orleans?? I really feel like th...,[neutral]
1,"You know the answer man, you are programmed to...","[approval, neutral]"
2,I've never been this sad in my life!,[sadness]
3,The economy is heavily controlled and subsidiz...,"[approval, neutral]"
4,He could have easily taken a real camera from ...,[optimism]


In [33]:
test_df.head()

Unnamed: 0,text,labels
0,Iâ€™m really sorry about your situation :( Altho...,[sadness]
1,It's wonderful because it's awful. At not with.,[admiration]
2,"Kings fan here, good luck to you guys! Will be...",[excitement]
3,"I didn't know that, thank you for teaching me ...",[gratitude]
4,They got bored from haunting earth for thousan...,[neutral]


In [34]:
goemotions_to_fer = {
    "anger": "angry",
    "annoyance": "angry",
    "disapproval": "angry", 

    "disgust": "disgust",

    "fear": "fear",
    "nervousness": "fear",

    "admiration": "happy",
    "amusement": "happy",
    "approval": "happy",
    "caring": "happy",
    "excitement": "happy",
    "gratitude": "happy",
    "joy": "happy",
    "love": "happy",
    "optimism": "happy",
    "pride": "happy",
    "relief": "happy",

    "disappointment": "sad",
    "embarrassment": "sad",
    "grief": "sad",
    "remorse": "sad",
    "sadness": "sad",

    "surprise": "surprise",
    "realization": "surprise",
    "curiosity": "surprise",  
    "confusion": "surprise",

    "neutral": "neutral"  
}

In [35]:
def map_to_fer(go_labels, mapping):
    mapped = OrderedSet()
    for label in go_labels:
        if label in mapping:
            mapped.add(mapping[label])
    return list(mapped) if mapped else ["neutral"]  
train_df['fer_labels'] = train_df['labels'].apply(lambda labs: map_to_fer(labs, goemotions_to_fer))
validation_df['fer_labels'] = validation_df['labels'].apply(lambda labs: map_to_fer(labs, goemotions_to_fer))
test_df['fer_labels'] = test_df['labels'].apply(lambda labs: map_to_fer(labs, goemotions_to_fer))

In [36]:
train_df.head()

Unnamed: 0,text,labels,fer_labels
0,My favourite food is anything I didn't have to...,[neutral],[neutral]
1,"Now if he does off himself, everyone will thin...",[neutral],[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,[anger],[angry]
3,To make her feel threatened,[fear],[fear]
4,Dirty Southern Wankers,[annoyance],[angry]


In [37]:
all_labels = train_df['fer_labels'].explode().value_counts()
print(all_labels)

fer_labels
happy       16935
neutral     14608
angry        5579
surprise     5367
sad          3263
disgust       793
fear          726
Name: count, dtype: int64


In [38]:
train_df['fer_label'] = train_df['fer_labels'].apply(lambda x: x[0])
validation_df['fer_label'] = validation_df['fer_labels'].apply(lambda x: x[0])
test_df['fer_label'] = test_df['fer_labels'].apply(lambda x: x[0])

In [39]:
train_df['sentiment_final'] = train_df['labels'].apply(lambda x: x[0])
validation_df['sentiment_final'] = validation_df['labels'].apply(lambda x: x[0])
test_df['sentiment_final'] = test_df['labels'].apply(lambda x: x[0])

In [40]:
train_df.head()

Unnamed: 0,text,labels,fer_labels,fer_label,sentiment_final
0,My favourite food is anything I didn't have to...,[neutral],[neutral],neutral,neutral
1,"Now if he does off himself, everyone will thin...",[neutral],[neutral],neutral,neutral
2,WHY THE FUCK IS BAYLESS ISOING,[anger],[angry],angry,anger
3,To make her feel threatened,[fear],[fear],fear,fear
4,Dirty Southern Wankers,[annoyance],[angry],angry,annoyance


In [41]:
map_dict = {
    "angry":0,
    "disgust":1,
    "fear":2,
    "happy":3,
    "sad":4,
    "surprise":5,
    "neutral":6
}

In [42]:
train_df['label_id'] = [map_dict[i] for i in train_df['fer_label']]
validation_df['label_id'] = [map_dict[i] for i in validation_df['fer_label']]
test_df['label_id'] = [map_dict[i] for i in test_df['fer_label']]

num_labels = len(map_dict)
print(num_labels)

7


In [43]:
train_texts = train_df['text'].tolist()
train_labels = train_df['label_id'].tolist()
val_texts = validation_df['text'].tolist()
val_labels = validation_df['label_id'].tolist()
test_texts = test_df['text'].tolist()
test_labels = test_df['label_id'].tolist()

In [44]:
train_ds = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
val_ds = tf.data.Dataset.from_tensor_slices((val_texts, val_labels))

train_ds = train_ds.shuffle(1000).batch(8).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(8).prefetch(tf.data.AUTOTUNE)

In [45]:
train_df.head()

Unnamed: 0,text,labels,fer_labels,fer_label,sentiment_final,label_id
0,My favourite food is anything I didn't have to...,[neutral],[neutral],neutral,neutral,6
1,"Now if he does off himself, everyone will thin...",[neutral],[neutral],neutral,neutral,6
2,WHY THE FUCK IS BAYLESS ISOING,[anger],[angry],angry,anger,0
3,To make her feel threatened,[fear],[fear],fear,fear,2
4,Dirty Southern Wankers,[annoyance],[angry],angry,annoyance,0


In [46]:
df = pd.concat([train_df, validation_df,test_df], ignore_index=True)

min_count = df['fer_label'].value_counts().min()
min_count
df = (
    df.groupby('fer_label', group_keys=False)
      .apply(lambda x: x.sample(min_count, random_state=42))
      .reset_index(drop=True)
)

df = df.drop(columns=['labels', 'fer_labels', 'label_id','sentiment_final'])
df = df.rename(columns={"fer_label": "sentiment_final"})    
 
df.head()

Unnamed: 0,text,sentiment_final
0,"I've said it before, and I'll say it again. ""I...",angry
1,There's a difference between rights and law. S...,angry
2,... that's simply wrong.,angry
3,No. I've been bashing it for its very noticabl...,angry
4,Guys. I think OP blew burns in college and wan...,angry


In [47]:
df['sentiment_final'].value_counts()    

sentiment_final
angry       741
disgust     741
fear        741
happy       741
neutral     741
sad         741
surprise    741
Name: count, dtype: int64

In [None]:
results, best_model_name, save_path = sat.main(df=df)

ðŸ“Š Preparando datos...
   Clases encontradas: {'angry': 0, 'disgust': 1, 'fear': 2, 'happy': 3, 'neutral': 4, 'sad': 5, 'surprise': 6}
   Total de muestras: 5187
   Train: 3630 | Validation: 778 | Test: 779

ðŸš€ Entrenando bert-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   Iniciando entrenamiento...


### **Espanol**

In [None]:
train_df_es = pd.read_csv('./Kaggle/task2-train-dev/train.tsv', sep='\t')
validation_df_es = pd.read_csv('./Kaggle/task2-train-dev/dev.tsv', sep='\t')

print(train_df_es.shape, validation_df_es.shape)
train_df_es.head()

(5886, 3) (857, 3)


Unnamed: 0,id,tweet,label
0,1,El AtlÃ©tico resignado a perder HASHTAG ðŸ˜” http...,sadness
1,2,Leer proporciona una mejor visiÃ³n del mundo ðŸ¤“ ...,joy
2,3,Amo a Arya Stark por encima de todas las cosas...,joy
3,4,Gracias HASHTAG es increÃ­ble que una niÃ±a logr...,others
4,5,Solo siento que hayamos perdido 24 escaÃ±os de ...,sadness


In [None]:
# train_df_es.drop(inplace=True, axis=1, labels=['label'])
# validation_df_es.drop(inplace=True, axis=1, labels=['label'])

train_df_es.head()

Unnamed: 0,id,tweet,label
0,1,El AtlÃ©tico resignado a perder HASHTAG ðŸ˜” http...,sadness
1,2,Leer proporciona una mejor visiÃ³n del mundo ðŸ¤“ ...,joy
2,3,Amo a Arya Stark por encima de todas las cosas...,joy
3,4,Gracias HASHTAG es increÃ­ble que una niÃ±a logr...,others
4,5,Solo siento que hayamos perdido 24 escaÃ±os de ...,sadness


In [None]:
df_es = pd.concat([train_df_es, validation_df_es], ignore_index=True)

min_count = df_es['label '].value_counts().min()

df_es = (
    df_es.groupby('label ', group_keys=False)
      .apply(lambda x: x.sample(min_count, random_state=42))
      .reset_index(drop=True)
)

# df_es = df_es.drop(columns=['label', 'fer_label', 'label_id','sentiment_final'])
df_es = df_es.rename(columns={"label ": "sentiment_final","tweet": "text"})

df_es

Unnamed: 0,id,text,sentiment_final
0,2737,"Usuarios lamentan incendio en HASHTAG, pero le...",anger
1,554,Esta pÃ©simo que aprovechen para sacar sus foto...,anger
2,2467,Esta bien que se protejan y que mejor que en l...,anger
3,3133,Ya van a empezar con sus fotos de cuando conoc...,anger
4,2973,Ã‰ste Hijo de Puta Marxista va a reprimir a los...,anger
...,...,...,...
534,712,CÃ³mo que anoche HASHTAG no os gustÃ³ pero quÃ© o...,surprise
535,1930,HASHTAG Feliz dÃ­a HASHTAG HASHTAG Aprovecha el...,surprise
536,455,Asi estoy despuÃ©s de ver el HASHTAG ðŸ¤¯ðŸ¤¯ðŸ¤¯ El de ...,surprise
537,2636,No tengo palabras para el capÃ­tulo que acabÃ¡mo...,surprise


In [None]:
df_es['sentiment_final'].value_counts()    

sentiment_final
anger        77
disgust      77
fear         77
joy          77
others       77
sadness      77
surprise     77
Name: count, dtype: int64

In [None]:
results, best_model_name, save_path = sat.main(df=df_es)

ðŸ“Š Preparando datos...
   Clases encontradas: {'anger ': 0, 'disgust ': 1, 'fear ': 2, 'joy ': 3, 'others ': 4, 'sadness ': 5, 'surprise ': 6}
   Total de muestras: 539
   Train: 377 | Validation: 81 | Test: 81

ðŸš€ Entrenando bert-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   Iniciando entrenamiento...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 