# Import Data

In [None]:
# Get Dataset
#Kaggle: https://www.kaggle.com/datasets/shivamb/go-emotions-google-emotions-dataset

!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

In [1]:
!nvidia-smi

Fri Apr 14 20:19:45 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti    WDDM | 00000000:29:00.0  On |                  N/A |
| 30%   33C    P0               42W / 200W|    631MiB /  8192MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessor
import contractions
import json
import re
from collections import OrderedDict
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas()

In [3]:
DATA_PATH = 'data/full_dataset/goemotions_'
OUTPUT_DIR = 'training_data'

df1 = pd.read_csv(f'{DATA_PATH}1.csv')
df2 = pd.read_csv(f'{DATA_PATH}2.csv')
df3 = pd.read_csv(f'{DATA_PATH}3.csv')

frames = [df1,df2,df3]

df = pd.concat(frames)

df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


# Preprocess Data

In [4]:
# FROM: https://www.kaggle.com/code/esknight/emotion-classification-final
# Function for cleaning text
def clean_text(text):
    re_number = re.compile('[0-9]+')
    re_url = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    re_tag = re.compile('\[[A-Z]+\]')
    re_char = re.compile('[^0-9a-zA-Z\s?!.,:\'\"//]+')
    re_char_clean = re.compile('[^0-9a-zA-Z\s?!.,\[\]]')
    re_punc = re.compile('[?!,.\'\"]')
  
    text = re.sub(re_char, "", text) # Remove unknown character 
    text = contractions.fix(text) # Expand contraction
    text = re.sub(re_url, ' [url] ', text) # Replace URL with number
    text = re.sub(re_char_clean, "", text) # Only alphanumeric and punctuations.
    #text = re.sub(re_punc, "", text) # Remove punctuation.
    text = text.lower() # Lower text
    text = " ".join([w for w in text.split(' ') if w != " "]) # Remove whitespace

    return text

In [5]:
# Clean text
df['clean_text'] = df['text'].progress_apply(clean_text)

# Drop Useless Columns
df = df.drop(columns=['id','example_very_unclear','author','subreddit','link_id','parent_id','created_utc','rater_id'])

# Reorganize Columns
df = df[['clean_text'] + [col for col in df.columns if col not in ['text','clean_text']]]

  0%|          | 0/211225 [00:00<?, ?it/s]

In [6]:
#View emotions easier
emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
emotions

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [7]:
id2label = {str(i):label for i, label in enumerate(emotions)}
label2id = {label:str(i) for i, label in enumerate(emotions)}

In [8]:
print(id2label)
print(label2id)

{'0': 'admiration', '1': 'amusement', '2': 'anger', '3': 'annoyance', '4': 'approval', '5': 'caring', '6': 'confusion', '7': 'curiosity', '8': 'desire', '9': 'disappointment', '10': 'disapproval', '11': 'disgust', '12': 'embarrassment', '13': 'excitement', '14': 'fear', '15': 'gratitude', '16': 'grief', '17': 'joy', '18': 'love', '19': 'nervousness', '20': 'optimism', '21': 'pride', '22': 'realization', '23': 'relief', '24': 'remorse', '25': 'sadness', '26': 'surprise', '27': 'neutral'}
{'admiration': '0', 'amusement': '1', 'anger': '2', 'annoyance': '3', 'approval': '4', 'caring': '5', 'confusion': '6', 'curiosity': '7', 'desire': '8', 'disappointment': '9', 'disapproval': '10', 'disgust': '11', 'embarrassment': '12', 'excitement': '13', 'fear': '14', 'gratitude': '15', 'grief': '16', 'joy': '17', 'love': '18', 'nervousness': '19', 'optimism': '20', 'pride': '21', 'realization': '22', 'relief': '23', 'remorse': '24', 'sadness': '25', 'surprise': '26', 'neutral': '27'}


In [9]:
# One-Hot Encoding all Emotions
df["labels"] = df[emotions].values.tolist()
df.head()

Unnamed: 0,clean_text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,labels
0,that game hurt.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,sexuality should not be a grouping category i...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"you do right, if you do not care then fuck them!",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,man i love reddit.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"name was nowhere near them, he was by the falc...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# create train / test splits
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]

(df_train.shape, df_test.shape)

((168971, 30), (42254, 30))

# Data Visualization

In [None]:
# Emotions Visualization by number of cases

temp = df[list(emotions)].sum(axis=0) \
    .reset_index() \
    .rename(columns={'index': 'emotion', 0: 'n'}) \
    .sort_values('n', ascending=False)

fig, ax = plt.subplots(figsize=(7, 7))
ax.tick_params(axis='x', rotation=90)
sns.barplot(data=temp, x='n', 
            y='emotion',
            dodge=False,
            ax=ax).set_title('Emotions by number of appearances')

In [None]:
# Concatenating emotions

pos = {'admiration','amusement','approval','caring','desire','excitement','gratitude','joy','love',
       'optimism','pride','relief'}
neg = {'sadness','fear','embarrassment','disapproval','disappointment','annoyance','anger','nervousness',
       'remorse','grief','disgust'}
amb= {'realization','surprise','curiosity','confusion','neutral'}

In [None]:
# Emotions and data vis

print("Length of data: ", len(df))
print("Number of emotions: ", len(emotions))
print("Number of positive emotions: ", len(pos))
print("Number of negative emotions: ", len(neg))
print("Number of ambiguous emotions: ", len(amb))

In [None]:
# Emotions dataframe to later on aggregate

df_emotion = pd.DataFrame()
df_emotion['emotion'] = list(emotions)
df_emotion['group'] = ''
df_emotion['group'].loc[df_emotion['emotion'].isin(pos)] = 'positive'
df_emotion['group'].loc[df_emotion['emotion'].isin(neg)] = 'negative'
df_emotion['group'].loc[df_emotion['emotion'].isin(amb)] = 'ambiguous'

In [None]:
df_emotion.head()

In [None]:
# Emotions by number of appearences but by group

temp = pd.DataFrame()
temp['true positive rate'] = df.iloc[:, 3:-1].mean(0)
temp['emotion'] = df.columns[3:-1]
temp = temp.merge(df_emotion, how='left', on='emotion')
temp = temp.sort_values('true positive rate')

fig, ax = plt.subplots(figsize=(12, 7))
ax.tick_params(axis='x', rotation=90)

sns.barplot(x=temp['emotion'], 
            y=temp['true positive rate'], 
            hue=temp['group'], 
            dodge=False)

In [None]:
def represent_train_test_balance(train_df,test_df):
    # Class representation for train/test DS
    train_GO = (train_df.loc[:,list(emotions)].sum(axis=0) / len(train_df)) * 100
    test_GO = (test_df.loc[:,list(emotions)].sum(axis=0) / len(test_df)) * 100
    
    # Unique dataset for visualization purposes
    
    ds_GO = pd.DataFrame(data=[train_GO, test_GO]).T.reset_index(drop=False)
    ds_GO.columns = ['Emotion', 'Train','Test']
    ds_GO = ds_GO.sort_values('Train',ascending=False)
    ds_GO = ds_GO.melt(id_vars='Emotion', var_name='Dataset', value_vars=['Train','Test'],
                      value_name='Percentage')
    
    # Display dataset
    
    display(ds_GO.head(10))
    
    print("Graph Visualization")
    
    plt.figure(figsize=(20,15))
    sns.barplot(x='Percentage', y='Emotion', data=ds_GO, orient='h', hue='Dataset')
    plt.title('Percentage of samples per emotion in train and test datasets', fontweight='bold', fontsize=20)
    plt.xlabel('Percentage of all samples', fontweight='bold', fontsize=16)
    plt.ylabel('Emotions', fontweight='bold', fontsize= 16)
    plt.show()
represent_train_test_balance(df_train, df_test)

# Tokenization / Encoding / Method Structuring

In [None]:
# Imports
from transformers import AutoTokenizer, TrainingArguments, Trainer, DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification, XLNetForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.xlnet.modeling_xlnet import XLNetForSequenceClassificationOutput
from torch import nn
import random
import torch
import platform
import sys
import sklearn as sk
from typing import Optional, Union, Tuple


In [None]:
MAX_LEN = 256

In [None]:
class GoEmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx].clone().detach() for key, val in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
def train_test(df_train, df_test, tokenizer): 
  # Encodings

  train_encodings = tokenizer(df_train["clean_text"].values.tolist(), truncation=True)
  test_encodings = tokenizer(df_test["clean_text"].values.tolist(), truncation=True)

  # labels / output
  train_emotions = df_train["labels"].values.tolist()
  test_emotions = df_test["labels"].values.tolist()

  train_dataset = GoEmotionDataset(train_encodings, train_emotions)
  test_dataset = GoEmotionDataset(test_encodings, test_emotions)
  return train_dataset, test_dataset
  
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    y_pred = torch.from_numpy(logits)
    y_true = torch.from_numpy(labels)
    y_pred = y_pred.sigmoid()
    y_pred = y_pred>0.5
    y_true = y_true.bool()
    acc = (y_pred==y_true).float().mean().item()

    return {       
      'Accuracy': acc
    }
    
def set_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic=False
    torch.backends.cudnn.benchmark = False
    
def device_to_use():
    has_gpu = torch.cuda.is_available()
    has_mps = getattr(torch,'has_mps',False)
    device = "mps" if getattr(torch,'has_mps',False) \
        else "gpu" if torch.cuda.is_available() else "cpu"

    print(f"Python Platform: {platform.platform()}")
    print(f"PyTorch Version: {torch.__version__}")
    print()
    print(f"Python {sys.version}")
    print(f"Pandas {pd.__version__}")
    print(f"Scikit-Learn {sk.__version__}")
    print("GPU is", "available" if has_gpu else "NOT AVAILABLE")
    print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
    print(f"Target device is {device}")
    return device

def model_train(train_dataset, test_dataset, model, tokenizer, NUM_EPOCHS = 10,batch_size = 16, adam_epsilon_arg = 1e-8, learning_rate_arg = 2e-5, use_mps_device_arg = False, model_name = "default"):
  training_args = TrainingArguments( 
    output_dir= OUTPUT_DIR+"/"+model_name,    
    adam_epsilon = adam_epsilon_arg,
    learning_rate = learning_rate_arg,
    use_mps_device = use_mps_device_arg, # Mac Sylicon GPU
    per_device_train_batch_size = batch_size, 
    per_device_eval_batch_size = batch_size*4,
    gradient_accumulation_steps = 2, # scale batch size without needing more memory
    num_train_epochs= NUM_EPOCHS,
    do_eval = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True, # this allows to automatically get the best model at the end based on whatever metric we want
    metric_for_best_model = 'Accuracy',
    greater_is_better = True,
    weight_decay=0.01,
    seed = 25,
    report_to="none"
  )
  set_seed(training_args.seed)
  trainer = Trainer(
      model = model,
      args = training_args,
      train_dataset = train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics,
      tokenizer=tokenizer
  )
  return training_args, trainer
                                              

In [None]:
# Classes to Each Model

class DistilBertForMultilabelSequenceClassification(DistilBertForSequenceClassification):
    def __init__(self, config):
      super().__init__(config)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.Tensor, ...]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + distilbert_output[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions)

class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    def __init__(self, config):
      super().__init__(config)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        bert_output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = bert_output[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + bert_output[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=bert_output.hidden_states,
            attentions=bert_output.attentions)

class RoBertaForMultilabelSequenceClassification(RobertaForSequenceClassification):
    def __init__(self, config):
      super().__init__(config)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        roberta_output = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = roberta_output[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + roberta_output[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=roberta_output.hidden_states,
            attentions=roberta_output.attentions)

class XLNetForMultilabelSequenceClassification(XLNetForSequenceClassification):
    def __init__(self, config):
      super().__init__(config)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        mems: Optional[torch.Tensor] = None,
        perm_mask: Optional[torch.Tensor] = None,
        target_mapping: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        input_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_mems: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,  # delete when `use_cache` is removed in XLNetModel
    ) -> Union[Tuple, XLNetForSequenceClassificationOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        xlnet_output = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            mems=mems,
            perm_mask=perm_mask,
            target_mapping=target_mapping,
            token_type_ids=token_type_ids,
            input_mask=input_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_mems=use_mems,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs)
        output = xlnet_output[0]
        output = self.sequence_summary(output)
        logits = self.logits_proj(output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + xlnet_output[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=xlnet_output.hidden_states,
            attentions=xlnet_output.attentions)

# Pre-Trained Model - DistilBERT

In [None]:
model_path_or_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
num_labels=len(emotions)
device = device_to_use()
if device == 'gpu': device = 'cuda'
model = DistilBertForMultilabelSequenceClassification.from_pretrained(model_path_or_name, num_labels=num_labels).to(device)
model = model_config_ids(model, id2label, label2id)

In [None]:
train_dataset, test_dataset = train_test(df_train, df_test, tokenizer)
training_args, trainer = model_train(train_dataset, test_dataset, model, tokenizer, NUM_EPOCHS = 3,batch_size = 16, adam_epsilon_arg = 1e-8, learning_rate_arg = 2e-5, use_mps_device_arg = False, model_name = "distilbert")

In [None]:
trainer.evaluate()

In [None]:
trainer.train()

# Pre-Trained Model - BERT

In [None]:
model_path_or_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
num_labels=len(emotions)
device = device_to_use()
if device == 'gpu': device = 'cuda'
model = BertForMultilabelSequenceClassification.from_pretrained(model_path_or_name, num_labels=num_labels).to(device)
model = model_config_ids(model, id2label, label2id)

In [None]:
train_dataset, test_dataset = train_test(df_train, df_test, tokenizer)
training_args, trainer = model_train(train_dataset, test_dataset, model, tokenizer, NUM_EPOCHS = 3,batch_size = 16, adam_epsilon_arg = 1e-8, learning_rate_arg = 2e-5, use_mps_device_arg = False, model_name = "bert")

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

# Pre-Trained Model - RoBERTa

In [None]:
model_path_or_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
num_labels=len(emotions)
device = device_to_use()
if device == 'gpu': device = 'cuda'
model = RoBertaForMultilabelSequenceClassification.from_pretrained(model_path_or_name, num_labels=num_labels).to(device)
model = model_config_ids(model, id2label, label2id)

In [None]:
train_dataset, test_dataset = train_test(df_train, df_test, tokenizer)
training_args, trainer = model_train(train_dataset, test_dataset, model, tokenizer, NUM_EPOCHS = 3,batch_size = 16, adam_epsilon_arg = 1e-8, learning_rate_arg = 2e-5, use_mps_device_arg = False, model_name = "roberta")

In [None]:
trainer.evaluate()

In [None]:
trainer.train()

# Pre-Trained Model - XLNet

In [None]:
model_path_or_name = "xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
num_labels=len(emotions)
device = device_to_use()
if device == 'gpu': device = 'cuda'
model = XLNetForMultilabelSequenceClassification.from_pretrained(model_path_or_name, num_labels=num_labels).to(device)
model = model_config_ids(model, id2label, label2id)

In [None]:
train_dataset, test_dataset = train_test(df_train, df_test, tokenizer)
training_args, trainer = model_train(train_dataset, test_dataset, model, tokenizer, NUM_EPOCHS = 3,batch_size = 8, adam_epsilon_arg = 1e-8, learning_rate_arg = 2e-5, use_mps_device_arg = False, model_name = "xlnet")

In [None]:
trainer.train()

# Transformer from Scratch - All you need!

Tokenizer - Still want to try some new ones

In [11]:
import time
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from transformer import MultilabelSequenceClassificationTransformer, MultilabelLocalAttentionSequenceClassificationTransformer
# Imports
from transformers import AutoTokenizer, TrainingArguments, Trainer, DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification, XLNetForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.xlnet.modeling_xlnet import XLNetForSequenceClassificationOutput
from torch import nn
import random
import torch
import platform
import sys
import sklearn as sk
from typing import Optional, Union, Tuple
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score

In [12]:
spacy_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

In [13]:
class GoEmotionDatasetScratch(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx].clone().detach() for key in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [14]:
def create_word_to_ix(df_train, df_test):
    word_to_ix = {'<pad>': 0, '<unk>': 1}
    for text in pd.concat([df_train["clean_text"], df_test["clean_text"]]):
        for token in spacy_tokenizer(text):
            if token not in word_to_ix:
                word_to_ix[token] = len(word_to_ix)
    return word_to_ix

In [15]:
def encode_text(text, word_to_ix, max_length=128):
    tokens = [t for t in spacy_tokenizer(text)]
    input_ids = [word_to_ix.get(token, word_to_ix['<unk>']) for token in tokens][:max_length]
    input_ids = input_ids + [0] * (max_length - len(input_ids))
    attention_mask = [1 if token_id != 0 else 0 for token_id in input_ids]

    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long),
        'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
    }

In [16]:
def train_test(df_train, df_test, tokenizer, word_to_ix):
    # Encodings
    train_encoded_texts = [encode_text(text, word_to_ix) for text in df_train["clean_text"].values.tolist()]
    test_encoded_texts = [encode_text(text, word_to_ix) for text in df_test["clean_text"].values.tolist()]

    train_encodings = {
        'input_ids': [text_encoding['input_ids'] for text_encoding in train_encoded_texts],
        'attention_mask': [text_encoding['attention_mask'] for text_encoding in train_encoded_texts]
    }

    test_encodings = {
        'input_ids': [text_encoding['input_ids'] for text_encoding in test_encoded_texts],
        'attention_mask': [text_encoding['attention_mask'] for text_encoding in test_encoded_texts]
    }

    # labels / output
    train_emotions = df_train["labels"].values.tolist()
    test_emotions = df_test["labels"].values.tolist()

    train_dataset = GoEmotionDatasetScratch(train_encodings, train_emotions)
    test_dataset = GoEmotionDatasetScratch(test_encodings, test_emotions)

    return train_dataset, test_dataset, len(word_to_ix)

In [17]:
def train_model(model, train_dataset, val_dataset, epochs, batch_size, device, lr=0.001, weight_decay=0.01, warmup_steps=0, patience = 5):
    
    # Create DataLoaders for the training and validation datasets
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,pin_memory=True)

    # Initialize the optimizer with model parameters, learning rate, and weight decay
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    model.to(device)

    # Calculate the total number of training steps and create the learning rate scheduler
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

    for epoch in range(epochs):
        start_time = time.time()
        model.train()
        train_loss = 0

        for idx, batch in enumerate(train_loader):

            # Reset the gradients for the optimizer
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            loss, _ = model(input_ids, labels=labels)

            # Backward pass to compute gradients
            loss.backward()

            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Update model parameters using the optimizer
            optimizer.step()

            # Update the learning rate using the scheduler
            scheduler.step()

            train_loss += loss.item()

            if (idx + 1) % 500 == 0:
                print(f"Epoch {epoch + 1}/{epochs} | Batch {idx + 1}/{len(train_loader)} | Train Loss: {loss.item():.4f}")

        model.eval()
        val_loss = 0
        val_acc = 0
        val_f1 = 0
        num_val_batches = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                loss, logits = model(input_ids, labels=labels)
                val_loss += loss.item()
                y_pred = logits.cpu()
                y_true = labels.cpu()
                y_pred = y_pred.sigmoid()
                y_pred = y_pred>0.5
                y_true = y_true.bool()
                f1 = f1_score(y_true, y_pred, average='micro')
                acc = (y_pred==y_true).float().mean().item()
                val_f1 += f1
                val_acc += acc
                num_val_batches += 1

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_acc /= num_val_batches
        val_f1 /= num_val_batches
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Epoch {epoch + 1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f} | Val Acc: {val_acc:.4f} | Time: {elapsed_time:.2f}s")
        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else:
            counter += 1
            print(f"EarlyStopping counter: {counter} out of {patience}")
            if counter >= patience:
                print("Early stopping")
                break

    return best_val_loss, val_acc, val_f1


In [18]:
word_to_ix = create_word_to_ix(df_train, df_test)
train_dataset, test_dataset, vocab_size = train_test(df_train, df_test, spacy_tokenizer, word_to_ix)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_pad_idx = word_to_ix['<pad>']

In [None]:
# Normal 
model2 = MultilabelSequenceClassificationTransformer(
    src_vocab_size= vocab_size,
    num_classes= len(emotions),
    src_pad_idx= src_pad_idx,
    emb_size = 128,
    max_len=128
).to(device)

In [25]:
import pickle
with open('dict_results_hyperparams.pkl', 'rb') as fp:
    results_hyperparam = pickle.load(fp)
    print('Results hyper parameters dictionary')
    print(results_hyperparam)
best_result = min(results_hyperparam, key=lambda x: x["val_loss"])

best_result = best_result['params']
MAX_LEN = 512

Results hyper parameters dictionary
[{'params': {'lr': 1e-05, 'num_layers': 6, 'heads': 8, 'dropout': 0.2, 'emb_size': 512, 'forward_expansion': 4, 'batch_size': 64, 'weight_decay': 1e-05}, 'val_loss': 0.1694728625672204, 'val_acc': 0.9588509287152972}, {'params': {'lr': 1e-05, 'num_layers': 6, 'heads': 8, 'dropout': 0.2, 'emb_size': 512, 'forward_expansion': 4, 'batch_size': 64, 'weight_decay': 0.001}, 'val_loss': 0.16154434851237706, 'val_acc': 0.9588509287152972}, {'params': {'lr': 1e-05, 'num_layers': 6, 'heads': 8, 'dropout': 0.2, 'emb_size': 512, 'forward_expansion': 8, 'batch_size': 64, 'weight_decay': 1e-05}, 'val_loss': 0.17052964653287614, 'val_acc': 0.9588509287152972}, {'params': {'lr': 1e-05, 'num_layers': 6, 'heads': 8, 'dropout': 0.2, 'emb_size': 512, 'forward_expansion': 8, 'batch_size': 64, 'weight_decay': 0.001}, 'val_loss': 0.1628226169518062, 'val_acc': 0.9588509287152972}, {'params': {'lr': 1e-05, 'num_layers': 6, 'heads': 16, 'dropout': 0.2, 'emb_size': 512, 'forw

In [26]:
model = MultilabelLocalAttentionSequenceClassificationTransformer(
            src_vocab_size=vocab_size,
            num_classes=len(emotions),
            src_pad_idx=src_pad_idx,
            emb_size=best_result["emb_size"],
            num_layers=best_result["num_layers"],
            forward_expansion=best_result["forward_expansion"],
            heads=best_result["heads"],
            device=device,
            max_len=MAX_LEN
        ).to(device)

In [27]:
# Train Model
train_model(model, train_dataset, test_dataset, epochs=50, batch_size=best_result['batch_size'], device=device, lr=best_result['lr'])

In [None]:
# Train Model
torch.manual_seed(42)
epochs = 10
batch_size = 128
lr = 1e-4
train_model(model2, train_dataset, test_dataset, epochs, batch_size, device, lr)

In [None]:
def train_model_debug(model, train_dataset, val_dataset, epochs, batch_size, device, lr=0.001, weight_decay=0.01, warmup_steps=0):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,pin_memory=True)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    model.to(device)

    for _ in range(epochs):
        model.train()

        for _, batch in enumerate(train_loader):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            loss, logits = model(input_ids, labels=labels)
            return loss, logits, labels


In [None]:
# To verify everything was correct, which is
loss, logits, labels = train_model_debug(model, train_dataset, test_dataset, epochs, batch_size, device, lr)
print(logits[0]) # Which will then be normalized
print(labels[0])
loss_fct = torch.nn.BCEWithLogitsLoss()
loss = loss_fct(logits[0], labels[0].float())
print(loss)
loss = loss_fct(logits, labels.float())
print(loss)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def analyze_dataset(dataset):
    label_counts = np.zeros(28)
    sequence_lengths = []

    for idx in range(len(dataset)):
        sample = dataset[idx]
        input_ids = sample['input_ids']
        labels = sample['labels']

        sequence_lengths.append(len(input_ids))
        label_counts += labels.numpy()

        # Check if there are NaN or infinite values in input data
    if torch.isnan(input_ids).any() or torch.isinf(input_ids).any():
        print(f"NaN or infinite values found in input_ids at batch {idx + 1}")

    if torch.isnan(labels).any() or torch.isinf(labels).any():
        print(f"NaN or infinite values found in labels at batch {idx + 1}")

    # Normalize label counts
    label_counts /= len(dataset)

    # Analyze the distribution of sequence lengths
    plt.hist(sequence_lengths, bins=50)
    plt.xlabel('Sequence Length')
    plt.ylabel('Frequency')
    plt.title('Distribution of Sequence Lengths')
    plt.show()

    # Analyze the distribution of labels
    plt.bar(range(28), label_counts)
    plt.xlabel('Label')
    plt.ylabel('Frequency')
    plt.title('Distribution of Labels')
    plt.show()

    print(f"Average sequence length: {np.mean(sequence_lengths):.2f}")
    print(f"Standard deviation of sequence length: {np.std(sequence_lengths):.2f}")
    print(f"Label frequencies: {label_counts}")

In [None]:
analyze_dataset(train_dataset)

# Transformers - Hyper Parameters Tuning

In [None]:
import itertools

In [None]:
MAX_LEN = 512

In [None]:
def hyperparameter_search(data, hyperparameters, device):
    results = []

    for values in itertools.product(*hyperparameters.values()):
        params = dict(zip(hyperparameters.keys(), values))
        print(f"Training with hyperparameters: {params}")

        # Create a new model with the current hyperparameters
        model = MultilabelSequenceClassificationTransformer(
            src_vocab_size=vocab_size,
            num_classes=len(emotions),
            src_pad_idx=src_pad_idx,
            emb_size=params["emb_size"],
            num_layers=params["num_layers"],
            forward_expansion=params["forward_expansion"],
            heads=params["heads"],
            dropout=params["dropout"],
            device=device,
            max_len=MAX_LEN
        ).to(device)

        # Train the model and get the validation loss
        val_loss, val_acc = train_model(
            model, train_dataset, test_dataset, epochs=5, batch_size = params["batch_size"], device = device, lr=params["lr"], weight_decay=params['weight_decay'], warmup_steps=0
        )

        # Save the results
        results.append({"params": params, "val_loss": val_loss, "val_acc": val_acc})

    return results

In [None]:
data_sample = df.sample(frac=0.01, random_state=42)
mask = np.random.rand(len(data_sample)) < 0.8
df_train_sample = data_sample[mask]
df_test_sample = data_sample[~mask]

word_to_ix = create_word_to_ix(df_train, df_test)
train_dataset, test_dataset, vocab_size = train_test(df_train_sample, df_test_sample, spacy_tokenizer, word_to_ix)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_pad_idx = word_to_ix['<pad>']

In [None]:
hyperparameters = {
    "lr": [1e-5, 1e-3],
    "num_layers": [6, 12],
    "heads": [8, 16],
    "dropout": [0.2],
    "emb_size": [512],
    "forward_expansion": [4, 8],
    "batch_size": [64],
    "weight_decay": [1e-5,1e-3]
}

In [None]:
# Perform the hyperparameter search
search_results = hyperparameter_search(data_sample, hyperparameters, device)

# Print the best hyperparameters
best_result = min(search_results, key=lambda x: x["val_loss"])
print(f"Best hyperparameters: {best_result['params']} with validation loss: {best_result['val_loss']}")

In [None]:
best_result = min(results_hyperparam, key=lambda x: x["val_loss"])

In [None]:
# Create some new options just to verify some hyper params that weren't searched. ´
# This is because they're not so relevant and were causing problems running by the fact of GPU VRAM limitations
# Separated in two dicts in order to work with the 8GB VRAM
params_hp_1 = {
    "dropout": [0.1, 0.4],
    "emb_size": [128,256]
    }
params_hp_2 = {
    "batch_size": [32, 128],
    "weight_decay":[0]

}
list_params = []
for key, value in params_hp_1.items():
    for v in value:
        new_params = dict(best_result["params"])
        new_params.update({key:v})
        list_params.append(new_params)

In [None]:
def hyperparameter_research_ft(params_list, results,device):
    for params in params_list:
        print(f"Training with hyperparameters: {params}")
        # Create a new model with the current hyperparameters
        model = MultilabelSequenceClassificationTransformer(
            src_vocab_size=vocab_size,
            num_classes=len(emotions),
            src_pad_idx=src_pad_idx,
            emb_size=params["emb_size"],
            num_layers=params["num_layers"],
            forward_expansion=params["forward_expansion"],
            heads=params["heads"],
            dropout=params["dropout"],
            device=device,
            max_len=MAX_LEN
        ).to(device)

        # Train the model and get the validation loss
        val_loss, val_acc = train_model(
            model, train_dataset, test_dataset, epochs=5, batch_size = params["batch_size"], device = device, lr=params["lr"], weight_decay=params['weight_decay'], warmup_steps=0
        )

        # Save the results
        results.append({"params": params, "val_loss": val_loss, "val_acc": val_acc})

    return results

In [None]:
import pickle

In [None]:
# Load File

with open('dict_results_hyperparams.pkl', 'rb') as fp:
    results_hyperparam = pickle.load(fp)
    print('Results hyper parameters dictionary')
    print(results_hyperparam)

In [None]:
search_results = hyperparameter_research_ft(list_params, results_hyperparam, device)

In [53]:
# Save File

with open('dict_results_hyperparams.pkl', 'wb') as fp:
    pickle.dump(results_hyperparam, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


In [None]:
# Load File

with open('dict_results_hyperparams.pkl', 'rb') as fp:
    results_hyperparam = pickle.load(fp)
    print('Results hyper parameters dictionary')
    print(results_hyperparam)

# Compare Models

In [None]:
import json

BERT_PATH = 'training_data/bert/checkpoint-15816/trainer_state.json'
DISTILBERT_PATH = 'training_data/distilbert/checkpoint-15876/trainer_state.json'
ROBERTA_PATH = 'training_data/roberta/checkpoint-15816/trainer_state.json'
XLNET_PATH = 'training_data/xlnet/checkpoint-31710/trainer_state.json'


model_files = [BERT_PATH, DISTILBERT_PATH, ROBERTA_PATH, XLNET_PATH]

results = {}

for model_file in model_files:
    with open(model_file) as f:
        data = json.load(f)
        for item in data['log_history']:
            if 'epoch' in item and item['epoch'] == 3.0:
                results[model_file.split('/')[1]] = item
                break

print(results)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# assuming the 'results' dictionary has been created as in the previous example

metrics = ['eval_Accuracy', 'eval_loss', 'eval_runtime']

# create a bar chart for accuracy and loss
fig, ax = plt.subplots()
x = np.arange(len(model_files))

for i, metric in enumerate(metrics[:-1]):
    values = [results[model_file.split('/')[1]].get(metric, np.nan) for model_file in model_files]
    ax.bar(x + i*0.25 - 0.25, values, width=0.25, label=metric)

ax.set_xticks(x)
ax.set_xticklabels([model_file.split('/')[1] for model_file in model_files], rotation=45, ha='right')
ax.legend()

plt.show()

# create a bar chart for runtime (logarithmic scale)
fig, ax = plt.subplots()
x = np.arange(len(model_files))

values = [results[model_file.split('/')[1]].get('eval_runtime', np.nan) for model_file in model_files]
ax.bar(x, values, log=True, color='orange')

ax.set_xticks(x)
ax.set_xticklabels([model_file.split('/')[1] for model_file in model_files], rotation=45, ha='right')
ax.set_ylabel('eval_runtime (seconds)')
ax.set_xlabel('model')
ax.set_title('Evaluation Runtime at Epoch 3.0')

plt.show()

# Implementation Pre-Trained DistilBert with own Dataset and Testing

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
# Import model

DISTILBERT_PATH = 'training_data/distilbert/checkpoint-15876'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DistilBertForMultilabelSequenceClassification.from_pretrained(DISTILBERT_PATH).to(device)
tokenizer = AutoTokenizer.from_pretrained(DISTILBERT_PATH)

In [None]:
import pandas as pd

In [None]:
df_consumer = pd.read_csv('data/consumer_data_text.csv')

In [None]:
df_consumer.head()

In [None]:
df_consumer['clean_text'] = df_consumer['consumer_complaint'].progress_apply(clean_text)

In [None]:
df_consumer.head()

In [None]:
def predict_label(text, threshold=0.5):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs= model(**inputs)
    logits = outputs.logits.detach().cpu().numpy()[0] # assuming batch size of 1
    probs = softmax(logits)
    probs_scaled = (probs - np.min(probs)) / (np.max(probs) - np.min(probs))
    binary_preds = np.where(probs_scaled >= threshold, 1, 0)
    return binary_preds

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [None]:
df_consumer['predicted_label'] = df_consumer['clean_text'].apply(predict_label)

In [None]:
df_consumer.head()

In [None]:
array = df_consumer['predicted_label'].head(10).tolist()

In [None]:
array

# Implementing with own Transformer

In [12]:
import time
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from transformer import MultilabelSequenceClassificationTransformer, MultilabelLocalAttentionSequenceClassificationTransformer
# Imports
from transformers import AutoTokenizer, TrainingArguments, Trainer, DistilBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification, XLNetForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.xlnet.modeling_xlnet import XLNetForSequenceClassificationOutput
from torch import nn
import random
import torch
import platform
import sys
import sklearn as sk
from typing import Optional, Union, Tuple
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score

In [13]:
spacy_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

In [14]:
def create_word_to_ix(df_train, df_test):
    word_to_ix = {'<pad>': 0, '<unk>': 1}
    for text in pd.concat([df_train["clean_text"], df_test["clean_text"]]):
        for token in spacy_tokenizer(text):
            if token not in word_to_ix:
                word_to_ix[token] = len(word_to_ix)
    return word_to_ix

In [15]:
def encode_text(text, word_to_ix, max_length=128):
    tokens = [t for t in spacy_tokenizer(text)]
    input_ids = [word_to_ix.get(token, word_to_ix['<unk>']) for token in tokens][:max_length]
    input_ids = input_ids + [0] * (max_length - len(input_ids))
    attention_mask = [1 if token_id != 0 else 0 for token_id in input_ids]

    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long),
        'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
    }

In [16]:
word_to_ix = create_word_to_ix(df_train, df_test)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
src_pad_idx = word_to_ix['<pad>']

In [70]:
vocab_size = len(word_to_ix)
MAX_LEN = 512
model = MultilabelLocalAttentionSequenceClassificationTransformer(
            src_vocab_size=vocab_size,
            num_classes=len(emotions),
            src_pad_idx=src_pad_idx,
            emb_size=best_result["emb_size"],
            num_layers=best_result["num_layers"],
            forward_expansion=best_result["forward_expansion"],
            heads=best_result["heads"],
            device=device,
            max_len=MAX_LEN
        ).to(device)
PATH = "training_data\\transformer_scratch\\tr_la_save_dict.pth"
model.load_state_dict(torch.load(PATH))
model.eval()

MultilabelLocalAttentionSequenceClassificationTransformer(
  (encoder): EncoderLocalAttention(
    (word_embedding): Embedding(32477, 512)
    (position_embedding): Embedding(512, 512)
    (layers): ModuleList(
      (0-11): 12 x TransformerLocalAttentionBlock(
        (attention): LocalSelfAttention(
          (values): Linear(in_features=512, out_features=512, bias=True)
          (keys): Linear(in_features=512, out_features=512, bias=True)
          (queries): Linear(in_features=512, out_features=512, bias=True)
          (fc_out): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=4096, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=4096, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0

In [103]:
def predict_label(text, word_to_ix, model, threshold=0.5):
    inputs = encode_text(text,word_to_ix)
    input_ids = inputs['input_ids'].unsqueeze(0).to(device)
    outputs= model(input_ids)
    print(outputs)
    logits = outputs[1].detach().cpu() # assuming batch size of 1
    probs = logits.sigmoid()
    probs = softmax(probs)
    probs_scaled = (probs - np.min(probs)) / (np.max(probs) - np.min(probs))
    print(probs_scaled)
    binary_preds = np.where(probs_scaled >= threshold, 1, 0)
    return binary_preds

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [104]:
text = 'My life sucks'

predict_label(text, word_to_ix, model)

(None, tensor([[-2.3949, -3.1236, -3.1996, -2.6819, -2.3831, -3.5813, -3.2771, -3.0328,
         -4.0401, -3.1618, -2.8680, -3.6086, -4.4933, -3.6042, -4.1691, -2.8212,
         -5.8767, -3.2586, -3.2068, -4.7127, -3.1683, -5.0598, -3.1115, -5.1245,
         -4.3853, -3.4118, -3.6269, -1.0069]], device='cuda:0',
       grad_fn=<AddmmBackward0>))


TypeError: max() received an invalid combination of arguments - got (axis=NoneType, out=NoneType, ), but expected one of:
 * ()
 * (Tensor other)
 * (int dim, bool keepdim)
      didn't match because some of the keywords were incorrect: axis, out
 * (name dim, bool keepdim)
      didn't match because some of the keywords were incorrect: axis, out
