## Data Collection steps

In [12]:
import pandas as pd

# Download and load the train data:
train_data_url = 'https://raw.githubusercontent.com/google-research/google-research/2adf640a14f11025ae5a9d0ec493b78530d276d3/goemotions/data/train.tsv'

# Load the files into dataframes
train_data = pd.read_csv(train_data_url, sep='\t')

# Feature Engineering

In [13]:
# comment will be the only feature
# emotion will be the target (multiple labels)
header = ["comment", "emotion", "id"]
train_data.columns = header
train_data.head(2)

Unnamed: 0,comment,emotion,id
0,"Now if he does off himself, everyone will thin...",27,ed00q6i
1,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj


### Removing instances with more than one emotion

In [14]:
# Remove instances with more than one emotion from each dataset
train_data = train_data[train_data['emotion'].apply(lambda x: len(x.split(',')) == 1)]

### Convert emotion column into integers

In [15]:
# Convert emotion column into integers
train_data['emotion'] = train_data['emotion'].apply(lambda x: ''.join(filter(str.isdigit, str(x)))).astype(int)

### Loading the BERT tokenizer to process the comments

In [16]:
# making features out of the comment column
# we tokenize the comments
from transformers import BertTokenizer

# Load the BERT tokenizer
# we will use the bert-base-uncased tokenizer
# this tokenizer will tokenize the comments
# and convert them into tokens
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### Converting Comments into Tokens

In [17]:
# Tokenize the comments
train_data['tokenized_comments'] = train_data['comment'].apply(
    lambda x: tokenizer.encode(x)
)

train_data.head(2)

Unnamed: 0,comment,emotion,id,tokenized_comments
0,"Now if he does off himself, everyone will thin...",27,ed00q6i,"[101, 2085, 2065, 2002, 2515, 2125, 2370, 1010..."
1,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,"[101, 2339, 1996, 6616, 2003, 3016, 3238, 1116..."


### Here's another way to do it

In [18]:
# Using the tokenizer function

#### PADDING means that we add padding to the tokens
# for example
# if we have the tokens [1, 2, 3, 4, 5]
# and we want to pad them to the length of 10
# we will add 5 padding tokens to the tokens
# so the tokens will look like this: [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]

#### TRUNCATION means that we remove tokens from the tokens
# for example
# if we have the tokens [1, 2, 3, 4, 5]
# and we want to truncate them to the length of 3
# we will remove the last 2 tokens
# so the tokens will look like this: [1, 2, 3]

# Why do we need PADDING and TRUNCATION?
# The BERT model requires that all the input sequences have the same length
# we can achieve this by either padding or truncating the sequences
# we can also use a combination of both
# for example, using both
# we can pad the sequences to a certain length
# and if the sequence is longer than the maximum length
# we can truncate the sequence to the maximum length

#### RETURN_TENSORS means that we want the output to be a PyTorch tensor

tokenized_comments = tokenizer(train_data['comment'].to_list(), padding=True, truncation=True, return_tensors='pt')

In [19]:
#### TOKENIZED_COMMENTS is a dictionary
# it contains the INPUT_IDS, ATTENTION_MASK, and TOKEN_TYPE_IDS
# input_ids are the tokenized comments

#### ATTENTION_MASK is a tensor that has the same length as the input_ids
# it contains 1s where the input_ids are and 0s where the padding tokens are

#### TOKEN_TYPE_IDS is a tensor that has the same length as the input_ids
# it contains 0s where the first sentence is and 1s where the second sentence is
# since we only have one sentence, all the values are 0s

# So it separates the text into sentences?
# Yes, it separates the text into sentences
# but since we only have one sentence, all the values are 0s

tokenized_comments

{'input_ids': tensor([[ 101, 2085, 2065,  ...,    0,    0,    0],
        [ 101, 2339, 1996,  ...,    0,    0,    0],
        [ 101, 2000, 2191,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2062, 2066,  ...,    0,    0,    0],
        [ 101, 5959, 1996,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

### Features and target

In [20]:
from sklearn.model_selection import train_test_split

# These will be the features
X_train = tokenized_comments['input_ids'].numpy()
# And these will be the targets
y_train = train_data['emotion'].values

###########
# We don't split the data into training and testing sets since they are already split

# # Define stratified shuffle split
# from sklearn.model_selection import StratifiedShuffleSplit
# strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# # Split the data into training and testing sets
# for train_index, test_index in strat_split.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

# print("Training set class distribution:")
# print(np.bincount(y_train))
# print("Testing set class distribution:")
# print(np.bincount(y_test))
###########

# So this will be the dataset that we will use
print("X = ", X_train)
print("y = ", y_train)

X =  [[ 101 2085 2065 ...    0    0    0]
 [ 101 2339 1996 ...    0    0    0]
 [ 101 2000 2191 ...    0    0    0]
 ...
 [ 101 2054 2024 ...    0    0    0]
 [ 101 2062 2066 ...    0    0    0]
 [ 101 5959 1996 ...    0    0    0]]
y =  [27  2 14 ...  3 13 17]
