In [13]:
import warnings

import pandas as pd
import numpy as np
import sklearn
import nltk
import spacy
import re
import wandb
from datasets import load_dataset
from tqdm import tqdm
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

warnings.filterwarnings("ignore", category=ConvergenceWarning)
wandb.errors.term._show_warnings = False

In [5]:
wandb.login()

True

The dataset that i am using has two configurations:

| Name    | Train | Validation | Test |
|---------|-------|------------|------|
| Split   | 16000 | 2000       | 2000 |
| Unsplit | 416809| n/a        | n/a  |

I will be using both configurations and test with a smaller corpus for training and then a bigger one.

In [7]:
splitted_ds = load_dataset("dair-ai/emotion", "split")
# unsplitted_ds = load_dataset("dair-ai/emotion", "unsplit")

# df_unsplit_train = unsplitted_ds['train'].to_pandas()
df_train = splitted_ds['train'].to_pandas()
df_test = splitted_ds['test'].to_pandas()
df_validation = splitted_ds['validation'].to_pandas()

# Data Fields
The data fields are:

**text**: a string feature.|

**label**: a classification label, with possible values including: 

0 -> sadness

1 -> joy

2 -> love

3 -> anger

4 -> fear

5 -> surprise

In [8]:
df_train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


# The number of data for each label. It can be seen that the data is a little unbalanced in the splitted training dataset. Same story applies to unsplitted dataset

In [9]:
df_train['label'].value_counts()

label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64

In [23]:
df_unsplit_train['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

# SnowballStemmer:

- After processing the word through all these rules, the stemmer produces a stem—a simplified version of the word that represents its core meaning. This stem is not always a valid word in the language but is a useful representation for analysis purposes.
- For example, “running” becomes “run,” “studies” becomes “studi,” and “better” becomes “better” (sometimes the word is already in its simplest form).

In [10]:
import gensim.downloader as api

list(api.info()['models'].keys())

['fasttext-wiki-news-subwords-300',
 'conceptnet-numberbatch-17-06-300',
 'word2vec-ruscorpora-300',
 'word2vec-google-news-300',
 'glove-wiki-gigaword-50',
 'glove-wiki-gigaword-100',
 'glove-wiki-gigaword-200',
 'glove-wiki-gigaword-300',
 'glove-twitter-25',
 'glove-twitter-50',
 'glove-twitter-100',
 'glove-twitter-200',
 '__testing_word2vec-matrix-synopsis']

In [11]:
ss = nltk.stem.snowball.SnowballStemmer("english")
sw = nltk.corpus.stopwords.words('english')
nlp = spacy.load('en_core_web_sm')  # english tokenizer trf -> accuracy | sm -> efficiency
word2vec = api.load("word2vec-google-news-300")  # Load the pretrained Word2Vec model
print("models imported!")

models imported!


In [20]:
def text_preparetion(sentence, nlp):
    # 1. Lowercase everything
    sentence = sentence.lower()

    # 2. Remove all symbols other than a-z@#.
    sentence = re.sub(r"[^a-zăâîșț@# ]", "", sentence)

    # # Tokenize the preprocessed sentence
    tokenization = nlp(sentence)

    # 4. Remove stopwords and empty tokens and split sentence into words
    list_text_preprocessed = [
        word.text for word in tokenization if word.text not in sw and word.pos_ != "SPACE"
    ]
    
    return ' '.join(list_text_preprocessed)


def text_vectorization_word2vec(sentence, model):
    words = sentence.split()
    words_embeddings = [model[word] for word in words if word in model]
    
    # if there are no words in the word2vec
    if not words_embeddings:
        return np.zeros(model.vector_size)
    
    # Average the word vectors to get a single sentece represenation
    return np.mean(words_embeddings, axis=0)

def text_vectorization_word2vec_weighted(sentence, model, train_tfidf_dict):
    words = sentence.split()
    words_embeddings = []
    
    for word in words:
        weight = train_tfidf_dict.get(word, 1.0)
        if word in model:
            words_embeddings.append(weight * model[word])
    
    # if there are no words in the word2vec
    if not words_embeddings:
        return np.zeros(model.vector_size)
    
    # Average the word vectors to get a single sentece represenation
    return np.mean(words_embeddings, axis=0)

tqdm.pandas()

# Preprocessing
df_train['text'] = df_train['text'].progress_apply(lambda x: text_preparetion(x, nlp))
df_test['text'] = df_test['text'].progress_apply(lambda x: text_preparetion(x, nlp))
df_validation['text'] = df_validation['text'].progress_apply(lambda x: text_preparetion(x, nlp))
print("PREPROCESSING!")

# TF-IDF
vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()
X_train_tfidf = vectorizer.fit_transform(df_train['text'])
train_tfidf_dict = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
X_val_tfidf = vectorizer.transform(df_validation['text'])
X_test_tfidf = vectorizer.transform(df_test['text'])
print("TF-IDF!")

# word2vec
df_train['embeddings'] = df_train['text'].progress_apply(lambda x: text_vectorization_word2vec(x, word2vec))
df_test['embeddings'] = df_test['text'].progress_apply(lambda x: text_vectorization_word2vec(x, word2vec))
df_validation['embeddings'] = df_validation['text'].progress_apply(lambda x: text_vectorization_word2vec(x, word2vec))
print("WORD2VEC!")

# weighted word2vec
df_train['weighted_embeddings'] = df_train['text'].progress_apply(lambda x: text_vectorization_word2vec_weighted(x, word2vec, train_tfidf_dict))
df_test['weighted_embeddings'] = df_test['text'].progress_apply(lambda x: text_vectorization_word2vec_weighted(x, word2vec, train_tfidf_dict))
df_validation['weighted_embeddings'] = df_validation['text'].progress_apply(lambda x: text_vectorization_word2vec_weighted(x, word2vec, train_tfidf_dict))
print("WEIGHTED WORD2VEC!")
# df_unsplit_train['embeddings'] = df_unsplit_train['text'].progress_apply(lambda x: text_preparetion(x, word2vec))

100%|██████████| 16000/16000 [00:36<00:00, 444.22it/s]
100%|██████████| 2000/2000 [00:04<00:00, 445.64it/s]
100%|██████████| 2000/2000 [00:04<00:00, 448.50it/s]
100%|██████████| 16000/16000 [00:00<00:00, 61255.47it/s]
100%|██████████| 2000/2000 [00:00<00:00, 60969.92it/s]
100%|██████████| 2000/2000 [00:00<00:00, 61689.11it/s]
100%|██████████| 16000/16000 [00:00<00:00, 39502.55it/s]
100%|██████████| 2000/2000 [00:00<00:00, 36999.86it/s]
100%|██████████| 2000/2000 [00:00<00:00, 36344.37it/s]


In [17]:
# save preprocessed dataset
df_train.to_csv("./data/split_train.csv", index=False)
df_test.to_csv("./data/test.csv", index=False)
df_validation.to_csv("./data/validation.csv", index=False)
# df_unsplit_train.to_csv("./data/unsplit_train.csv", index=False)

In [18]:
# load preprocessed datasets
df_train = pd.read_csv("./data/split_train.csv")
df_test = pd.read_csv("./data/test.csv")
df_validation = pd.read_csv("./data/validation.csv")
# df_unsplit_train = pd.read_csv("./data/unsplit_train.csv")

In [21]:
df_train[:100]

Unnamed: 0,text,label,embeddings,weighted_embeddings
0,nt feel humiliated,0,"[-0.19498698, 0.1408081, 0.061035156, -0.08772...","[-0.7648797, 0.81191665, 0.64800817, -0.506402..."
1,go feeling hopeless damned hopeful around some...,0,"[0.10611979, -0.01570638, 0.005818685, 0.07367...","[0.6533486, -0.23902734, 0.17611901, 0.4855068..."
2,grabbing minute post feel greedy wrong,3,"[0.045369465, 0.06301626, -0.105163574, 0.0296...","[0.41217908, 0.46653095, -0.67341155, 0.289130..."
3,ever feeling nostalgic fireplace know still pr...,2,"[0.12252372, 0.025983538, 0.008736746, 0.06814...","[0.9110772, 0.13422604, 0.13133731, 0.53929806..."
4,feeling grouchy,3,"[0.18334961, 0.21044922, -0.14233398, -0.03942...","[1.0802882, 1.1289341, -0.57756287, 0.03978543..."
...,...,...,...,...
95,feel like throwing away shitty piece shit paper,0,"[0.092681885, 0.009773254, -0.048070908, 0.111...","[0.69135594, 0.035815842, -0.21879485, 0.70745..."
96,starting feel wryly amused banal comedy errors...,1,"[0.048014324, 0.08087158, 0.0011461047, 0.0941...","[0.506407, 0.5669559, 0.09637899, 0.76437205, ..."
97,find every body beautiful want people feel vit...,1,"[0.025824653, -0.019510904, 0.05770535, 0.0679...","[0.063290104, -0.14269626, 0.4970071, 0.387071..."
98,hear owners feel victimized associations assoc...,0,"[-0.024902344, -0.024559868, 0.005533854, 0.00...","[-0.23187801, -0.25392118, 0.14278063, 0.09517..."


In [23]:
X_train = df_train['embeddings'].to_numpy()
X_train = np.vstack(X_train)

Y_train = df_train['label']

X_val = df_validation['embeddings'].to_numpy()
X_val = np.vstack(X_val)
Y_val = df_validation['label']

X_test = df_test['embeddings'].to_numpy()
X_test = np.vstack(X_test)
Y_test = df_test['label']

In [25]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

svm = SVC(verbose=1, probability=True)

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
}

# Use StratifiedKFold for cross-validation
kf = StratifiedKFold(n_splits=4)

# GridSearchCV to find the best parameters
grid_search = GridSearchCV(svm, param_grid, cv=kf, n_jobs=8, scoring="f1_weighted", verbose=1)
grid_search.fit(X_train_tfidf, Y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


KeyboardInterrupt: 

In [None]:
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(X_val)
val_probas = best_model.predict_proba(X_val)

In [45]:
from sklearn.metrics import classification_report

report = classification_report(Y_val, val_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.54      0.80      0.64       550
           1       0.71      0.66      0.68       704
           2       0.68      0.37      0.47       178
           3       0.58      0.53      0.55       275
           4       0.68      0.42      0.52       212
           5       0.72      0.36      0.48        81

    accuracy                           0.62      2000
   macro avg       0.65      0.52      0.56      2000
weighted avg       0.64      0.62      0.61      2000



In [75]:
run = wandb.init(project='Emotion', name="svm-classification2")

In [72]:
labels = df_train.label.unique()
labels.sort()
labels

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [64]:
wandb.sklearn.plot_classifier(best_model,
                              X_train_tfidf, X_val_tfidf,
                              Y_train, Y_val,
                              val_predictions, val_probas,
                              labels,
                              is_binary=False,
                              model_name='SVM')

wandb.finish()

wandb: 
wandb: Plotting SVM.
wandb: Logged feature importances.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision-recall curve.


VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [66]:
Y_val.value_counts()

label
1    704
0    550
3    275
4    212
2    178
5     81
Name: count, dtype: int64

In [68]:
val_probas

array([[9.97940159e-01, 8.94932782e-06, 1.14015911e-05, 9.71676376e-04,
        8.01150403e-04, 2.66663028e-04],
       [9.99007582e-01, 1.04310969e-05, 2.08927982e-05, 3.47123559e-04,
        2.90308696e-04, 3.23661824e-04],
       [4.49186376e-05, 1.07201915e-01, 8.92726273e-01, 1.72892243e-05,
        3.60782549e-06, 5.99659767e-06],
       ...,
       [1.80117452e-04, 9.91505307e-01, 6.46288024e-03, 6.11106504e-04,
        5.20248034e-04, 7.20340519e-04],
       [3.02788136e-04, 6.66461139e-01, 3.32174979e-01, 2.19534473e-04,
        5.96615042e-04, 2.44944195e-04],
       [2.05086383e-03, 9.83388117e-01, 2.39579210e-03, 2.11680342e-03,
        8.02862173e-03, 2.01980226e-03]])

In [76]:
wandb.sklearn.plot_roc(Y_val, val_probas, labels)

In [77]:
wandb.sklearn.plot_learning_curve(best_model, X_train_tfidf, Y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]