In [1]:
import json
import pickle
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def load_model(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [3]:
# load data

train_df = load_dataset('sagteam/cedr', name='enriched', split='train')
test_df = load_dataset('sagteam/cedr', name='enriched', split='test')

with open('../data/emo_lexicon.json') as f:
    emo_lexicon_dict = json.load(f)

Downloading:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

Downloading and preparing dataset cedr/enriched (download: 1.74 MiB, generated: 5.70 MiB, post-processed: Unknown size, total: 7.44 MiB) to /home/aleksandr/.cache/huggingface/datasets/cedr/enriched/0.1.1/715639b8fdb9faa0063aa2b7b1b5283518253c296619c1646aed66583406acf7...


Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset cedr downloaded and prepared to /home/aleksandr/.cache/huggingface/datasets/cedr/enriched/0.1.1/715639b8fdb9faa0063aa2b7b1b5283518253c296619c1646aed66583406acf7. Subsequent calls will reuse this data.


Reusing dataset cedr (/home/aleksandr/.cache/huggingface/datasets/cedr/enriched/0.1.1/715639b8fdb9faa0063aa2b7b1b5283518253c296619c1646aed66583406acf7)


In [4]:
# Random approach

print('Random approach:\n')

for emo_label in emo_lexicon_dict.keys():
    true_y = []
    
    for sample in test_df:
        if emo_label in sample['labels']:
            true_y.append(1)
        else:
            true_y.append(0)
            
    # The emotion label is chosen randomly for each sentence
    pred_y = np.random.randint(0,2,len(test_df))
    
    # The accuracy of the obtained models is measured with the F1 metric
    p_micro, r_micro, f_micro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="micro")
    p_macro, r_macro, f_macro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="macro")
    
    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

Random approach:

Emotion "surprise":
mic.: 0.51;	 mac.:0.41

Emotion "fear":
mic.: 0.49;	 mac.:0.39

Emotion "joy":
mic.: 0.49;	 mac.:0.44

Emotion "sadness":
mic.: 0.49;	 mac.:0.44

Emotion "anger":
mic.: 0.5;	 mac.:0.39



In [5]:
# Lexicon approach

print('Lexicon approach:\n')

for emo_label in emo_lexicon_dict.keys():
    true_y, pred_y = [], []

    for sample in test_df:
        if emo_label in sample['labels']:
            true_y.append(1)
        else:
            true_y.append(0)
        
        # The emotion label is determined by thepresence of words from 
        # the emotive vocabulary for the corresponding emotion
        sample_lemms = [word['lemma'].lower() for sentence in sample['sentences'] for word in sentence]        
        if any(word in emo_lexicon_dict[emo_label] for word in sample_lemms):
            pred_y.append(1)
        else:
            pred_y.append(0)
    
    # The accuracy of the obtained models is measured with the F1 metric
    p_micro, r_micro, f_micro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="micro")
    p_macro, r_macro, f_macro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="macro")
    
    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

Lexicon approach:

Emotion "surprise":
mic.: 0.88;	 mac.:0.76

Emotion "fear":
mic.: 0.83;	 mac.:0.68

Emotion "joy":
mic.: 0.83;	 mac.:0.73

Emotion "sadness":
mic.: 0.71;	 mac.:0.62

Emotion "anger":
mic.: 0.76;	 mac.:0.57



In [6]:
# SVM + (TF-IDF)

print('SVM(TF-IDF) model:\n')

# TF-IDF features
vect = TfidfVectorizer(analyzer='char', ngram_range=(4,8))
texts_for_fit = [sample['text'].lower() for sample in train_df]
texts_for_fit += [sample['text'].lower() for sample in test_df]
vect.fit(texts_for_fit)

# SVM model with linear kernel
model = LinearSVC(random_state=42)

for emo_label in emo_lexicon_dict.keys():
    train_x, test_x = [], []
    train_y, true_y = [], []
    pred_y = []
    
    for sample in train_df:
        train_x.append(sample['text'].lower())
        
        if emo_label in sample['labels']:
            train_y.append(1)
        else:
            train_y.append(0)

    train_x = vect.transform(train_x)
    model.fit(train_x, train_y)
    
    for sample in test_df:
        test_x.append(sample['text'].lower())
        
        if emo_label in sample['labels']:
            true_y.append(1)
        else:
            true_y.append(0)
    
    test_x = vect.transform(test_x)
    pred_y = model.predict(test_x)
    
    # The accuracy of the obtained models is measured with the F1 metric
    p_micro,r_micro,f_micro,_ = metrics.precision_recall_fscore_support(true_y, pred_y, average="micro")
    p_macro,r_macro,f_macro,_ = metrics.precision_recall_fscore_support(true_y, pred_y, average="macro")
    
    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

SVM(TF-IDF) model:

Emotion "surprise":
mic.: 0.93;	 mac.:0.67

Emotion "fear":
mic.: 0.94;	 mac.:0.66

Emotion "joy":
mic.: 0.86;	 mac.:0.67

Emotion "sadness":
mic.: 0.86;	 mac.:0.71

Emotion "anger":
mic.: 0.93;	 mac.:0.5



In [7]:
# Our approach (ensemble + ELMO embedding vectors)

print('Our approach:\n')

# After the adjustment and comparison of the classifiers, 
# the final solution is an ensemble of the following five binaryclassifiers:

# 1. A model based on gradient descent and logistic regression methods, 
# with preliminarynormalization of the data: the ratio of the difference in value and 
# its mean to standard deviation
surprise_model = load_model('../models/surprise_model.pkl')

# 2. A model based on stochastic gradient descent with PCA preprocessing of input features
fear_model = load_model('../models/fear_model.pkl')

# 3. A model based on logistic regression
sad_model = load_model('../models/sad_model.pkl')

# 4. A model based on a support vector machine with a linear kernel
joy_model = load_model('../models/joy_model.pkl')

# 5. A model based on logistic regression
anger_model = load_model('../models/anger_model.pkl')

# load ELMO embedding vectors
df = load_model('../data/elmo_vec.pkl')

for emo_label in emo_lexicon_dict.keys():
    if emo_label == 'surprise':
        model = surprise_model
    elif emo_label == 'fear':
        model = fear_model
    elif emo_label == 'sadness':
        model = sad_model
    elif emo_label == 'joy':
        model = joy_model
    elif emo_label == 'anger':
        model = anger_model
        
    train_x, train_y = [], []
    test_x, test_y = [], []
    
    for sample in df['train']:
        train_x.append(sample['vec'])
        if emo_label in sample['labels']:
            train_y.append(1)
        else:
            train_y.append(0)

    for sample in df['test']:
        test_x.append(sample['vec'])
        if emo_label in sample['labels']:
            test_y.append(1)
        else:
            test_y.append(0)
    
    pred_y = model.predict(np.array(test_x))

    p_micro,r_micro,f_micro,_ = metrics.precision_recall_fscore_support(test_y, pred_y, average="micro")
    p_macro,r_macro,f_macro,_ = metrics.precision_recall_fscore_support(test_y, pred_y, average="macro")
    
    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

Our approach:

Emotion "surprise":
mic.: 0.93;	 mac.:0.76

Emotion "fear":
mic.: 0.93;	 mac.:0.73

Emotion "joy":
mic.: 0.92;	 mac.:0.87

Emotion "sadness":
mic.: 0.92;	 mac.:0.86

Emotion "anger":
mic.: 0.9;	 mac.:0.62



In [8]:
print("Successful complete")

Successful complete
