In [2]:
import json
import pickle
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def load_model(path):
    """Deserialize and return the pickled object stored at ``path``.

    The file is opened in binary mode and closed again before the object is
    returned. Unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.
    """
    with open(path, mode='rb') as stream:
        restored = pickle.load(stream)
    return restored

In [4]:
# Load data
#
# Train and test splits of the 'enriched' configuration of 'sagteam/cedr',
# fetched through `datasets.load_dataset`.
train_df = load_dataset('sagteam/cedr', name='enriched', split='train')
test_df = load_dataset('sagteam/cedr', name='enriched', split='test')

# Emotion lexicon. The cells below iterate over its keys as the emotion
# labels and look lemmas up in its values.
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('../data/emo_lexicon.json', encoding='utf-8') as f:
    emo_lexicon_dict = json.load(f)

Using the latest cached version of the module from /home/aleksandr/.cache/huggingface/modules/datasets_modules/datasets/cedr/3206e7f85b26ab12b6269276b8afcc3bcb11971fca7d7e9154ab2c21dc90dfc0 (last modified on Fri Aug 13 12:30:56 2021) since it couldn't be found locally at /home/aleksandr/Документы/active/CEDR_0_1_1/notebooks/sagteam/cedr/cedr.py, or remotely (ImportError).
Exception ignored in: <function tqdm.__del__ at 0x7f649a87f6a8>
Traceback (most recent call last):
  File "/home/aleksandr/anaconda3/envs/git_cedr_v001/lib/python3.7/site-packages/tqdm/std.py", line 1152, in __del__
    self.close()
  File "/home/aleksandr/anaconda3/envs/git_cedr_v001/lib/python3.7/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'
Reusing dataset cedr (/home/aleksandr/.cache/huggingface/datasets/cedr/enriched/0.1.1/3206e7f85b26ab12b6269276b8afcc3bcb11971fca7d7e9154ab2c21dc90dfc0)
Using the late

In [5]:
# Number of documents in the train split, in the test split, and in both together.
split_sizes = [len(split) for split in (train_df, test_df)]
(*split_sizes, sum(split_sizes))

(7528, 1882, 9410)

In [11]:
# Label statistics for documents whose source contains 'lenta' (train + test).
# n0 counts documents with an empty label list; n1..n5 count documents that
# carry 'joy', 'sadness', 'fear', 'anger' and 'surprise' respectively.
# A document with several labels contributes to each of its counters.
emotions = ('joy', 'sadness', 'fear', 'anger', 'surprise')
tally = [0] * (len(emotions) + 1)

for doc in list(train_df) + list(test_df):
    if 'lenta' not in doc['source']:
        continue
    if doc['labels'] == []:
        tally[0] += 1
    for slot, emotion in enumerate(emotions, start=1):
        if emotion in doc['labels']:
            tally[slot] += 1

n0, n1, n2, n3, n4, n5 = tally
n0, n1, n2, n3, n4, n5

(2162, 185, 89, 115, 112, 188)

In [12]:
# Label statistics for documents whose source contains 'lj' (train + test).
# n0 counts documents with an empty label list; n1..n5 count documents that
# carry 'joy', 'sadness', 'fear', 'anger' and 'surprise' respectively.
# A document with several labels contributes to each of its counters.
emotions = ('joy', 'sadness', 'fear', 'anger', 'surprise')
tally = [0] * (len(emotions) + 1)

for doc in list(train_df) + list(test_df):
    if 'lj' not in doc['source']:
        continue
    if doc['labels'] == []:
        tally[0] += 1
    for slot, emotion in enumerate(emotions, start=1):
        if emotion in doc['labels']:
            tally[slot] += 1

n0, n1, n2, n3, n4, n5 = tally
n0, n1, n2, n3, n4, n5

(1498, 437, 304, 265, 232, 397)

In [13]:
# Label statistics for documents whose source contains 'twitter' (train + test).
# n0 counts documents with an empty label list; n1..n5 count documents that
# carry 'joy', 'sadness', 'fear', 'anger' and 'surprise' respectively.
# A document with several labels contributes to each of its counters.
emotions = ('joy', 'sadness', 'fear', 'anger', 'surprise')
tally = [0] * (len(emotions) + 1)

for doc in list(train_df) + list(test_df):
    if 'twitter' not in doc['source']:
        continue
    if doc['labels'] == []:
        tally[0] += 1
    for slot, emotion in enumerate(emotions, start=1):
        if emotion in doc['labels']:
            tally[slot] += 1

n0, n1, n2, n3, n4, n5 = tally
n0, n1, n2, n3, n4, n5

(116, 1300, 1403, 350, 192, 192)

In [14]:
# Random approach
#
# Baseline: for each emotion, every test document receives a uniformly random
# 0/1 prediction, and the emotion is scored as its own binary task.

print('Random approach:\n')

# A locally seeded generator makes this baseline reproducible from run to run
# without modifying NumPy's global random state. It is created once, outside
# the loop, so each emotion still gets its own draw.
rng = np.random.RandomState(42)

for emo_label in emo_lexicon_dict.keys():
    # Binary ground truth: 1 when the document carries this emotion label.
    true_y = [1 if emo_label in sample['labels'] else 0 for sample in test_df]

    # The emotion label is chosen randomly for each sentence
    pred_y = rng.randint(0, 2, len(test_df))

    # The accuracy of the obtained models is measured with the F1 metric;
    # precision, recall and support are returned as well but not reported.
    _, _, f_micro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="micro")
    _, _, f_macro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="macro")

    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

Random approach:

Emotion "surprise":
mic.: 0.51;	 mac.:0.41

Emotion "fear":
mic.: 0.5;	 mac.:0.39

Emotion "joy":
mic.: 0.51;	 mac.:0.46

Emotion "sadness":
mic.: 0.52;	 mac.:0.47

Emotion "anger":
mic.: 0.47;	 mac.:0.37



In [15]:
# Lexicon approach
#
# Baseline: a test document is predicted positive for an emotion when at least
# one of its lemmas occurs in the lexicon entry for that emotion.

print('Lexicon approach:\n')

# The lemmas of a document do not depend on the emotion being scored, so they
# are extracted once here instead of once per emotion. Sets are enough because
# only the presence of a lemma matters.
test_lemmas = [
    {word['lemma'].lower() for sentence in sample['sentences'] for word in sentence}
    for sample in test_df
]

for emo_label in emo_lexicon_dict.keys():
    # Set membership avoids scanning the whole lexicon entry for every lemma.
    # Assumes each lexicon entry is a collection of lemma strings - TODO confirm.
    lexicon = set(emo_lexicon_dict[emo_label])

    # Binary ground truth: 1 when the document carries this emotion label.
    true_y = [1 if emo_label in sample['labels'] else 0 for sample in test_df]

    # The emotion label is determined by the presence of words from
    # the emotive vocabulary for the corresponding emotion
    pred_y = [0 if lexicon.isdisjoint(lemmas) else 1 for lemmas in test_lemmas]

    # The accuracy of the obtained models is measured with the F1 metric;
    # precision, recall and support are returned as well but not reported.
    _, _, f_micro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="micro")
    _, _, f_macro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="macro")

    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

Lexicon approach:

Emotion "surprise":
mic.: 0.88;	 mac.:0.76

Emotion "fear":
mic.: 0.83;	 mac.:0.68

Emotion "joy":
mic.: 0.83;	 mac.:0.73

Emotion "sadness":
mic.: 0.71;	 mac.:0.62

Emotion "anger":
mic.: 0.76;	 mac.:0.57



In [16]:
# SVM + (TF-IDF)
#
# Baseline: one linear SVM per emotion, trained on character n-gram TF-IDF
# features of the lower-cased document text.

print('SVM(TF-IDF) model:\n')

# The texts and their TF-IDF matrices are the same for every emotion, so they
# are built once here rather than inside the per-emotion loop.
train_texts = [sample['text'].lower() for sample in train_df]
test_texts = [sample['text'].lower() for sample in test_df]

# TF-IDF features
# NOTE(review): the vectorizer is fitted on train AND test texts, so the
# vocabulary and IDF weights see the (unlabelled) test documents. This is kept
# as-is so the printed scores stay comparable; fit on `train_texts` only for a
# strictly held-out evaluation.
vect = TfidfVectorizer(analyzer='char', ngram_range=(4,8))
vect.fit(train_texts + test_texts)

train_x = vect.transform(train_texts)
test_x = vect.transform(test_texts)

# SVM model with linear kernel; the same estimator object is re-fitted for
# each emotion in turn.
model = LinearSVC(random_state=42)

for emo_label in emo_lexicon_dict.keys():
    # Binary targets: 1 when the document carries this emotion label.
    train_y = [1 if emo_label in sample['labels'] else 0 for sample in train_df]
    true_y = [1 if emo_label in sample['labels'] else 0 for sample in test_df]

    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)

    # The accuracy of the obtained models is measured with the F1 metric;
    # precision, recall and support are returned as well but not reported.
    _, _, f_micro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="micro")
    _, _, f_macro, _ = metrics.precision_recall_fscore_support(true_y, pred_y, average="macro")

    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

SVM(TF-IDF) model:

Emotion "surprise":
mic.: 0.93;	 mac.:0.67

Emotion "fear":
mic.: 0.94;	 mac.:0.66

Emotion "joy":
mic.: 0.86;	 mac.:0.67

Emotion "sadness":
mic.: 0.86;	 mac.:0.71

Emotion "anger":
mic.: 0.93;	 mac.:0.5



In [26]:
# Our approach (ensemble + ELMO embedding vectors)

print('Our approach:\n')

# After the adjustment and comparison of the classifiers,
# the final solution is an ensemble of the following five binary classifiers.
#
# NOTE: load_model unpickles these files, and unpickling can execute arbitrary
# code - only load model/data pickles that come from a trusted source.

# 1. A model based on gradient descent and logistic regression methods,
# with preliminary normalization of the data: the ratio of the difference in value and
# its mean to standard deviation
surprise_model = load_model('../models/surprise_model.pkl')

# 2. A model based on stochastic gradient descent with PCA preprocessing of input features
fear_model = load_model('../models/fear_model.pkl')

# 3. A model based on logistic regression
sad_model = load_model('../models/sad_model.pkl')

# 4. A model based on a support vector machine with a linear kernel
joy_model = load_model('../models/joy_model.pkl')

# 5. A model based on logistic regression
anger_model = load_model('../models/anger_model.pkl')

# Explicit emotion -> classifier mapping. An emotion key without a dedicated
# model now raises KeyError instead of silently reusing whatever `model` was
# bound to before.
models_by_emotion = {
    'surprise': surprise_model,
    'fear': fear_model,
    'sadness': sad_model,
    'joy': joy_model,
    'anger': anger_model,
}

# load ELMO embedding vectors
# `df` is indexed by split name below; each sample provides 'vec' and 'labels'.
df = load_model('../data/elmo_vec.pkl')

# The test feature matrix is the same for every emotion, so build it once.
# The models are already fitted, hence no training data is assembled here.
test_x = np.array([sample['vec'] for sample in df['test']])

for emo_label in emo_lexicon_dict.keys():
    model = models_by_emotion[emo_label]

    # Binary ground truth: 1 when the document carries this emotion label.
    test_y = [1 if emo_label in sample['labels'] else 0 for sample in df['test']]

    pred_y = model.predict(test_x)

    # Only the F1 score is reported; precision, recall and support are discarded.
    _, _, f_micro, _ = metrics.precision_recall_fscore_support(test_y, pred_y, average="micro")
    _, _, f_macro, _ = metrics.precision_recall_fscore_support(test_y, pred_y, average="macro")

    print(f'Emotion "{emo_label}":')
    print(f'mic.: {round(f_micro, 2)};\t mac.:{round(f_macro, 2)}\n')

Our approach:

Emotion "surprise":
mic.: 0.93;	 mac.:0.76

Emotion "fear":
mic.: 0.93;	 mac.:0.73

Emotion "joy":
mic.: 0.92;	 mac.:0.87

Emotion "sadness":
mic.: 0.92;	 mac.:0.86

Emotion "anger":
mic.: 0.9;	 mac.:0.62



In [20]:
# Final cell: prints a completion message once execution reaches this point.
print("Successful complete")

Successful complete
