In [2]:
import os
import sys
import re
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [3]:

def load_gold_file(input_file):
    '''
    Load the gold sentiment labels from a tab-separated file.

    Each non-blank line is "sentiment<TAB>tweet"; only the first field is read.
    The sentiment names are mapped to class ids: negative -> 0, neutral -> 1,
    positive -> 2.

    Parameters:
        input_file: path to the gold file; a leading "~" is expanded.

    Returns:
        1-D integer numpy array with one class id per non-blank line.

    Raises:
        KeyError: if the first field of a line is not a known sentiment name.
    '''
    senti_dict = {
        'negative': 0,
        'neutral': 1,
        'positive': 2,
    }

    file_path = os.path.expanduser(input_file)
    sentiments = []
    # Only the ASCII label field is consumed, so decode as UTF-8 regardless of the
    # locale and let undecodable bytes in the tweet text be replaced instead of
    # aborting the whole load.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line_number, line in enumerate(file, start=1):
            # Blank lines (typically a trailing newline at end of file) carry no label.
            if not line.strip():
                continue
            # strip() removes the newline that stays attached when a line has no tab.
            label = line.split('\t', 1)[0].strip()
            if label not in senti_dict:
                raise KeyError(
                    "Unknown sentiment label {!r} on line {} of {}".format(
                        label, line_number, file_path))
            sentiments.append(senti_dict[label])

    # dtype=int keeps an integer array even when the file has no labels.
    return np.array(sentiments, dtype=int)

def load_probabilities_results(input_file):
    '''
    Load per-class probabilities from a headerless tab-separated file.

    The columns are read, in order, as negative, neutral and positive.

    Returns:
        (predictions, probabilities): predictions holds, for every row, the
        column index of the largest value; probabilities is the table as a
        numpy array.
    '''
    class_columns = ["negative", "neutral", "positive"]
    table = pd.read_csv(input_file, sep="\t", header=None, names=class_columns)
    probabilities = table.to_numpy()
    # The column order matches the class ids, so the argmax index is the predicted class.
    predictions = np.argmax(probabilities, axis=1)
    return predictions, probabilities

def load_predictions_results(input_file):
    '''
    Load predicted and gold class ids from a tab-separated results file.

    Each non-blank line is "prediction<TAB>label", both integers; any further
    fields are ignored.

    Parameters:
        input_file: path to the results file; a leading "~" is expanded.

    Returns:
        (predictions, gold_labels): two 1-D integer numpy arrays of equal length.

    Raises:
        ValueError: if a non-blank line does not start with two integer fields.
    '''
    file_path = os.path.expanduser(input_file)
    results, labels = [], []
    with open(file_path, "r") as results_file:
        for line_number, line in enumerate(results_file, start=1):
            # Blank lines (typically a trailing newline at end of file) carry no data.
            if not line.strip():
                continue
            tokens = line.split('\t')
            try:
                # int() tolerates surrounding whitespace, including the trailing newline.
                prediction, label = int(tokens[0]), int(tokens[1])
            except (ValueError, IndexError):
                raise ValueError(
                    "Line {} of {} is not 'prediction<TAB>label': {!r}".format(
                        line_number, file_path, line))
            results.append(prediction)
            labels.append(label)

    # dtype=int keeps integer arrays even when the file has no rows.
    predictions = np.array(results, dtype=int)
    gold_labels = np.array(labels, dtype=int)

    return predictions, gold_labels

def compute_semeval_metrics(gold, predictions):
    '''
    Print the confusion matrix and evaluation metrics for 3-class sentiment predictions.

    Class ids are 0 = negative, 1 = neutral, 2 = positive. The printed metrics are:
      * Average recall: mean of the recall of the three classes.
      * F1_pn: mean of the F1 scores of the positive and negative classes.
      * Accuracy: fraction of predictions equal to the gold label.

    A ratio whose denominator is zero (a class with no gold examples, a class
    that is never predicted, or precision + recall == 0) is reported as 0.0
    instead of propagating nan/inf into the averages.

    Parameters:
        gold: array-like of gold class ids.
        predictions: array-like of predicted class ids, same length as gold.

    Returns:
        None; the results are printed.
    '''

    def _safe_ratio(numerator, denominator):
        '''Return numerator / denominator, or 0.0 when the denominator is zero.'''
        return numerator / denominator if denominator else 0.0

    def _multilabel_recall(index, cmtx):
        '''
        Recall is defined as the proportion between correctly classified relevant classes and
        all the known relevant classes.
        recall = TP / (TP + FN)
        '''
        true_gold = cmtx.iloc[index, index]
        # Row `index` holds every example whose gold class is `index`.
        all_gold = np.sum(cmtx.iloc[index, :].to_numpy())
        return _safe_ratio(true_gold, all_gold)

    def _multilabel_precision(index, cmtx):
        '''
        Precision is defined as the proportion between correctly classified cases and all the
        cases classified as the class.
        precision = TP / (TP + FP)
        '''
        true_pred = cmtx.iloc[index, index]
        # Column `index` holds every example predicted as class `index`.
        all_pred = np.sum(cmtx.iloc[:, index].to_numpy())
        return _safe_ratio(true_pred, all_pred)

    def _f1(precision, recall):
        '''Harmonic mean of precision and recall.'''
        return _safe_ratio(2 * precision * recall, precision + recall)

    # Rows are gold classes, columns are predicted classes.
    cmtx = pd.DataFrame(
        confusion_matrix(gold, predictions, labels=[0, 1, 2]),
        index=['gold:negative', 'gold:neutral', 'gold:positive'],
        columns=['pred:negative', 'pred:neutral', 'pred:positive']
    )

    #accuracy
    acc = accuracy_score(gold, predictions)

    #recall
    negative_recall = _multilabel_recall(0, cmtx)
    neutral_recall = _multilabel_recall(1, cmtx)
    positive_recall = _multilabel_recall(2, cmtx)
    avg_r = (negative_recall + neutral_recall + positive_recall) / 3

    #precision (the neutral class is not part of F1_pn, so it is not needed)
    negative_precision = _multilabel_precision(0, cmtx)
    positive_precision = _multilabel_precision(2, cmtx)

    #f1
    negative_f1 = _f1(negative_precision, negative_recall)
    positive_f1 = _f1(positive_precision, positive_recall)
    f1_pn = (positive_f1 + negative_f1) / 2

    print('*******CONFUSION MATRIX*******')
    print(cmtx)
    print('*******EVALUATION METRICS********')
    print('Average recall: ', avg_r)
    print('F1_pn = ', f1_pn)
    print("Accuracy: ", acc)
    
def evaluate_test_file(test_results_file, mode='probs', gold_file='~/Datasets/semeval-2017/data/clean/test.tsv'):
    '''
    Load a results file and the gold labels, then print the evaluation metrics.

    Parameters:
        test_results_file: path to the results file to evaluate.
        mode: 'probs' if the file holds one row of class probabilities per example
            (read with load_probabilities_results), or 'preds' if it holds
            "prediction<TAB>label" rows (read with load_predictions_results).
        gold_file: path to the gold file read with load_gold_file.

    Raises:
        ValueError: if mode is neither 'probs' nor 'preds'.
    '''
    # Reject an unknown mode before any file is read; otherwise `predictions`
    # would never be assigned and the failure would be an UnboundLocalError.
    if mode not in ('probs', 'preds'):
        raise ValueError("Unknown mode {!r}; expected 'probs' or 'preds'.".format(mode))

    gold = load_gold_file(gold_file)
    print("Loaded {} test values.".format(gold.shape[0]))
    if mode == 'probs':
        predictions, _ = load_probabilities_results(test_results_file)
        print("Loaded {} predictions.".format(predictions.shape[0]))
    else:
        predictions, predicted_labels = load_predictions_results(test_results_file)
        print("Loaded {} predictions.".format(predictions.shape[0]))
        # Sanity check: the labels stored next to the predictions should be the gold labels.
        check = accuracy_score(gold, predicted_labels)
        if check == 1:
            print("The labels match.")
        else:
            print("WARNING: labels in the results file agree with the gold file "
                  "on only {:.2%} of the rows.".format(check))
    compute_semeval_metrics(gold, predictions)

In [None]:
# Directory that holds the per-run result files. Set the SEMEVAL_RESULTS_DIR
# environment variable to use another location; the default is unchanged.
# Joining with '' guarantees a trailing separator, so expressions of the form
# `fulldata_results_path + 'name.tsv'` keep producing valid paths.
fulldata_results_path = os.path.join(
    os.environ.get('SEMEVAL_RESULTS_DIR', '/home/rafael/Datasets/semeval/results/fulldata/'),
    '',
)

### All runs use the BERT-base architecture

## Uncased BERT fine-tuning (FT) with full test data

Max seq: 128

Batch size: 32

In [19]:
# Uncased BERT run. os.path.join does not depend on the directory string ending in '/'.
test_results = os.path.join(fulldata_results_path, 'bert_uncased.tsv')
# Default mode 'probs': the file is read as per-class probabilities.
evaluate_test_file(test_results)

Loaded 12284 test values.
Loaded 12284 predictions.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           3030           806            136
gold:neutral            1315          3705            917
gold:positive             63           542           1770
*******EVALUATION METRICS********
Average recall:  0.7107185296142169
F1_pn =  0.7020907619141197
Accuracy:  0.6923640507977857


## Cased BERT fine-tuning (FT) on full text data with text cleaning

Max seq: 128

Batch size: 32

In [50]:
# Cased BERT run on cleaned text. os.path.join does not depend on the directory string ending in '/'.
cased_fulldata_clean = os.path.join(fulldata_results_path, 'bert_cased_clean.tsv')
# Default mode 'probs': the file is read as per-class probabilities.
evaluate_test_file(cased_fulldata_clean)

Loaded 12284 predictions.
Loaded 12284 test values.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2878           928            166
gold:neutral            1276          3692            969
gold:positive             90           548           1737
*******EVALUATION METRICS********
Average recall:  0.692601106266065
F1_pn =  0.6813384251287284
Accuracy:  0.6762455226310649


## Uncased BERT fine-tuning (FT) on full clean data

Max seq: 128

Batch size: 32

In [3]:
# Uncased BERT run on cleaned text. os.path.join does not depend on the directory string ending in '/'.
uncased_fulldata_clean = os.path.join(fulldata_results_path, 'bert_uncased_clean.tsv')
# Default mode 'probs': the file is read as per-class probabilities.
evaluate_test_file(uncased_fulldata_clean)

Loaded 12284 predictions.
Loaded 12284 test values.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2529          1102            341
gold:neutral            1129          3572           1236
gold:positive             88           625           1662
*******EVALUATION METRICS********
Average recall:  0.6460490292146263
F1_pn =  0.6237211639025192
Accuracy:  0.6319602735265386


### Run2

Max seq: 64

Batch size: 64

In [14]:
# Second uncased BERT run on cleaned text. os.path.join does not depend on the directory string ending in '/'.
uncased_full_data_clean_run2 = os.path.join(fulldata_results_path, 'bert_uncased_clean_run2.tsv')
# mode='preds': the file holds "prediction<TAB>label" rows, whose labels are checked against the gold file.
evaluate_test_file(uncased_full_data_clean_run2, mode='preds')

Loaded 12284 predictions.
Loaded 12284 test values.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2504          1141            327
gold:neutral            1099          3788           1050
gold:positive             80           675           1620
*******EVALUATION METRICS********
Average recall:  0.6501836099418089
F1_pn =  0.6286701298019146
Accuracy:  0.6440898730055357


## Uncased BERT in-task pre-training (ITPT) and fine-tuning (FT) on full clean data

Max seq: 128

Batch size: 32

In [18]:
# In-task pre-trained uncased BERT run. os.path.join does not depend on the directory string ending in '/'.
bert_itpt_ftfull = os.path.join(fulldata_results_path, 'bert_itpt_uncased_clean.tsv')
# Default mode 'probs': the file is read as per-class probabilities.
evaluate_test_file(bert_itpt_ftfull)

Loaded 12284 test values.
Loaded 12284 predictions.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           3009           809            154
gold:neutral            1312          3750            875
gold:positive             65           588           1722
*******EVALUATION METRICS********
Average recall:  0.704745879704245
F1_pn =  0.6959488093160748
Accuracy:  0.6904102898078802


In [15]:
# Second in-task pre-trained uncased BERT run. os.path.join does not depend on the directory string ending in '/'.
bert_itpt_ftfull_run2 = os.path.join(fulldata_results_path, 'bert_itpt_uncased_clean_run2.tsv')
# mode='preds': the file holds "prediction<TAB>label" rows, whose labels are checked against the gold file.
evaluate_test_file(bert_itpt_ftfull_run2, mode='preds')

Loaded 12284 predictions.
Loaded 12284 test values.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2568          1101            303
gold:neutral            1152          3776           1009
gold:positive             95           710           1570
*******EVALUATION METRICS********
Average recall:  0.6478632549777826
F1_pn =  0.6284298230573535
Accuracy:  0.6442526864213611


## VADER SemEval-2017

Threshold = 0.05

In [5]:
# VADER results file. os.path.join does not depend on the directory string ending in '/'.
vader_results = os.path.join(fulldata_results_path, 'vader_results2017.tsv')

# mode='preds': the file holds "prediction<TAB>label" rows, whose labels are checked against the gold file.
evaluate_test_file(vader_results, mode='preds')

Loaded 12284 test values.
Loaded 12284 predictions.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2182           766           1024
gold:neutral            1271          2663           2003
gold:positive            150           554           1671
*******EVALUATION METRICS********
Average recall:  0.5671558001656213
F1_pn =  0.5243033354657394
Accuracy:  0.5304461087593618


Threshold = 0.1

In [12]:
# Second VADER results file. os.path.join does not depend on the directory string ending in '/'.
vader_01_results = os.path.join(fulldata_results_path, 'vader_results_01.tsv')
# mode='preds': the file holds "prediction<TAB>label" rows, whose labels are checked against the gold file.
evaluate_test_file(vader_01_results, mode='preds')

Loaded 12284 test values.
Loaded 12284 predictions.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2045          1037            890
gold:neutral            1111          3005           1821
gold:positive            134           608           1633
*******EVALUATION METRICS********
Average recall:  0.5695269371170385
F1_pn =  0.5246449835877318
Accuracy:  0.544041028980788
