# Performance Evaluation for Different Sentiment Analysis systems

This notebook documents the results of evaluating different sentiment analysis systems using the datasets from SemEval 2017 Task 4-A.

In [2]:
import os
import sys
import re
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [3]:

def load_gold_file(input_file):
    '''
    Load the gold sentiment labels from a tab-separated file.

    Line structure is "sentiment <TAB> tweet"; only the sentiment column is
    used. Blank lines (for instance a trailing empty line at the end of the
    file) are skipped.

    Parameters
    ----------
    input_file : str
        Path to the gold file; a leading "~" is expanded to the home directory.

    Returns
    -------
    numpy.ndarray
        One integer per non-blank line: 0 = negative, 1 = neutral, 2 = positive.

    Raises
    ------
    KeyError
        If a line starts with a label other than negative/neutral/positive.
    '''
    senti_dict = {
        'negative':0,
        'neutral':1,
        'positive':2,
    }

    file_path = os.path.expanduser(input_file)
    sentiments = []
    # Only the ASCII label column is consumed, so bytes in the tweet text that
    # are not valid UTF-8 are replaced instead of aborting the whole load.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            if not line.strip():
                # Nothing to label on an empty line.
                continue
            # Strip the label so that a line without a tweet column
            # ("neutral\n") does not carry its newline into the lookup.
            label = line.split('\t', 1)[0].strip()
            sentiments.append(senti_dict[label])
    gold = np.array(sentiments)

    return gold

def load_probabilities_results(input_file):
    '''
    Read a header-less, tab-separated file of per-class scores.

    Each row is read into the three columns negative, neutral and positive
    (in that order). The predicted class of a row is the position of its
    largest value, so 0 = negative, 1 = neutral, 2 = positive.

    Returns
    -------
    (predictions, probabilities)
        predictions is the per-row argmax; probabilities is the table
        converted to a numpy array.
    '''
    class_columns = ["negative", "neutral", "positive"]
    table = pd.read_csv(input_file, sep="\t", header=None, names=class_columns)
    probabilities = table.to_numpy()
    return probabilities.argmax(axis=1), probabilities

def load_predictions_results(input_file):
    '''
    Load predicted and reference labels from a tab-separated results file.

    Each non-blank line holds at least two tab-separated integers: the
    predicted label first, the label stored alongside it second. Blank lines
    (for instance a trailing empty line) are skipped instead of failing the
    integer conversion.

    Parameters
    ----------
    input_file : str
        Path to the results file; a leading "~" is expanded to the home directory.

    Returns
    -------
    (predictions, gold_labels)
        Two numpy arrays built from the first and second column respectively.

    Raises
    ------
    ValueError
        If one of the first two columns is not an integer.
    IndexError
        If a non-blank line has fewer than two columns.
    '''
    file_path = os.path.expanduser(input_file)
    results, labels = [], []
    with open(file_path, "r") as re_file:
        for line in re_file:
            if not line.strip():
                continue
            tokens = line.split('\t')
            # int() tolerates surrounding whitespace, including the newline
            # left on the last column.
            results.append(int(tokens[0]))
            labels.append(int(tokens[1]))
    predictions = np.array(results)
    gold_labels = np.array(labels)

    return predictions, gold_labels

def compute_semeval_metrics(gold, predictions):
    '''
    Print the confusion matrix and three evaluation measures for a
    three-class sentiment prediction (0 = negative, 1 = neutral, 2 = positive).

    The measures are:
      * average recall: mean of the recall of the three classes;
      * F1_pn: mean of the F1 scores of the positive and negative classes
        (the neutral class is not part of this score);
      * accuracy.

    A ratio whose denominator is zero (a class that never occurs in the gold
    labels, or that is never predicted) counts as 0.0 instead of producing NaN.

    Parameters
    ----------
    gold : array-like of int
        Reference labels.
    predictions : array-like of int
        Predicted labels, aligned with gold.

    Returns
    -------
    None
        The results are printed only.
    '''

    def _safe_ratio(numerator, denominator):
        '''Return numerator / denominator, or 0.0 when the denominator is zero.'''
        return numerator / denominator if denominator else 0.0

    def _multilabel_recall(index, cmtx):
        '''
        Recall is defined as the proportion between correctly classified relevant classes and
        all the known relevant classes.
        recall = TP / (TP + FN)
        '''
        true_gold = cmtx.iloc[index, index]
        # Row sum: every gold instance of the class, whatever was predicted.
        all_gold = np.sum(cmtx.iloc[index,:].to_numpy())
        return _safe_ratio(true_gold, all_gold)

    def _multilabel_precision(index, cmtx):
        '''
        Precision is defined as the proportion between correctly classified cases and all the
        cases classified as the class.
        precision = TP / (TP + FP)
        '''
        true_pred = cmtx.iloc[index, index]
        # Column sum: every instance predicted as the class (TP + FP).
        all_pred = np.sum(cmtx.iloc[:,index].to_numpy())
        return _safe_ratio(true_pred, all_pred)

    def _f1(precision, recall):
        '''Harmonic mean of precision and recall; 0.0 when both are zero.'''
        return _safe_ratio(2*precision*recall, precision+recall)

    # Rows are gold labels, columns are predicted labels.
    cmtx = pd.DataFrame(
        confusion_matrix(gold, predictions, labels=[0,1,2]),
        index=['gold:negative', 'gold:neutral', 'gold:positive'],
        columns=['pred:negative', 'pred:neutral', 'pred:positive']
    )

    #accuracy
    acc = accuracy_score(gold, predictions)

    #recall
    negative_recall = _multilabel_recall(0, cmtx)
    neutral_recall = _multilabel_recall(1, cmtx)
    positive_recall = _multilabel_recall(2, cmtx)
    avg_r = (negative_recall + neutral_recall + positive_recall) / 3

    #precision (only needed for the positive and negative F1 scores)
    negative_precision = _multilabel_precision(0, cmtx)
    positive_precision = _multilabel_precision(2, cmtx)

    #f1
    negative_f1 = _f1(negative_precision, negative_recall)
    positive_f1 = _f1(positive_precision, positive_recall)
    f1_pn = (positive_f1 + negative_f1) / 2


    print('*******CONFUSION MATRIX*******')
    print(cmtx)
    print('*******EVALUATION METRICS********')
    print('Average recall: ', avg_r)
    print('F1_pn = ', f1_pn)
    print("Accuracy: ", acc)
    
def evaluate_test_file(test_results_file, mode='probs', gold_file='~/Datasets/semeval-2017/data/clean/test.tsv'):
    '''
    Evaluate a results file against the gold labels and print the metrics.

    Parameters
    ----------
    test_results_file : str
        Path to the file produced by the system under evaluation.
    mode : {'probs', 'preds'}
        'probs': the file holds three tab-separated class scores per line and
        the prediction is the highest-scoring class.
        'preds': the file holds "prediction <TAB> label" integer pairs; the
        label column is compared with the gold file as a sanity check.
    gold_file : str
        Path to the gold file with one "sentiment <TAB> tweet" line per example.

    Raises
    ------
    ValueError
        If mode is neither 'probs' nor 'preds'.
    '''
    # Fail before any file is read, instead of hitting an unbound
    # `predictions` after the gold file was already loaded.
    if mode not in ('probs', 'preds'):
        raise ValueError("Unknown mode {!r}: expected 'probs' or 'preds'.".format(mode))

    gold = load_gold_file(gold_file)
    print("Loaded {} test values.".format(gold.shape[0]))
    if mode == 'probs':
        predictions, _ = load_probabilities_results(test_results_file)
        print("Loaded {} predictions.".format(predictions.shape[0]))
    else:
        predictions, predicted_labels = load_predictions_results(test_results_file)
        print("Loaded {} predictions.".format(predictions.shape[0]))
        # The labels stored next to the predictions should be identical to the
        # gold file; anything below full agreement means the two files are not
        # aligned and the metrics below should not be trusted.
        check = accuracy_score(gold, predicted_labels)
        if check == 1:
            print("The labels match.")
        else:
            print("WARNING: the labels in the results file do not match the gold file "
                  "(agreement: {:.4f}).".format(check))
    compute_semeval_metrics(gold, predictions)

In [None]:
# Directory that holds the result files evaluated in the cells further down.
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
# The trailing slash is kept so that a file name can be appended to it directly.
fulldata_results_path = '/home/rafael/Datasets/semeval/results/fulldata/'

## VADER SemEval-2017

Threshold = 0.05

In [5]:
# os.path.join yields the same path as plain concatenation when the directory
# ends with a slash, and still works when it does not.
vader_results = os.path.join(fulldata_results_path, 'vader_results2017.tsv')

evaluate_test_file(vader_results, mode='preds')

Loaded 12284 test values.
Loaded 12284 predictions.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2182           766           1024
gold:neutral            1271          2663           2003
gold:positive            150           554           1671
*******EVALUATION METRICS********
Average recall:  0.5671558001656213
F1_pn =  0.5243033354657394
Accuracy:  0.5304461087593618


Threshold = 0.1

In [12]:
# os.path.join yields the same path as plain concatenation when the directory
# ends with a slash, and still works when it does not.
vader_01_results = os.path.join(fulldata_results_path, 'vader_results_01.tsv')
evaluate_test_file(vader_01_results, mode='preds')

Loaded 12284 test values.
Loaded 12284 predictions.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2045          1037            890
gold:neutral            1111          3005           1821
gold:positive            134           608           1633
*******EVALUATION METRICS********
Average recall:  0.5695269371170385
F1_pn =  0.5246449835877318
Accuracy:  0.544041028980788


## Experiments with different BERT configurations

#### Different configurations are:

|Model | Init Checkpoint | Pre-train | Seq.Length | Batch Size |
|:--- | :---: | :---: | :---: | :---: |
|**BERT_M1** | cased_base   | No  | 128 | 32 |
|**BERT_M2** | uncased_base | No  | 128 | 32 |
|**BERT_M3** | uncased_base | Yes | 128 | 32 |
|**BERT_M4** | uncased_base | Yes | 64  | 64 |


The experiments were conducted on the UMinho cluster. 

The performance measurements were obtained with the *evaluate_test_file* function, added to the *custom_bert_predict.py* script.

The prediction script outputs a file with the performance measurements. One such file was generated for each of the BERT configurations above. These files are available in the *results-from-server* folder.

#### The output of each file, one per configuration, is given below.

##### BERT_M1.txt
```
***** Predict results *****
Loaded 12284 predictions.
Loaded 12284 test values.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2878           928            166
gold:neutral            1276          3692            969
gold:positive             90           548           1737
*******EVALUATION METRICS********
Average recall:  0.692601106266065
F1_pn =  0.6813384251287284
Accuracy:  0.6762455226310649
```
##### BERT_M2.txt
```
***** Predict results *****
Loaded 12284 test values.
Loaded 12284 predictions.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2979           870            123
gold:neutral            1271          3898            768
gold:positive             63           621           1691
*******EVALUATION METRICS********
Average recall:  0.7061868508225254
F1_pn =  0.7006992300349085
Accuracy:  0.6974926733962878
```
##### BERT_M3.txt
```
***** Predict results *****
Loaded 12284 test values.
Loaded 12284 predictions.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2978           867            127
gold:neutral            1254          3841            842
gold:positive             56           570           1749
*******EVALUATION METRICS********
Average recall:  0.7110430114245548
F1_pn =  0.7039452146491718
Accuracy:  0.6974926733962878
```

##### BERT_M4.txt
```
***** Predict results *****
Loaded 12284 test values.
Loaded 12284 predictions.
The labels match.
*******CONFUSION MATRIX*******
               pred:negative  pred:neutral  pred:positive
gold:negative           2845           936            191
gold:neutral            1128          3741           1068
gold:positive             44           460           1871
*******EVALUATION METRICS********
Average recall:  0.7113898469753331
F1_pn =  0.6959875005243927
Accuracy:  0.6884565288179746
```