## Evaluation with finetuned model on augmented data

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd 
from langdetect import detect, LangDetectException
from tqdm import tqdm
import numpy as np

def detect_language(s):
    """Return the language code langdetect assigns to ``s``, or ``"unknown"``.

    Parameters
    ----------
    s : object
        Text to classify. Anything that is not a non-blank ``str`` (for
        example ``None`` or the NaN pandas uses for an empty CSV cell) is
        reported as ``"unknown"`` instead of raising.

    Returns
    -------
    str
        The code returned by ``langdetect.detect`` or the literal
        ``"unknown"`` when detection is not possible.

    NOTE(review): langdetect is documented as non-deterministic unless
    ``DetectorFactory.seed`` is set; consider seeding it if the per-language
    tables must be reproducible across runs.
    """
    # detect() only accepts strings; a float NaN would raise TypeError, which
    # the LangDetectException handler below would not catch.
    if not isinstance(s, str) or not s.strip():
        return "unknown"
    try:
        return detect(s)
    except LangDetectException:
        return "unknown"

# Path to the fine-tuned checkpoint; change this to its location on your machine.
CHECKPOINT_PATH = "sentiment-analysis-test/models/sentiment/model-epoch=9-val_loss=0.40.ckpt"
# Base architecture / tokenizer the checkpoint weights are loaded into.
BASE_MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only machine; the model is moved to CUDA below when it is available.
state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu")['state_dict']
# Parameter names in the checkpoint carry a 'process_model.' prefix that the
# plain Hugging Face model does not use, so strip it before loading.
state_dict = {k.replace('process_model.', ''): v for k, v in state_dict.items()}
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL_NAME)
model.load_state_dict(state_dict)
del state_dict  # release the extra copy of the weights
# This notebook only runs inference: make evaluation mode explicit (dropout off).
model.eval()
use_gpu = torch.cuda.is_available()
if use_gpu:
    model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

In [15]:
# Read the validation set and tag every row with the language detected
# from its 'content' text (stored in a new 'lang' column).
test = pd.read_csv("./sentiment-analysis-test/data/val_clean.csv")
test['lang'] = test['content'].apply(detect_language)

In [16]:
# Classify every validation text, one at a time, and keep the index of the
# highest-scoring logit as the predicted class id.
predictions = []

# Read the text by column name (as the language-detection cell does) rather
# than by position in `test.values`, so the loop does not silently tokenize
# the wrong column if the CSV column order changes.
for text in tqdm(test['content'].tolist()):
    tokenized = tokenizer(
        text,
        return_tensors='pt',
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    if use_gpu:
        tokenized = tokenized.to("cuda")
    with torch.no_grad():
        out = model(**tokenized)
    # The batch holds a single text, so argmax over the last axis yields one
    # value; store it as a plain Python int.
    predictions.append(int(out.logits.argmax(dim=-1).item()))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:51<00:00, 44.89it/s]


In [17]:
# Mapping from the sentiment label strings in the CSV to integer class ids.
LABEL_TO_ID = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}
# Ground-truth class ids, in the same row order as `test`.
true_labels = [LABEL_TO_ID[label] for label in test['sentiment']]

In [6]:
#import pickle 
#with open('preds_finetuned_augmented.pkl', 'wb') as f:
#    pickle.dump(predictions, f)

In [7]:
#import pickle 
#with open('preds_finetuned_augmented.pkl', 'rb') as f:
#    predictions = pickle.load(f)
#import pandas as pd 
#from langdetect import detect, LangDetectException
#def detect_language(s):
#    try:
#        return detect(s)
#    except LangDetectException:
#        return "unknown"
#test = pd.read_csv("./sentiment-analysis-test/data/val_clean.csv")
#test['lang'] = test['content'].apply(lambda x: detect_language(x))
#LABEL_TO_ID = {"negative": 0, "neutral": 1, "positive": 2}
#true_labels = [LABEL_TO_ID[item] for item in test['sentiment'].tolist()]

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Overall accuracy, followed by per-class accuracy (correct / true count per class).
print(f'Accuracy: {accuracy_score(true_labels, predictions)}')
# Pin the matrix rows/columns to the LABEL_TO_ID ids so that row i always
# belongs to the i-th label name, even if a class never occurs in the data.
matrix = confusion_matrix(true_labels, predictions, labels=list(LABEL_TO_ID.values()))
acc_classes = matrix.diagonal()/matrix.sum(axis=1)
# LABEL_TO_ID iterates in insertion order, which matches the pinned label order.
for label_name, class_acc in zip(LABEL_TO_ID, acc_classes):
    print(f"Accucary for {label_name}: {class_acc}")

Accuracy: 0.8458
Accucary for negative: 0.838319541693189
Accucary for neutral: 0.8906515580736544
Accucary for positive: 0.8052884615384616


In [30]:
# Bucket the rows by language: lang -> list of [content, sentiment, prediction].
# Assumes the columns of `test` are ordered content, sentiment, lang.
data_lang_dic = {}
from tqdm import tqdm
for row_idx, row in tqdm(enumerate(test.values)):
    content, sentiment, lang = row[0], row[1], row[2]
    data_lang_dic.setdefault(lang, []).append([content, sentiment, predictions[row_idx]])

5000it [00:00, 1079559.35it/s]


In [31]:
# Per-language metrics: one row of [lang, accuracy, per-class accuracy, sample count].
table_lang = []
class_ids = list(LABEL_TO_ID.values())
for k, v in data_lang_dic.items():
    true_lab = [LABEL_TO_ID[item[1]] for item in v]
    pred_lab = [item[2] for item in v]
    # Without labels=, a language that lacks one of the classes yields a smaller
    # matrix, so the per-class vector would be shorter and no longer line up
    # with the class ids. Pinning the labels keeps every row at len(class_ids).
    matrix = confusion_matrix(true_lab, pred_lab, labels=class_ids)
    support = matrix.sum(axis=1)
    # A class with no true samples in this language has no defined accuracy:
    # report NaN instead of dividing by zero.
    per_class_acc = np.divide(
        matrix.diagonal(),
        support,
        out=np.full(len(class_ids), np.nan),
        where=support > 0,
    )
    table_lang.append([k, accuracy_score(true_lab, pred_lab), per_class_acc, len(v)])

In [32]:
# Largest languages first; index 3 of each row is its sample count.
table_lang_sorted = sorted(table_lang, key=lambda row: row[3], reverse=True)

In [33]:
# Turn the sorted per-language rows into a table and preview the ten largest languages.
lang_acc_columns = ["lang", "accuracy", "acc per class", "size"]
lang_acc_df = pd.DataFrame(table_lang_sorted, columns=lang_acc_columns)
lang_acc_df.head(10)

Unnamed: 0,lang,accuracy,acc per class,size
0,en,0.814815,"[0.7953667953667953, 0.8557377049180328, 0.787...",837
1,ru,0.93769,"[0.9298245614035088, 0.9826839826839827, 0.902...",658
2,id,0.864865,"[0.8644859813084113, 0.9427312775330396, 0.771...",629
3,ar,0.835735,"[0.8157894736842105, 0.9148936170212766, 0.739...",347
4,fr,0.848185,"[0.8775510204081632, 0.8817204301075269, 0.794...",303
5,es,0.850746,"[0.8390804597701149, 0.8518518518518519, 0.86]",268
6,pt,0.892704,"[0.9036144578313253, 0.8939393939393939, 0.880...",233
7,ko,0.753488,"[0.8222222222222222, 0.6914893617021277, 0.789...",215
8,zh-cn,0.731183,"[0.7941176470588235, 0.6290322580645161, 0.767...",186
9,ja,0.907975,"[0.8703703703703703, 0.9183673469387755, 0.933...",163


In [28]:
# Write the per-language accuracy table to CSV without the index column.
# NOTE(review): the file name says "pretrained" although this notebook's heading
# is about the fine-tuned model -- confirm the intended output path so results
# from another run are not overwritten. This cell's execution count is also
# lower than that of the cells that build lang_acc_df; re-run it after them.
lang_acc_df.to_csv("accuracy_per_language_pretrained.csv", index=False)