## Evaluation with the fine-tuned model

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd 
from langdetect import detect, LangDetectException
from tqdm import tqdm
import numpy as np

def detect_language(s):
    """Return the language code that langdetect assigns to ``s``.

    Args:
        s: Text to classify. Non-string values (for example a NaN coming
            from an empty CSV cell) are accepted and mapped to ``"unknown"``.

    Returns:
        The value returned by ``langdetect.detect(s)``, or the string
        ``"unknown"`` when ``s`` is not a string or langdetect raises
        ``LangDetectException`` (e.g. text with no usable features).

    Note:
        langdetect documents its algorithm as non-deterministic unless
        ``DetectorFactory.seed`` is set, so results on short or ambiguous
        texts may differ between runs.
    """
    # langdetect only understands text; short-circuit here so that a single
    # missing cell cannot abort a whole DataFrame.apply with an unrelated error.
    if not isinstance(s, str):
        return "unknown"
    try:
        return detect(s)
    except LangDetectException:
        return "unknown"

# Path to the fine-tuned checkpoint: change this to wherever the file lives on your machine.
CHECKPOINT_PATH = "sentiment-analysis-test/models/sentiment/model-epoch=9-val_loss=0.40.ckpt"
# Hugging Face identifier used for both the model architecture and the tokenizer.
BASE_MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# map_location="cpu" lets a checkpoint that was saved from a GPU be opened on a
# CPU-only machine; the model is moved to CUDA further down when it is available.
state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu")['state_dict']
# Strip the 'process_model.' prefix from the parameter names so they match the
# bare Hugging Face model (presumably the prefix is the attribute name of the
# training wrapper -- confirm against the training code).
state_dict = {k.replace('process_model.',''):v for k,v in state_dict.items()}
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL_NAME)
# Overwrite the hub weights with the fine-tuned ones.
model.load_state_dict(state_dict)
del state_dict
# This notebook only runs inference; make evaluation mode explicit.
model.eval()
use_gpu = torch.cuda.is_available()
if use_gpu:
    model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

In [2]:
# Load the evaluation split and tag every row with the language detected
# from its 'content' text ("unknown" when detection fails).
val_csv_path = "./sentiment-analysis-test/data/val_clean.csv"
test = pd.read_csv(val_csv_path)
test['lang'] = test['content'].apply(detect_language)

In [3]:
# Predicted class index for every row of `test`, in row order.
predictions = []

# Read the text from the 'content' column by name rather than by position, so
# the loop does not depend on the column order of the CSV.
for text in tqdm(test['content']):
    # One example per forward pass, padded/truncated to 256 tokens.
    tokenized = tokenizer(text, return_tensors='pt',
            max_length=256,
            padding="max_length",
            truncation=True)
    if use_gpu:
        tokenized = tokenized.to("cuda")
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        out = model(**tokenized)
    # Index of the highest logit, stored as a plain Python int.
    predictions.append(int(out.logits.argmax(dim=-1).item()))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:50<00:00, 45.35it/s]


In [4]:
# Integer id for each sentiment label string. The ids are compared directly
# with the model's argmax output, so they are assumed to follow the model's
# class order -- TODO confirm against the training configuration.
LABEL_TO_ID = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}
# Gold labels as ids, in row order; a label missing from LABEL_TO_ID raises KeyError.
true_labels = [LABEL_TO_ID[label] for label in test['sentiment']]

In [19]:
# Optional: cache the predictions on disk so the slow inference loop can be skipped on later runs.
#import pickle
#with open('preds.pkl', 'wb') as f:
#    pickle.dump(predictions, f)

In [20]:
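# Optional: restore the cached predictions and rebuild `test` / `true_labels` without re-running inference.
#import pickle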
#with open('preds.pkl', 'rb') as f:
#    predictions = pickle.load(f)
#import pandas as pd 
#from langdetect import detect, LangDetectException
#def detect_language(s):
#    try:
#        return detect(s)
#    except LangDetectException:
#        return "unknown"
#test = pd.read_csv("./sentiment-analysis-test/data/val_clean.csv")
#test['lang'] = test['content'].apply(lambda x: detect_language(x))
#LABEL_TO_ID = {"negative": 0, "neutral": 1, "positive": 2}
#true_labels = [LABEL_TO_ID[item] for item in test['sentiment'].tolist()]

In [5]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Overall accuracy over the whole evaluation set.
print(f'Accuracy: {accuracy_score(true_labels, predictions)}')

# Pin the label order so that row i of the matrix always belongs to class id i,
# even if some class never appears in the labels or the predictions.
class_ids = list(LABEL_TO_ID.values())
matrix = confusion_matrix(true_labels, predictions, labels=class_ids)

# Per-class accuracy: correctly predicted examples of a class (diagonal)
# divided by the number of true examples of that class (row sum).
# A class with no true examples yields nan instead of a runtime warning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc_classes = matrix.diagonal()/matrix.sum(axis=1)
for label_name, class_acc in zip(LABEL_TO_ID, acc_classes):
    print(f"Accucary for {label_name}: {class_acc}")

Accuracy: 0.846
Accucary for negative: 0.8109484404837684
Accucary for neutral: 0.9031161473087819
Accucary for positive: 0.8185096153846154


In [6]:
# Group the evaluation rows by detected language.
# data_lang_dic maps a language code to a list of [content, sentiment, prediction].
assert len(predictions) == len(test), "predictions must align one-to-one with the rows of test"

data_lang_dic = {}
# Columns are read by name so the grouping does not depend on the CSV column order.
rows = zip(test['lang'], test['content'], test['sentiment'], predictions)
# zip has no length, so pass the total explicitly to get a real progress bar.
for lang, content, sentiment, pred in tqdm(rows, total=len(test)):
    data_lang_dic.setdefault(lang, []).append([content, sentiment, pred])

5000it [00:00, 1233256.10it/s]


In [7]:
# Per-language metrics: one row of [language, accuracy, per-class accuracy, sample count].
# Pin the label order so every per-class array has one entry per class id, in the
# same order, even for languages where some class never occurs.
class_ids = list(LABEL_TO_ID.values())

table_lang = []
for k,v in data_lang_dic.items():
    # Each entry of v is [content, sentiment, prediction].
    true_lab = [LABEL_TO_ID[item[1]] for item in v]
    pred_lab = [item[2] for item in v]
    matrix = confusion_matrix(true_lab, pred_lab, labels=class_ids)
    # Diagonal / row sum = fraction of each true class predicted correctly.
    # A class with no true examples in this language yields nan, without a warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        acc_per_class = matrix.diagonal()/matrix.sum(axis=1)
    table_lang.append([k, accuracy_score(true_lab, pred_lab), acc_per_class, len(v)])

In [8]:
# Order the per-language rows by sample count (index 3), largest first;
# rows with equal counts keep their original relative order.
table_lang_sorted = sorted(table_lang, key=lambda row: row[3], reverse=True)

In [11]:
# Turn the sorted per-language rows into a table and preview the ten largest languages.
column_names = ["lang", "accuracy", "acc per class", "size"]
lang_acc_df = pd.DataFrame(data=table_lang_sorted, columns=column_names)
lang_acc_df.head(10)

Unnamed: 0,lang,accuracy,acc per class,size
0,en,0.814681,"[0.7644787644787645, 0.87, 0.8014705882352942]",831
1,ru,0.942161,"[0.9069767441860465, 0.9826839826839827, 0.929...",657
2,id,0.866455,"[0.8564814814814815, 0.9508928571428571, 0.777...",629
3,ar,0.850144,"[0.8230088495575221, 0.9290780141843972, 0.763...",347
4,fr,0.850993,"[0.8469387755102041, 0.8829787234042553, 0.827...",302
5,es,0.849817,"[0.8181818181818182, 0.8571428571428571, 0.871...",273
6,pt,0.896104,"[0.9012345679012346, 0.9090909090909091, 0.880...",231
7,ko,0.760369,"[0.813953488372093, 0.7272727272727273, 0.7733...",217
8,zh-cn,0.72043,"[0.75, 0.65, 0.7586206896551724]",186
9,ja,0.91358,"[0.8518518518518519, 0.9583333333333334, 0.933...",162


In [None]:
# Persist the per-language table, without the DataFrame index column.
# NOTE(review): the file name says "pretrained" while this notebook's title says
# it evaluates the finetuned model -- confirm this will not overwrite the
# results of a different run.
lang_acc_df.to_csv(
    path_or_buf="accuracy_per_language_pretrained.csv",
    index=False,
)