## Evaluation with pretrained model

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd 
import torch
from langdetect import detect, LangDetectException
from tqdm import tqdm
import numpy as np

def detect_language(s):
    try:
        return detect(s)
    except LangDetectException:
        return "unknown"

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
use_gpu = torch.cuda.is_available()
if use_gpu:
    model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")

In [2]:
test = pd.read_csv("./sentiment-analysis-test/data/val_clean.csv")
test['lang'] = test['content'].apply(lambda x: detect_language(x))

In [5]:
predictions = []
for item in tqdm(test.values):
    tokenized = tokenizer(item[0], return_tensors='pt',
            max_length=256,
            padding="max_length",
            truncation=True)
    if use_gpu:
        tokenized = tokenized.to("cuda")
    with torch.no_grad():
        out = model(**tokenized)
    predictions.append(np.argmax(out.logits.detach().cpu().numpy()))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [02:06<00:00, 39.39it/s]


In [4]:
LABEL_TO_ID = {"negative": 0, "neutral": 1, "positive": 2}
true_labels = [LABEL_TO_ID[item] for item in test['sentiment'].tolist()]

In [19]:
#import pickle 
#with open('preds.pkl', 'wb') as f:
#    pickle.dump(predictions, f)

In [20]:
#import pickle 
#with open('preds.pkl', 'rb') as f:
#    predictions = pickle.load(f)
#import pandas as pd 
#from langdetect import detect, LangDetectException
#def detect_language(s):
#    try:
#        return detect(s)
#    except LangDetectException:
#        return "unknown"
#test = pd.read_csv("./sentiment-analysis-test/data/val_clean.csv")
#test['lang'] = test['content'].apply(lambda x: detect_language(x))
#LABEL_TO_ID = {"negative": 0, "neutral": 1, "positive": 2}
#true_labels = [LABEL_TO_ID[item] for item in test['sentiment'].tolist()]

In [5]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(f'Accuracy: {accuracy_score(true_labels, predictions)}')
matrix = confusion_matrix(true_labels, predictions)
acc_classes = matrix.diagonal()/matrix.sum(axis=1)
for i in range(len(acc_classes)):
    print(f"Accucary for {list(LABEL_TO_ID.keys())[i]}: {acc_classes[i]}")

Accuracy: 0.6486
Accucary for negative: 0.7211966900063653
Accucary for neutral: 0.5478753541076488
Accucary for positive: 0.6868990384615384


In [6]:
data_lang_dic = {}
from tqdm import tqdm
for i, item in tqdm(enumerate(test.values)):
    if item[2] not in data_lang_dic.keys():
        data_lang_dic[item[2]] = []
    data_lang_dic[item[2]].append([item[0], item[1], predictions[i]])

5000it [00:00, 572398.06it/s]


In [7]:
table_lang = []
for k,v in data_lang_dic.items():
    true_lab = [LABEL_TO_ID[item[1]] for item in v]
    pred_lab = [item[2] for item in v]
    matrix = confusion_matrix(true_lab, pred_lab)
    #print(f"For the language {k}: Accuracy is: {accuracy_score(true_lab, pred_lab)}")
    #print(f"Accuracy per class is : {matrix.diagonal()/matrix.sum(axis=1)}")
    #print('---------------------------------------------------')
    table_lang.append([k, accuracy_score(true_lab, pred_lab), matrix.diagonal()/matrix.sum(axis=1), len(v)])

  table_lang.append([k, accuracy_score(true_lab, pred_lab), matrix.diagonal()/matrix.sum(axis=1), len(v)])
  table_lang.append([k, accuracy_score(true_lab, pred_lab), matrix.diagonal()/matrix.sum(axis=1), len(v)])
  table_lang.append([k, accuracy_score(true_lab, pred_lab), matrix.diagonal()/matrix.sum(axis=1), len(v)])


In [8]:
table_lang_sorted = sorted(table_lang, key=lambda k: -k[3])

In [9]:
lang_acc_df = pd.DataFrame(table_lang_sorted, columns= ["lang", "accuracy", "acc per class", "size"])
lang_acc_df.head(10)

Unnamed: 0,lang,accuracy,acc per class,size
0,en,0.619448,"[0.7003891050583657, 0.5249169435215947, 0.647...",833
1,ru,0.690076,"[0.6842105263157895, 0.6724890829694323, 0.709...",655
2,id,0.677215,"[0.8018433179723502, 0.5394736842105263, 0.700...",632
3,ar,0.561047,"[0.6607142857142857, 0.4397163120567376, 0.626...",344
4,fr,0.69281,"[0.8787878787878788, 0.40425531914893614, 0.76...",306
5,es,0.662963,"[0.7209302325581395, 0.5, 0.75]",270
6,pt,0.729614,"[0.7926829268292683, 0.5970149253731343, 0.773...",233
7,ko,0.665116,"[0.8409090909090909, 0.5463917525773195, 0.716...",215
8,zh-cn,0.589189,"[0.7647058823529411, 0.3333333333333333, 0.649...",185
9,ja,0.790123,"[0.7777777777777778, 0.6458333333333334, 0.916...",162


In [None]:
lang_acc_df.to_csv("accuracy_per_language_pretrained.csv", index=False)