## Setup

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

import warnings
# Installs a blanket "ignore" filter, so no warning of any category is shown by later cells.
# NOTE(review): this also hides library warnings (e.g. from pandas / sklearn) — consider narrowing it.
warnings.filterwarnings('ignore')

In [7]:
# Column / dtype / null-count overview of the raw test CSV.
# NOTE(review): `cleaned_df_test` is only assigned in the "Load" section further down, so on a
# fresh kernel this cell raises NameError — it should be moved after that section.
cleaned_df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text_a  2800 non-null   object
 1   label   2800 non-null   object
dtypes: object(2)
memory usage: 43.9+ KB


In [9]:
# Peek at the raw pickled model output of the "sg" experiment.
# NOTE(review): `sg_experiment_path` is assigned in the "Load" section further down, so this cell
# only works after that section has been run.
# NOTE(review): pickle.load can execute arbitrary code — only load artefacts you produced yourself.
with open(sg_experiment_path + "/y_pred.pkl", 'rb') as pickle_file:  # `with` closes the handle
    y_pred = pickle.load(pickle_file)

y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

## Load

In [10]:
# Root folder of the word2vec runs; each experiment lives in its own sub-directory.
FOLDER = "../bin/w2v"
# Experiment folders compared in this notebook ("sg" / "cbow" — presumably the skip-gram and
# CBOW word2vec variants; confirm against the training notebooks).
sg_experiment_path = FOLDER + "/exp8"
cbow_experiment_path = FOLDER + "/exp1"

# Raw test split as stored on disk.
cleaned_df_test = pd.read_csv('../data/test.csv')

# Same rows, with the text column exposed under the name `cleaned_text`.
df_test = cleaned_df_test[['text_a', 'label']].rename(columns={'text_a': 'cleaned_text'})

# Text-only frame: the base every per-model prediction table is built on.
df_without_label = df_test[['cleaned_text']]

# Class index <-> label string; the reverse mapping is derived so the two cannot drift apart.
IDX2LABEL = {0: 'no', 1: 'yes'}
LABEL2IDX = {label: idx for idx, label in IDX2LABEL.items()}

def conv_pickle_to_csv(path, base_df=None, idx2label=None):
    """Load pickled model output and return it as a table of label-string predictions.

    Despite the name, nothing is written to disk: the result is a DataFrame.

    Parameters
    ----------
    path : str
        Location of the pickle file holding the model output.
    base_df : pandas.DataFrame, optional
        Frame the predictions are attached to; it must have one row per prediction.
        Defaults to the notebook-level ``df_without_label``.
    idx2label : dict, optional
        Mapping from class index to label string. Defaults to the notebook-level
        ``IDX2LABEL``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``base_df`` with an extra ``'prediction'`` column.

    Raises
    ------
    KeyError
        If a predicted class index is missing from ``idx2label``.
    ValueError
        If the number of predictions does not match the number of rows.
    """
    if base_df is None:
        base_df = df_without_label
    if idx2label is None:
        idx2label = IDX2LABEL

    # NOTE(review): pickle.load can execute arbitrary code — only use on trusted files.
    with open(path, 'rb') as pickle_file:  # context manager so the handle is always closed
        y_pred = np.asarray(pickle.load(pickle_file))

    # 2-D output holds one score per class for every row -> keep the best-scoring class.
    # 1-D output is taken to be already-decoded class indices (argmax over axis=1 would
    # raise on it) — presumably what the model's predict step stored; verify per experiment.
    if y_pred.ndim > 1:
        y_pred = np.argmax(y_pred, axis=1)

    temp_df = base_df.copy()
    temp_df['prediction'] = [idx2label[el] for el in y_pred]
    return temp_df

# Model name -> prediction table (text + predicted label), one entry per experiment folder.
prediction = {
    model_name: conv_pickle_to_csv(experiment_path + "/y_pred.pkl")
    for model_name, experiment_path in (
        ("cbow", cbow_experiment_path),
        ("sg", sg_experiment_path),
    )
}

## Scores

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score

# Reference labels that every model's predictions are scored against.
y_true = df_test['label']

# Metric name -> sklearn function, each invoked as metric(y_true, y_pred).
evaluations = dict(
    accuracy=accuracy_score,
    precision=precision_score,
    recall=recall_score,
    classification_report=classification_report,
    confusion_matrix=confusion_matrix,
)

# cleaned_scores[model_name][metric_name] -> value returned by that metric.
cleaned_scores = {}

for lt, pred in prediction.items():
    cleaned_scores[lt] = {
        eval_name: (
            # Precision / recall are told explicitly that 'yes' is the positive class.
            eval_func(y_true, pred["prediction"], pos_label='yes')
            if eval_name in ('precision', 'recall')
            else eval_func(y_true, pred["prediction"])
        )
        for eval_name, eval_func in evaluations.items()
    }

In [26]:
def summarize_score(scores, printed_evaluation_keys=['accuracy', 'precision', 'recall']):
    """Print the selected metrics for every model in ``scores``.

    Parameters
    ----------
    scores : dict
        Mapping of model name -> {metric name -> metric value}.
    printed_evaluation_keys : list of str
        Metric names to print, in order. ``'confusion_matrix'`` and
        ``'classification_report'`` are printed on their own lines below the metric
        name; every other metric is printed as ``name :  value`` on a single line.
    """
    block_metrics = ('confusion_matrix', 'classification_report')
    for model_name, params in scores.items():
        print(model_name)
        for metric in printed_evaluation_keys:
            if metric not in block_metrics:
                # Scalar metric: name and value share one line.
                print('\t\t', metric, ': ', params[metric])
                continue
            # Multi-line value: header first, then the value starting at column 0.
            print('\t\t', metric)
            print(params[metric])

In [27]:
# Print the default metrics (accuracy, precision, recall) for each model.
summarize_score(cleaned_scores)

cbow
		 accuracy :  0.8357142857142857
		 precision :  0.6926677067082684
		 recall :  0.628005657708628
sg
		 accuracy :  0.835
		 precision :  0.7239488117001828
		 recall :  0.5601131541725601


In [28]:
# Print only the confusion matrix for each model.
# Per sklearn's convention rows are true labels and columns are predicted labels, with labels
# in sorted order — presumably ('no', 'yes') here; confirm against the label values.
summarize_score(cleaned_scores, printed_evaluation_keys=['confusion_matrix'])

cbow
		 confusion_matrix
[[1896  197]
 [ 263  444]]
sg
		 confusion_matrix
[[1942  151]
 [ 311  396]]


## Evaluation

In [34]:
# Inspect the prediction table built for the "sg" experiment (text + predicted label).
prediction["sg"]

Unnamed: 0,cleaned_text,prediction
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,no
...,...,...
2795,ku tenang2 bae ku sih ya corona nya ga depok k...,no
2796,guru hati hati ya virus corona uda indonesia t...,no
2797,4 terawan menyebut virus corona indonesia terd...,yes
2798,realffk buhari can t pronounce corona virus,no


In [31]:
# Column / dtype / null-count overview of the evaluation frame.
# NOTE(review): `df_evaluation` is only created in a later cell, so on a fresh kernel this cell
# raises NameError — it should be moved below that definition.
df_evaluation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   cleaned_text  2800 non-null   object
 1   label         2800 non-null   object
dtypes: object(2)
memory usage: 43.9+ KB


In [35]:
# Text and true label side by side with each model's predicted label (columns "sg" and "cbow").
df_evaluation = df_test.assign(
    sg=prediction["sg"]["prediction"],
    cbow=prediction["cbow"]["prediction"],
)
def wrong_preds(model_name):
    """Return the rows of ``df_evaluation`` that ``model_name`` got wrong.

    Parameters
    ----------
    model_name : str
        Name of the ``df_evaluation`` column holding that model's predictions.

    Returns
    -------
    pandas.DataFrame
        The ``cleaned_text``, ``label`` and ``model_name`` columns of every row whose
        prediction differs from the true label.
    """
    is_wrong = df_evaluation['label'].ne(df_evaluation[model_name])
    return df_evaluation.loc[is_wrong, ['cleaned_text', 'label', model_name]]

In [36]:
# Model name -> frame of the rows that model misclassified.
wrongs = {model_name: wrong_preds(model_name) for model_name in ('cbow', 'sg')}

In [37]:
def print_text_preds(df, model_name, text_col='cleaned_text'):
    """Print one ``<text> -- <prediction>`` line for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the text column and the prediction column.
    model_name : str
        Name of the column holding the predictions to print.
    text_col : str
        Name of the column holding the text (default ``'cleaned_text'``).
    """
    text_values = df[text_col]
    predicted_values = df[model_name]
    row_pairs = zip(text_values, predicted_values)
    for text, preds in row_pairs:
        print(f'{text} -- {preds}')

In [38]:
# List every text the "cbow" model misclassified, followed by the label it predicted.
print_text_preds(wrongs['cbow'], 'cbow')

doakan indonesia selamat virus corona pkb depok gelar nusantara bershalawat -- no
warga depok terganggu isu corona -- no
mantap corona depok permintaan sereh langsung gacoooor pedagang pasar nyiapin stok -- no
tetangga 2 pasien corona depok diimbau rumah 14 14 warga tinggal rumah pasien corona depok jawa barat diimbau aktivitas petugas dinas kesehatan sumber merdeka com -- yes
corona depok puncak gunung es -- no
putraerlangga gubernur jabar ridwankamil virus corona bln hadir kota depok panik beliau warga antisipasi aja -- yes
kegiatan rutinan mds rijalul ansor kabupaten brebes berjalan rutin 1 yg ditempatkan pac dzikir shalawat rutinan mds jg diadakan diskusi mengangkat tema2 aktual masyarakat ansorbanserbrebes -- yes
senam kaya mencegah virus corona ga sih -- no
sing gowo virus corona neng indonesia fix asui -- yes
salah upaya pencegahan corona dr keluarga ku menjemur badan -- no
jrantesalu8 aniesbaswedan permadiaktivis gunromli dennysiregar7 tsamaradki wagimandeep mengumpulkan orang 