In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

import json
from glob import glob
from pathlib import Path

In [2]:
# Collect every submission file: ../run_submissions/<LANG>/<model>/<file>.tsv
# The original "**/**" pattern was not recursive (glob only recurses with
# recursive=True), so it matched exactly two directory levels; "*/*" says so
# explicitly. glob() returns entries in arbitrary, OS-dependent order, so sort
# to keep filepaths[0] and all downstream results reproducible.
filepaths = sorted(Path(v) for v in glob("../run_submissions/*/*/*.tsv"))
filepaths

[WindowsPath('../run_submissions/DE/bert-base-german-cased/3Idiots_german_1_run_3.tsv'),
 WindowsPath('../run_submissions/DE/bert-base-german-cased/3Idiots_german_2_run_3.tsv'),
 WindowsPath('../run_submissions/DE/bert-base-multilingual-cased/3Idiots_german_1_run_1.tsv'),
 WindowsPath('../run_submissions/DE/bert-base-multilingual-cased/3Idiots_german_1_run_2.tsv'),
 WindowsPath('../run_submissions/DE/bert-base-multilingual-cased/3Idiots_german_2_run_1.tsv'),
 WindowsPath('../run_submissions/DE/bert-base-multilingual-cased/3Idiots_german_2_run_2.tsv'),
 WindowsPath('../run_submissions/EN/bert-base-cased/3Idiots_english_1_run_1.tsv'),
 WindowsPath('../run_submissions/EN/bert-base-cased/3Idiots_english_2_run_1.tsv'),
 WindowsPath('../run_submissions/EN/bert-base-cased/3Idiots_english_3_run_1.tsv'),
 WindowsPath('../run_submissions/EN/bert-base-uncased/3Idiots_english_1_run_2.tsv'),
 WindowsPath('../run_submissions/EN/bert-base-uncased/3Idiots_english_1_run_3.tsv'),
 WindowsPath('../run_su

In [3]:
# Gold-label files, one per language (<language>_data.tsv).
# Sorted because glob() order is OS-dependent and later cells rely on
# test_filepaths[0] as well as on the concatenation order of these files.
test_filepaths = sorted(Path(v) for v in glob("../data/raw/test_data_gold/*.tsv"))
test_filepaths

[WindowsPath('../data/raw/test_data_gold/english_data.tsv'),
 WindowsPath('../data/raw/test_data_gold/german_data.tsv'),
 WindowsPath('../data/raw/test_data_gold/hindi_data.tsv')]

In [4]:
# Preview of the language key used below: the part of the gold file name
# that precedes the first underscore.
test_filepaths[0].name.partition("_")[0]

'english'

In [5]:
# Gold labels for all languages in one frame indexed by text_id.
# Each file is keyed by the language prefix of its name; the key becomes the
# "language" column, and the raw "text" column is dropped since only the labels
# are used for scoring.
df_test = (
    pd.concat(
        {
            gold_path.name.split("_")[0]: pd.read_csv(gold_path, sep="\t")
            for gold_path in test_filepaths
        },
        axis=0,
        sort=True,
        names=["language"],
    )
    .reset_index("language")
    .set_index("text_id")
    .drop(columns="text")
)
df_test.head()

Unnamed: 0_level_0,language,task_1,task_2,task_3
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hasoc_en_902,english,NOT,NONE,NONE
hasoc_en_416,english,NOT,NONE,NONE
hasoc_en_207,english,NOT,NONE,NONE
hasoc_en_595,english,NOT,NONE,NONE
hasoc_en_568,english,HOF,PRFN,UNT


In [6]:
# Preview of the (language, subtask, run_id) key parsed from a submission file
# name: strip the ".tsv" suffix, split on the first three underscores and drop
# the leading team-name part.
filepaths[0].name[: -len(".tsv")].split("_", maxsplit=3)[1:]

['german', '1', 'run_3']

In [7]:
# Load every submission into one long frame keyed by (language, subtask, run_id).
# File stems look like "<team>_<language>_<subtask>_<run_id>",
# e.g. "3Idiots_german_1_run_3" -> ("german", "1", "run_3").
submission_frames = {}
for filepath in filepaths:
    submission_key = tuple(filepath.stem.split("_", 3)[1:])
    if submission_key in submission_frames:
        # Files with the same name in different model folders would map to the
        # same key and silently overwrite each other; fail loudly instead.
        raise ValueError(f"Duplicate submission {submission_key}: {filepath}")
    submission_frames[submission_key] = pd.read_csv(
        filepath, sep="\t", names=["text_id", "preds"], header=0
    ).set_index("text_id")

df_submissions_long = pd.concat(
    submission_frames,
    axis=0,
    sort=True,
    names=["language", "subtask", "run_id"],
).reset_index(["language", "subtask", "run_id", "text_id"])

# Sanity check: number of predictions per label for every submitted run.
with pd.option_context("display.width", 200):
    display(
        df_submissions_long.pivot_table(
            index=["language", "subtask", "run_id"],
            columns="preds",
            aggfunc=len,
        ).fillna(0).astype(int)
    )

# Reshape to one row per (text_id, run) with one prediction column per subtask.
df_submissions = df_submissions_long.set_index(
    ["language", "text_id", "run_id", "subtask"]
).unstack(["subtask"])
# After unstacking, the columns are ("preds", <subtask>) pairs. Name them after
# the subtasks actually present instead of assuming exactly subtasks 1-3.
df_submissions.columns = [
    f"task_{subtask}_preds" for _, subtask in df_submissions.columns
]
df_submissions = df_submissions.reset_index(["language", "run_id"])
df_submissions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text_id,text_id,text_id,text_id,text_id,text_id,text_id,text_id
Unnamed: 0_level_1,Unnamed: 1_level_1,preds,HATE,HOF,NONE,NOT,OFFN,PRFN,TIN,UNT
language,subtask,run_id,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
english,1,run_1,0,408,0,745,0,0,0,0
english,1,run_2,0,368,0,785,0,0,0,0
english,1,run_3,0,419,0,734,0,0,0,0
english,2,run_1,235,0,745,0,44,129,0,0
english,2,run_2,220,0,785,0,38,110,0,0
english,2,run_3,245,0,734,0,56,118,0,0
english,3,run_1,0,0,745,0,0,0,339,69
english,3,run_2,0,0,785,0,0,0,365,3
english,3,run_3,0,0,734,0,0,0,348,71
german,1,run_1,0,89,0,761,0,0,0,0


Unnamed: 0_level_0,language,run_id,task_1_preds,task_2_preds,task_3_preds
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hasoc_en1_1028,english,run_1,NOT,NONE,NONE
hasoc_en1_1028,english,run_2,NOT,NONE,NONE
hasoc_en1_1028,english,run_3,NOT,NONE,NONE
hasoc_en1_1037,english,run_1,NOT,NONE,NONE
hasoc_en1_1037,english,run_2,NOT,NONE,NONE


In [8]:
# Attach the gold labels to every prediction row. Both frames are indexed by
# text_id, so the join aligns on that index.
df_joined = df_submissions.join(
    df_test,
    rsuffix="_test",  # suffix for df_test columns whose names clash (language)
    how="left",       # keep every prediction row, matched or not
)
df_joined.head()

Unnamed: 0_level_0,language,run_id,task_1_preds,task_2_preds,task_3_preds,language_test,task_1,task_2,task_3
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hasoc_de_5270,german,run_1,HOF,OFFN,,german,HOF,HATE,
hasoc_de_5270,german,run_2,NOT,NONE,,german,HOF,HATE,
hasoc_de_5270,german,run_3,NOT,NONE,,german,HOF,HATE,
hasoc_de_5271,german,run_1,NOT,NONE,,german,NOT,NONE,
hasoc_de_5271,german,run_2,NOT,NONE,,german,NOT,NONE,


In [9]:
# Score every (language, task, run) combination against the gold labels.
# `reports` maps (language, task, run_id) -> classification report as a DataFrame
# (rows: precision / recall / f1-score / support, columns: labels and averages).
reports = {}

for language in df_joined.language.unique():
    for task in range(1, 4):
        true_col = f"task_{task}"
        pred_col = f"task_{task}_preds"
        df_t = df_joined.loc[
            df_joined.language == language, ["run_id", true_col, pred_col]
        ]
        for run_id in df_t.run_id.unique():
            # Rows missing either the gold label or the prediction are dropped.
            df_tt = df_t[df_t.run_id == run_id].dropna()
            print(f"language={language}, task={task}, run_id={run_id}")
            if df_tt.empty:
                print("No data found")
                continue
            report = classification_report(
                df_tt[true_col], df_tt[pred_col], output_dict=True
            )
            print(
                classification_report(
                    df_tt[true_col], df_tt[pred_col], output_dict=False
                )
            )
            reports[(language, task, run_id)] = pd.DataFrame(report)

# The per-run reports have different label columns depending on the task.
# Pass sort=True explicitly (as the other concat calls in this notebook do) so
# the column order stays alphabetical across pandas versions and the
# "sorting because non-concatenation axis is not aligned" FutureWarning goes away.
df_reports = pd.concat(reports, sort=True)
df_reports.head()

language=german, task=1, run_id=run_1
              precision    recall  f1-score   support

         HOF       0.34      0.22      0.27       136
         NOT       0.86      0.92      0.89       714

   micro avg       0.81      0.81      0.81       850
   macro avg       0.60      0.57      0.58       850
weighted avg       0.78      0.81      0.79       850

language=german, task=1, run_id=run_2
              precision    recall  f1-score   support

         HOF       0.43      0.17      0.24       136
         NOT       0.86      0.96      0.90       714

   micro avg       0.83      0.83      0.83       850
   macro avg       0.64      0.56      0.57       850
weighted avg       0.79      0.83      0.80       850

language=german, task=1, run_id=run_3
              precision    recall  f1-score   support

         HOF       0.30      0.10      0.14       136
         NOT       0.85      0.96      0.90       714

   micro avg       0.82      0.82      0.82       850
   macro avg  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



No data found
language=german, task=3, run_id=run_2
No data found
language=german, task=3, run_id=run_3
No data found
language=english, task=1, run_id=run_1
              precision    recall  f1-score   support

         HOF       0.54      0.77      0.64       288
         NOT       0.91      0.78      0.84       865

   micro avg       0.78      0.78      0.78      1153
   macro avg       0.73      0.78      0.74      1153
weighted avg       0.82      0.78      0.79      1153

language=english, task=1, run_id=run_2
              precision    recall  f1-score   support

         HOF       0.57      0.73      0.64       288
         NOT       0.90      0.82      0.86       865

   micro avg       0.79      0.79      0.79      1153
   macro avg       0.73      0.77      0.75      1153
weighted avg       0.82      0.79      0.80      1153

language=english, task=1, run_id=run_3
              precision    recall  f1-score   support

         HOF       0.54      0.78      0.64       288
 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,HATE,HOF,NONE,NOT,OFFN,PRFN,TIN,UNT,macro avg,micro avg,weighted avg
english,1,run_1,f1-score,,0.635057,,0.842236,,,,,0.738647,0.779705,0.790486
english,1,run_1,precision,,0.541667,,0.910067,,,,,0.725867,0.779705,0.818047
english,1,run_1,recall,,0.767361,,0.783815,,,,,0.775588,0.779705,0.779705
english,1,run_1,support,,288.0,,865.0,,,,,1153.0,1153.0,1153.0
english,1,run_2,f1-score,,0.637195,,0.855758,,,,,0.746476,0.793582,0.801164


In [10]:
# Summary tables: one table per metric, rows = (language, run_id, metric),
# columns = averaging scheme x task.
# Index levels of df_reports by position: 0 = language, 1 = task, 2 = run_id,
# 3 = metric name.
# NOTE(review): the "micro avg" column is only present if the installed
# scikit-learn version emits it in classification_report - confirm before upgrading.
metric_level = df_reports.index.get_level_values(3)

# Use the fully-qualified option name: the abbreviated "precision" pattern is
# ambiguous on newer pandas versions and raises OptionError there.
with pd.option_context("display.precision", 4):
    for metric in ["precision", "recall", "f1-score"]:
        display(
            df_reports.loc[
                metric_level == metric,
                ["macro avg", "micro avg", "weighted avg"],
            ].unstack(1)
        )

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,macro avg,macro avg,macro avg,micro avg,micro avg,micro avg,weighted avg,weighted avg,weighted avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,1,2,3,1,2,3,1,2,3
english,run_1,precision,0.7259,0.5025,0.4906,0.7797,0.719,0.7329,0.818,0.7743,0.7869
english,run_2,precision,0.7336,0.5499,0.4596,0.7936,0.7407,0.7641,0.8166,0.7803,0.7766
english,run_3,precision,0.7275,0.5219,0.4999,0.7788,0.7199,0.7355,0.8216,0.784,0.7937
german,run_1,precision,0.5989,0.2405,,0.8059,0.7812,,0.7769,0.7322,
german,run_2,precision,0.642,0.2747,,0.8306,0.8188,,0.7889,0.7426,
german,run_3,precision,0.5714,0.3516,,0.8188,0.8176,,0.7591,0.7492,
hindi,run_1,precision,0.8014,0.5639,0.5624,0.8027,0.6737,0.7124,0.8035,0.67,0.7453
hindi,run_2,precision,0.8,0.5514,0.5724,0.8012,0.6677,0.758,0.8023,0.6633,0.7443
hindi,run_3,precision,0.8142,0.5855,0.5915,0.8134,0.6995,0.7724,0.8137,0.6762,0.7508


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,macro avg,macro avg,macro avg,micro avg,micro avg,micro avg,weighted avg,weighted avg,weighted avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,1,2,3,1,2,3,1,2,3
english,run_1,recall,0.7756,0.5679,0.5268,0.7797,0.719,0.7329,0.7797,0.719,0.7329
english,run_2,recall,0.7709,0.5712,0.5102,0.7936,0.7407,0.7641,0.7936,0.7407,0.7641
english,run_3,recall,0.7808,0.5844,0.5432,0.7788,0.7199,0.7355,0.7788,0.7199,0.7355
german,run_1,recall,0.569,0.2586,,0.8059,0.7812,,0.8059,0.7812,
german,run_2,recall,0.5629,0.2814,,0.8306,0.8188,,0.8306,0.8188,
german,run_3,recall,0.5261,0.2722,,0.8188,0.8176,,0.8188,0.8176,
hindi,run_1,recall,0.8026,0.5806,0.5912,0.8027,0.6737,0.7124,0.8027,0.6737,0.7124
hindi,run_2,recall,0.8015,0.5733,0.5492,0.8012,0.6677,0.758,0.8012,0.6677,0.758
hindi,run_3,recall,0.8093,0.5874,0.5498,0.8134,0.6995,0.7724,0.8134,0.6995,0.7724


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,macro avg,macro avg,macro avg,micro avg,micro avg,micro avg,weighted avg,weighted avg,weighted avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,1,2,3,1,2,3,1,2,3
english,run_1,f1-score,0.7386,0.5175,0.5002,0.7797,0.719,0.7329,0.7905,0.737,0.753
english,run_2,f1-score,0.7465,0.5446,0.4765,0.7936,0.7407,0.7641,0.8012,0.7536,0.7639
english,run_3,f1-score,0.7399,0.537,0.5111,0.7788,0.7199,0.7355,0.7903,0.7417,0.7563
german,run_1,f1-score,0.5774,0.2491,,0.8059,0.7812,,0.7887,0.7559,
german,run_2,f1-score,0.5734,0.2758,,0.8306,0.8188,,0.7986,0.7779,
german,run_3,f1-score,0.5216,0.2736,,0.8188,0.8176,,0.778,0.7729,
hindi,run_1,f1-score,0.8018,0.5527,0.565,0.8027,0.6737,0.7124,0.8029,0.66,0.7265
hindi,run_2,f1-score,0.8004,0.5534,0.5492,0.8012,0.6677,0.758,0.8015,0.6598,0.7484
hindi,run_3,f1-score,0.8108,0.5812,0.5503,0.8134,0.6995,0.7724,0.8126,0.6844,0.7583
