In [None]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sentify import WANDB_EXPORTS_DIR

In [None]:
# Weights & Biases CSV export analysed by this notebook.
filename = 'wandb_export_2022-05-02T14_04_16.453+02_00.csv'
filepath = WANDB_EXPORTS_DIR.joinpath(filename)

# Keep only the runs whose State is "finished".
df_all_runs = pd.read_csv(filepath)
is_finished = df_all_runs['State'] == 'finished'
df_results = df_all_runs.loc[is_finished]

df_results

In [None]:
# Column dtypes and non-null counts of the finished runs.
df_results.info()

In [None]:
import numpy as np

# Dataset names looked for inside W&B run names by `_process_name`.
# They are tried in list order and the first one found in the name wins.
# NOTE(review): 'wiki+toxicity' uses '+' where its siblings use '_' — confirm
# against the actual run names before changing it.
DATASETS = [
    'measuring_hate_speech',
    'yelp',
    'imdb',
    'sentiment140',
    'MHS_sentiment',
    'MHS_hatespeech',
    'wiki_attack',
    'wiki_aggression',
    'wiki+toxicity',
]


def _process_name(name):
    dataset = ''
    for dataset_name in DATASETS:
        if dataset_name in name:
            name = name[len(dataset_name):]
            dataset = dataset_name
            break

    method = re.search(r'_(.*)_2022', name).group(1)
    return {'dataset': dataset, 'method': method}


# Parse the creation timestamp and turn the literal 'None' strings of the
# export into real missing values. `assign` builds a new frame, so the
# filtered frame from the loading cell is never written to in place.
df_results = (
    df_results
    .assign(time=pd.to_datetime(df_results['Created']))
    .replace('None', np.nan)
)

# Dataset and method parsed from each run name, aligned on the run index.
dataset_method = pd.DataFrame(
    df_results['Name'].map(_process_name).tolist(),
    index=df_results.index,
    columns=['dataset', 'method'],
)

# The parsed values replace any `dataset` / `method` columns already in the
# frame. Dropping them first with errors='ignore' also works when the export
# has no such columns, and keeps this cell safe to re-run.
df_results = (
    df_results
    .drop(columns=['dataset', 'method'], errors='ignore')
    .join(dataset_method, how='left')
)

df_results.head()

In [None]:
# Distinct dataset names parsed from the run names.
df_results['dataset'].unique()

In [None]:
# Distinct method labels currently present.
df_results['method'].unique()

In [None]:
# Refine the coarse `method` parsed from the run name into one label per
# retriever configuration, based on the run's config columns.
#
# Rules are applied top to bottom. A rule only touches rows whose method is
# still the plain base name, so the first matching rule wins. Rows that match
# no rule keep their base label ('retriever' / 'retriever_sentiLARE').

# Config masks. None of them depend on `method`, so they are computed once.
# `.eq(True)` treats missing values as "not set", mirroring the `!= True`
# test used for the non-centered variants.
is_centered = df_results['mean_center_embeddings'].eq(True)
not_centered = ~is_centered
norm_enabled = df_results['normalize_features'].eq(True)
norm_none = df_results['feature_normalization'] == "none"

# NOTE(review): top_k is compared as a string; presumably the column is read
# as text because the export mixes numbers with 'None' — confirm.
no_knn = df_results['top_k'].isna()
knn_3 = df_results['top_k'] == "3"
knn_5 = df_results['top_k'] == "5"

no_encoder = df_results['encoder_name'].isna()
cross_roberta = df_results['encoder_name'] == "cross-encoder/stsb-roberta-base"
cross_distilroberta = df_results['encoder_name'] == "cross-encoder/stsb-distilroberta-base"

# (base method, config mask, refined label)
METHOD_RULES = [
    # Retriever, no KNN
    ('retriever', no_knn & cross_roberta & norm_enabled, 'retriever cross roberta norm=True'),
    ('retriever', no_knn & cross_roberta & norm_none, 'retriever cross roberta norm=none'),
    ('retriever', no_knn & cross_distilroberta & norm_none, 'retriever cross distilroberta norm=none'),
    ('retriever', is_centered & no_knn, 'center retriever'),
    # Retriever, KNN = 3
    ('retriever', not_centered & knn_3 & no_encoder, 'retriever knn=3'),
    ('retriever', knn_3 & cross_roberta & norm_none, 'retriever cross roberta norm=none knn=3'),
    ('retriever', is_centered & knn_3 & no_encoder, 'center retriever knn=3'),
    # Retriever, KNN = 5
    ('retriever', not_centered & knn_5 & no_encoder, 'retriever knn=5'),
    ('retriever', is_centered & knn_5 & no_encoder, 'center retriever knn=5'),
    # SentiLARE
    ('retriever_sentiLARE', not_centered & knn_3, 'retriever_sentiLARE knn=3'),
    ('retriever_sentiLARE', not_centered & knn_5, 'retriever_sentiLARE knn=5'),
    ('retriever_sentiLARE', is_centered & no_knn, 'center retriever_sentiLARE'),
    ('retriever_sentiLARE', is_centered & knn_3, 'center retriever_sentiLARE knn=3'),
    ('retriever_sentiLARE', is_centered & knn_5, 'center retriever_sentiLARE knn=5'),
]

for base_method, config_mask, label in METHOD_RULES:
    # Re-read `method` on every pass: rows relabelled by an earlier rule no
    # longer equal the base name and are therefore skipped.
    still_base = df_results['method'] == base_method
    df_results.loc[still_base & config_mask, 'method'] = label

df_results

In [None]:
# Test F1 per dataset, one bar per method label.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_ylim(0.4, 1.0)
sns.barplot(
    data=df_results,
    x='dataset',
    y='test/f1_score',
    hue='method',
    ax=ax,
)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [None]:
# Mean and standard deviation of test F1, plus the number of runs, for each
# (dataset, method) pair.
summary_spec = {
    'test/f1_score': ['mean', 'std'],
    'method': ['count'],
}
runs_by_dataset_method = df_results.groupby(by=['dataset', 'method'])
runs_by_dataset_method.agg(summary_spec)