In [1]:
import pandas as pd
import seaborn as sns
import json
import re

def _load_stats_frame(path, preprocessed):
    """Read one JSON stats file into a DataFrame and tag it with a 'preprocessed' flag."""
    with open(path, 'r') as f:
        frame = pd.DataFrame(json.load(f))
    frame['preprocessed'] = preprocessed
    return frame


def generate_dataset():
    """Build the combined results table from the three JSON stats files.

    Reads 'stats_without_preprocessing.json' (tagged preprocessed=False),
    'stats_with_preprocessing.json' (tagged preprocessed=True) and
    'stats.json' (tagged preprocessed=False; presumably the BERT runs, as
    the original code named this frame ``bert_df`` -- TODO confirm) from the
    current working directory.

    The loaded data must provide 'pearson', 'MSE' and 'timestamp' columns.

    Returns:
        pandas.DataFrame: one row per distinct configuration (all columns
        other than 'MSE', 'pearson' and 'timestamp'), keeping the row with
        the latest timestamp, with 'pearson' and 'MSE' rounded to three
        decimals, sorted by 'pearson' in descending order.
    """
    frames = [
        _load_stats_frame('stats_without_preprocessing.json', False),
        _load_stats_frame('stats_with_preprocessing.json', True),
        _load_stats_frame('stats.json', False),
    ]
    # DataFrame.append was removed in pandas 2.0; concat with ignore_index
    # also yields the fresh 0..n-1 index the old reset_index/drop produced.
    df = pd.concat(frames, ignore_index=True)

    df['pearson'] = df['pearson'].apply(lambda x: round(float(x), 3))
    df['MSE'] = df['MSE'].apply(lambda x: round(float(x), 3))

    # Every column except the metrics and the timestamp identifies a run
    # configuration. A list is used because drop_duplicates documents a
    # sequence of labels, not a set.
    relevant_cols = [col for col in df.columns
                     if col not in ('MSE', 'pearson', 'timestamp')]
    # Sort by time first so keep='last' retains the most recent rerun.
    df = df.sort_values(by='timestamp')
    df = df.drop_duplicates(subset=relevant_cols, keep='last')
    df = df.sort_values(by='pearson', ascending=False)
    return df

def get_language_model(x):
    """Return the first known language-model tag contained in ``x``.

    The tags are checked in the order 'ELMo_wiki', 'ELMo_default',
    'ELMo_brwac', 'BERT'; None is returned when none of them occurs.
    """
    known_models = ('ELMo_wiki', 'ELMo_default', 'ELMo_brwac', 'BERT')
    return next((model for model in known_models if model in x), None)

def get_unk(x):
    """Return True when 'unk' occurs in ``x``, otherwise False."""
    return 'unk' in x

def get_word_embedding(x):
    """Extract an 'embedding : file' label from a test identifier.

    Returns None when ``x`` does not contain 'NILC'. Otherwise a trailing
    '.model' suffix is removed and the last two '/'-separated components
    are joined with ' : '.
    """
    if 'NILC' not in x:
        return None
    # str.rstrip('.model') strips any trailing run of the characters
    # '.', 'm', 'o', 'd', 'e', 'l' (so 'demo.model' would vanish entirely);
    # remove the exact suffix instead.
    suffix = '.model'
    path = x[:-len(suffix)] if x.endswith(suffix) else x
    return ' : '.join(path.split('/')[-2:])

def get_word_embedding_architecture(x):
    """Map an embedding label to its architecture name.

    Returns 'skip-gram' when the label contains 'skip', 'CBOW' when it
    contains 'cbow' (checked in that order), and None for an empty/None
    label or when neither marker occurs.
    """
    if x:
        for marker, architecture in (('skip', 'skip-gram'), ('cbow', 'CBOW')):
            if marker in x:
                return architecture
    return None

def get_word_embedding_size(x):
    """Return the digits following the first 's<digits>' in ``x`` as a string.

    Returns None for an empty/None label, or when the label contains no
    's' followed by at least one digit.
    """
    if not x:
        return None
    # The previous pattern r's\d+.' swallowed one extra character after the
    # digits (e.g. 's300_' -> '300_') and indexing [0] raised IndexError when
    # nothing matched; capture only the digits and tolerate a missing size.
    match = re.search(r's(\d+)', x)
    return match.group(1) if match else None

def isolate_word_embedding_name(x):
    """Return the part of ``x`` before the first ':', without surrounding spaces.

    Returns None for an empty/None label.
    """
    if not x:
        return None
    # Labels are joined with ' : ', so a bare split leaves a trailing space
    # in the name (e.g. 'word2vec '); strip it so exported values are clean.
    return x.split(':')[0].strip()

def retrieve_results(chosen_language='ptbr'):
    """Return the results table for one dataset language.

    Derives the language model, unk flag, embedding name, architecture and
    size from the 'test' column of ``generate_dataset()``, renames 'lang'
    to 'dataset_language', keeps only rows whose language equals
    ``chosen_language`` and returns the selected columns sorted by
    'pearson' in descending order (on a freshly reset index).
    """
    results = generate_dataset()
    test_names = results['test']
    # Full 'name : file' label; architecture and size are parsed from it
    # before the column is reduced to the bare embedding name.
    embedding_labels = test_names.apply(get_word_embedding)

    results = results.assign(
        language_model=test_names.apply(get_language_model),
        unk=test_names.apply(get_unk),
        word_embedding_architecture=embedding_labels.apply(get_word_embedding_architecture),
        word_embedding_size=embedding_labels.apply(get_word_embedding_size),
        word_embedding=embedding_labels.apply(isolate_word_embedding_name),
    ).rename(columns={'lang': 'dataset_language'})

    output_columns = [
        'dataset_language',
        'word_embedding',
        'language_model',
        'word_embedding_architecture',
        'word_embedding_size',
        'unk',
        'preprocessed',
        'pearson',
        'MSE',
    ]
    language_mask = results['dataset_language'] == chosen_language
    selected = results.loc[language_mask, output_columns]
    return selected.reset_index(drop=True).sort_values(by='pearson', ascending=False)

# Preview 10 randomly sampled 'ptbr' rows; no random_state is passed, so the
# rows shown differ from run to run.
retrieve_results(chosen_language='ptbr').sample(10)

Unnamed: 0,dataset_language,word_embedding,language_model,word_embedding_architecture,word_embedding_size,unk,preprocessed,pearson,MSE
137,ptbr,wang2vec,,CBOW,600,True,False,0.44,0.613
116,ptbr,word2vec,,CBOW,100,False,False,0.458,0.6
127,ptbr,wang2vec,,CBOW,1000,True,False,0.446,0.609
165,ptbr,fasttext,,CBOW,600,False,True,0.367,0.658
47,ptbr,word2vec,,CBOW,300,True,True,0.547,0.533
106,ptbr,glove,ELMo_wiki,,50,False,True,0.473,0.591
91,ptbr,word2vec,,skip-gram,300,True,False,0.484,0.582
34,ptbr,word2vec,,CBOW,600,False,True,0.569,0.514
158,ptbr,fasttext,,CBOW,300,True,True,0.372,0.656
70,ptbr,glove,ELMo_default,,600,False,True,0.497,0.575


In [2]:
# Write the full results table of each dataset language to its own CSV file.
for language, csv_path in (
    ('ptbr', 'propor2020_test_results_ptbr.csv'),
    ('pteu', 'propor2020_test_results_pteu.csv'),
):
    retrieve_results(chosen_language=language).to_csv(csv_path)

In [3]:
def get_preprocessed_diff(df):
    """Pair preprocessed and non-preprocessed runs of the same configuration.

    Only rows whose 'language_model' is null are considered. Preprocessed
    rows (suffix ``_x``) are inner-joined with non-preprocessed rows
    (suffix ``_y``) on embedding name, language model, architecture, size
    and unk flag.

    Returns:
        pandas.DataFrame: the joined rows plus 'pearson_diff'
        (``pearson_x - pearson_y`` rounded to three decimals), sorted by
        'pearson_diff' in descending order.
    """
    join_keys = ['word_embedding', 'language_model',
                 'word_embedding_architecture', 'word_embedding_size', 'unk']
    without_lm = df[df['language_model'].isnull()]
    # Explicit comparisons keep the original semantics for non-bool columns.
    pp_df = without_lm[without_lm['preprocessed'] == True]
    non_pp_df = without_lm[without_lm['preprocessed'] == False]

    dataplot = pp_df.merge(non_pp_df, how='inner', on=join_keys)
    # Round like diff_unk does, so float noise (e.g. 0.04699999...) does not
    # leak into displayed or exported values.
    dataplot['pearson_diff'] = (dataplot['pearson_x'] - dataplot['pearson_y']).round(3)
    dataplot = dataplot.sort_values(by='pearson_diff', ascending=False)
    return dataplot
# Preview 10 randomly sampled rows of the 'ptbr' preprocessing comparison; no
# random_state is passed, so the rows shown differ from run to run.
get_preprocessed_diff(retrieve_results(chosen_language='ptbr')).sample(10)

Unnamed: 0,dataset_language_x,word_embedding,language_model,word_embedding_architecture,word_embedding_size,unk,preprocessed_x,pearson_x,MSE_x,dataset_language_y,preprocessed_y,pearson_y,MSE_y,pearson_diff
43,ptbr,wang2vec,,skip-gram,50,True,True,0.483,0.583,ptbr,False,0.436,0.616,0.047
51,ptbr,wang2vec,,CBOW,1000,True,True,0.45,0.607,ptbr,False,0.446,0.609,0.004
21,ptbr,wang2vec,,CBOW,100,True,True,0.543,0.537,ptbr,False,0.49,0.578,0.053
37,ptbr,glove,,,1000,False,True,0.495,0.575,ptbr,False,0.396,0.642,0.099
62,ptbr,fasttext,,CBOW,600,True,True,0.368,0.658,ptbr,False,0.36,0.662,0.008
29,ptbr,wang2vec,,skip-gram,100,True,True,0.52,0.556,ptbr,False,0.464,0.596,0.056
48,ptbr,word2vec,,CBOW,50,True,True,0.458,0.601,ptbr,False,0.429,0.62,0.029
60,ptbr,fasttext,,CBOW,300,True,True,0.372,0.656,ptbr,False,0.347,0.668,0.025
39,ptbr,glove,,,600,True,True,0.492,0.578,ptbr,False,0.343,0.671,0.149
55,ptbr,wang2vec,,CBOW,600,True,True,0.443,0.611,ptbr,False,0.44,0.613,0.003


In [4]:
# Write the preprocessing comparison of each dataset language to its own CSV file.
for language, csv_path in (
    ('ptbr', 'diff_preprocessing_ptbr.csv'),
    ('pteu', 'diff_preprocessing_pteu.csv'),
):
    get_preprocessed_diff(retrieve_results(chosen_language=language)).to_csv(csv_path)

In [5]:
def diff_unk(df, status=True):
    """Compare Pearson scores between runs that differ only outside the join keys.

    Keeps rows whose 'preprocessed' equals ``status`` and whose
    'language_model' is null, then inner-joins that subset with itself on
    embedding name, language model, architecture, size and preprocessed
    flag. Pairs with identical Pearson scores (which includes every row
    paired with itself) are discarded; because of the self-join each
    remaining pair appears in both orders.

    Returns:
        pandas.DataFrame: the remaining pairs plus 'pearson_diff'
        (``pearson_x - pearson_y`` rounded to three decimals), ordered by
        the unrounded difference, largest first.
    """
    join_keys = ['word_embedding', 'language_model',
                 'word_embedding_architecture', 'word_embedding_size', 'preprocessed']
    subset = df[df['preprocessed'] == status]
    subset = subset[subset['language_model'].isnull()]

    pairs = subset.merge(subset, how='inner', on=join_keys)
    # Drop pairs whose two scores are equal.
    scores_differ = pairs['pearson_x'] != pairs['pearson_y']
    pairs = pairs[scores_differ]

    pairs['pearson_diff'] = pairs['pearson_x'] - pairs['pearson_y']
    # Order by the raw difference first, then round the stored values.
    pairs = pairs.sort_values(by='pearson_diff', ascending=False)
    pairs['pearson_diff'] = pairs['pearson_diff'].map(lambda delta: round(delta, 3))
    return pairs

# Preview 10 randomly sampled rows of the 'pteu' comparison for preprocessed
# runs; no random_state is passed, so the rows shown differ from run to run.
diff_unk(retrieve_results(chosen_language='pteu'), status=True).sample(10)

Unnamed: 0,dataset_language_x,word_embedding,language_model,word_embedding_architecture,word_embedding_size,unk_x,preprocessed,pearson_x,MSE_x,dataset_language_y,unk_y,pearson_y,MSE_y,pearson_diff
117,pteu,glove,,,50,False,True,0.405,0.98,pteu,True,0.404,0.98,0.001
134,pteu,fasttext,,CBOW,100,False,True,0.353,1.033,pteu,True,0.354,1.032,-0.001
62,pteu,wang2vec,,skip-gram,300,False,True,0.518,0.858,pteu,True,0.519,0.858,-0.001
133,pteu,fasttext,,CBOW,100,True,True,0.354,1.032,pteu,False,0.353,1.033,0.001
90,pteu,fasttext,,skip-gram,50,True,True,0.469,0.924,pteu,False,0.477,0.915,-0.008
137,pteu,fasttext,,CBOW,50,True,True,0.342,1.042,pteu,False,0.341,1.043,0.001
89,pteu,fasttext,,skip-gram,50,False,True,0.477,0.915,pteu,True,0.469,0.924,0.008
53,pteu,wang2vec,,skip-gram,600,True,True,0.523,0.853,pteu,False,0.522,0.853,0.001
129,pteu,fasttext,,CBOW,300,True,True,0.367,1.021,pteu,False,0.366,1.022,0.001
118,pteu,glove,,,50,True,True,0.404,0.98,pteu,False,0.405,0.98,-0.001


In [6]:
# Write the comparison for every (language, preprocessing status) combination
# to its own CSV file.
for language, status, csv_path in (
    ('pteu', True, 'unk_diff_preprocessed_pteu.csv'),
    ('ptbr', True, 'unk_diff_preprocessed_ptbr.csv'),
    ('pteu', False, 'unk_diff_not_preprocessed_pteu.csv'),
    ('ptbr', False, 'unk_diff_not_preprocessed_ptbr.csv'),
):
    diff_unk(retrieve_results(chosen_language=language), status=status).to_csv(csv_path)