In [None]:
import string
import re
import collections
import csv
import pandas as pd
import numpy as np
import requests
import os
import json
from hyphen import Hyphenator

import dask.dataframe as dd
import multiprocessing

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

import seaborn as sns

%matplotlib inline

import stanfordnlp
from stanfordnlp.server import CoreNLPClient

# Workaround for this OpenMP error:
#   OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
# The workaround is active; comment the line out if the error does not occur for you.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [None]:
# Relative locations used by the notebook. PREDICTION_PATH is where
# all_predictions.csv is read from; TEST_SETS_PATH is not referenced in the
# cells visible here.
PREDICTION_PATH = './predictions'
TEST_SETS_PATH = './test_sets'

# Test sets iterated over by the per-set plots.
# Alternative that also includes the 'dev-v1.1' set:
# SET_NAMES = ['Amazon', 'Reddit', 'New-Wiki', 'NYT', 'dev-v1.1']
SET_NAMES = [
    'Amazon',
    'Reddit',
    'New-Wiki',
    'NYT',
]

In [None]:
def load_data(input_file_path):
    """Read a UTF-8 CSV file and return its rows as a list of dicts.

    Args:
        input_file_path: Path of the CSV file; its first row is used as the
            header that supplies the dict keys.

    Returns:
        A list with one mapping per data row. Every value is a string, because
        ``csv.DictReader`` performs no type conversion.
    """
    # newline='' is what the csv module asks for: without it, universal-newline
    # translation rewrites line breaks embedded inside quoted fields.
    with open(input_file_path, 'r', encoding='utf-8', newline='') as csv_file:
        return list(csv.DictReader(csv_file))

In [None]:
# Load the pre-computed analysis tables from CSV files in the working directory.
df_answers = pd.read_csv('answers.csv')
df_distinct_answers = pd.read_csv('distinct_answers.csv')
df_distinct_context = pd.read_csv('distinct_context.csv')
# Fixed: the extension was '.com', unlike every sibling file in this cell.
# NOTE(review): confirm the file on disk is really named merged_answers.csv.
df_merged_answers = pd.read_csv('merged_answers.csv')
df_merged_answers_and_context = pd.read_csv('merged_answers_and_context.csv')
df_pred_answers_context = pd.read_csv('pred_answers_context.csv')

# Fixed: `pd.DataFrameload_data(...)` is not a pandas attribute and raised
# AttributeError; the intent is to wrap the rows returned by load_data().
# load_data() goes through csv.DictReader, so every column arrives as strings.
df_pred = pd.DataFrame(load_data(PREDICTION_PATH + '/all_predictions.csv'))

In [None]:
# Make sure the f1 column is floating point before it is plotted.
df_pred = df_pred.astype(dtype={'f1': float})


In [None]:
# Density-normalised F1 histograms, one panel per test set.
df_pred['f1'].hist(
    by=df_pred['test_set'],
    density=True,
)

In [None]:
# Per-test-set F1 histograms where bar heights are the share of that set's
# predictions falling in each bin.
test_sets = df_pred['test_set'].unique()

# Size the grid from the data: the loop runs over every distinct test set, so a
# hard-coded 1x4 grid overruns as soon as df_pred holds a fifth set.
n_panels = max(len(test_sets), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
fig.suptitle('Test Set F1 Histogram by %')

for ax, test_set in zip(axes.flat, test_sets):
    data = df_pred.loc[df_pred['test_set'] == test_set, 'f1']
    # Equal weights of 1/len(data) make the bars sum to 1 within each panel.
    data.hist(ax=ax, bins=20, weights=np.ones(len(data)) / len(data))
    ax.set_title(test_set)
    # Render the 0-1 fractions on the y axis as percentages.
    ax.yaxis.set_major_formatter(PercentFormatter(1))

In [None]:
# Per-test-set F1 histograms with raw counts on the y axis.
test_sets = df_pred['test_set'].unique()

# Size the grid from the data: the loop runs over every distinct test set, so a
# hard-coded 1x4 grid overruns as soon as df_pred holds a fifth set.
n_panels = max(len(test_sets), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
fig.suptitle('Test Set F1 Histogram by Count')

for ax, test_set in zip(axes.flat, test_sets):
    data = df_pred.loc[df_pred['test_set'] == test_set, 'f1']
    data.hist(ax=ax, bins=20)
    ax.set_title(test_set)

In [None]:
# Single-bin F1 histogram for the rows whose test_set is 'dev-v1.1'.
is_dev_set = df_pred['test_set'] == 'dev-v1.1'
df_pred.loc[is_dev_set, 'f1'].hist(bins=1)

In [None]:
# F1 histogram for the Amazon rows, weighted so the bar heights sum to 1.
amazon_f1 = df_pred.loc[df_pred['test_set'] == 'Amazon', 'f1']
share_per_row = np.ones(len(amazon_f1)) / len(amazon_f1)

fig, axes = plt.subplots(1, 1)
axes.hist(amazon_f1, weights=share_per_row)
plt.show()

In [None]:
# Text of the context ranked third when sorting by Flesch-Kincaid grade level,
# highest first.
contexts_by_fk_grade = df_distinct_context.sort_values(
    'flesch-kincaid_grade_level',
    ascending=False,
)
contexts_by_fk_grade.iloc[2]['context']

In [None]:
# Show the row at integer position 1519 of the distinct-context table.
row_position = 1519
df_distinct_context.iloc[row_position]

In [None]:
# Size and readability columns for the context ranked third by Flesch-Kincaid
# grade level (highest first); the 2:3 slice keeps the result a one-row frame.
readability_columns = [
    'sentence_count',
    'word_count',
    'flesch-kincaid_grade_level',
    'coleman-liau',
    'gunning-fog',
    'automated-readability',
]
contexts_by_fk_grade = df_distinct_context.sort_values(
    'flesch-kincaid_grade_level', ascending=False
)
contexts_by_fk_grade.iloc[2:3][readability_columns]

In [None]:
# Number of sentences `nlp` reports for the context ranked third by
# Flesch-Kincaid grade level (highest first).
# NOTE(review): `nlp` is not defined in any cell visible above. Presumably it is
# a stanfordnlp pipeline created elsewhere; confirm, because otherwise this
# cell raises NameError on a fresh kernel.
len(nlp(df_distinct_context.sort_values('flesch-kincaid_grade_level', ascending=False).iloc[2]['context']).sentences)

In [None]:
# All distinct contexts, ordered by Flesch-Kincaid grade level, highest first.
df_distinct_context.sort_values(
    by='flesch-kincaid_grade_level',
    ascending=False,
)

In [None]:
# Full row of the context ranked second by Flesch-Kincaid grade level,
# highest first.
contexts_by_fk_grade = df_distinct_context.sort_values(
    'flesch-kincaid_grade_level', ascending=False
)
contexts_by_fk_grade.iloc[1]

In [None]:
# Print each word of a sample phrase next to the syllables the en_US
# hyphenator returns for it.
sample_words = 'University of the Philippines College of Medicine'.split()

# Build the hyphenator once; it was previously re-created for every word.
hyphenator = Hyphenator('en_US')
for w in sample_words:
    print(w, hyphenator.syllables(w))

In [None]:
# Every merged-answer row belonging to one specific question id.
question_mask = df_merged_answers['question_id'] == '5dd469facc027a086d65bf1a'
df_merged_answers.loc[question_mask]

In [None]:
# Non-null counts per column for the rows flagged is_numeric, grouped by
# first_parse.
numeric_answers = df_merged_answers[df_merged_answers['is_numeric']]
numeric_answers.groupby('first_parse').count()

In [None]:
# Correlation between F1 and syllables per word, restricted to the Amazon test
# set and the 'xlnet-123(singlemodel)' model.
is_amazon = df_pred_answers_context['test_set_x'] == 'Amazon'
is_xlnet = df_pred_answers_context['model_name'] == 'xlnet-123(singlemodel)'
df_pred_answers_context[is_amazon & is_xlnet][['f1', 'syllables_per_word']].corr()

In [None]:
# Correlation heatmaps between F1 and the readability indices, one panel per
# test set in SET_NAMES.
#
# The original cell first assigned a longer list and immediately overwrote it;
# that dead assignment is removed. The extra columns it listed, which can be
# appended to `metric_list` for a wider matrix, were:
#   'syllables_per_word', 'polysyllable_count', 'avg_word_length',
#   'avg_sentence_length_in_words', 'context_character_count',
#   'avg_sentence_length_in_characters'
metric_list = ['f1', 'flesch-kincaid_grade_level', 'coleman-liau', 'gunning-fog', 'automated-readability']

# Size the grid from SET_NAMES so editing that list cannot overrun a
# hard-coded 1x4 grid; squeeze=False keeps `axes` 2-D even for one panel.
n_panels = max(len(SET_NAMES), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(5.5 * n_panels, 5), squeeze=False)

for ax, test_set in zip(axes.flat, SET_NAMES):
    in_test_set = df_pred_answers_context['test_set_x'] == test_set
    corr_matrix = df_pred_answers_context.loc[in_test_set, metric_list].corr()
    sns.heatmap(corr_matrix, annot=True, ax=ax, fmt=".2f")
    ax.set_title(test_set)

fig.tight_layout()

In [None]:
# Correlation heatmaps between exact_match and the readability indices, one
# panel per test set in SET_NAMES.
#
# The original cell first assigned a longer list and immediately overwrote it;
# that dead assignment is removed. The extra columns it listed, which can be
# appended to `metric_list` for a wider matrix, were:
#   'syllables_per_word', 'polysyllable_count', 'avg_word_length',
#   'avg_sentence_length_in_words', 'context_character_count',
#   'avg_sentence_length_in_characters'
metric_list = ['exact_match', 'flesch-kincaid_grade_level', 'coleman-liau', 'gunning-fog', 'automated-readability']

# Size the grid from SET_NAMES so editing that list cannot overrun a
# hard-coded 1x4 grid; squeeze=False keeps `axes` 2-D even for one panel.
n_panels = max(len(SET_NAMES), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(5.5 * n_panels, 5), squeeze=False)

for ax, test_set in zip(axes.flat, SET_NAMES):
    in_test_set = df_pred_answers_context['test_set_x'] == test_set
    corr_matrix = df_pred_answers_context.loc[in_test_set, metric_list].corr()
    sns.heatmap(corr_matrix, annot=True, ax=ax, fmt=".2f")
    ax.set_title(test_set)

fig.tight_layout()

In [None]:
# Cap how many rows pandas renders for displayed frames.
pd.set_option('display.max_rows', 20)

# Distinct question / answer / prediction rows for the context ranked third by
# Flesch-Kincaid grade level (highest first).
contexts_by_fk_grade = df_distinct_context.sort_values(
    'flesch-kincaid_grade_level', ascending=False
)
target_context = contexts_by_fk_grade.iloc[2]['context']
shown_columns = ['question_text', 'answer_text', 'predicted_answer', 'f1', 'exact_match']

rows_for_context = df_pred_answers_context[df_pred_answers_context['context'] == target_context]
rows_for_context[shown_columns].drop_duplicates()