In [30]:
import json
import pandas as pd
from pathlib import Path
from pprint import pprint
import datetime
import time
from scipy import stats
import numpy as np
import matplotlib as plt

import pandas
import researchpy as rp
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Raise the row-display limit so tables of up to 10000 rows are shown without truncation.
pd.set_option('display.max_rows', 10000)

In [2]:
# Column names of the results table, kept as constants so later cells
# do not repeat string literals.

# Run identification
TRAINING_ROUND = 'Training_Round'
EPOCHS = 'Epochs'
SCENARIO = 'Scenario'
# Experimental factors (model / preprocessing configuration)
EMBEDDING = 'Embedding'
TAGGING_SCHEME = 'Tagging_Scheme'
CAP_DIM = 'Capitalisation_Dim'
LOWER = 'Lower'
CHAR_DIM = 'Char_LSTM_Dim'
WORD_DIM = 'Word_LSTM_Dim'
# Evaluation metrics
PRECISION = 'Precision'
RECALL = 'Recall'
F1_SCORE = 'F1_Score'

In [3]:
# All column-name constants gathered into a single ordered list.
columns = [
    TRAINING_ROUND,
    EPOCHS,
    SCENARIO,
    EMBEDDING,
    TAGGING_SCHEME,
    CAP_DIM,
    LOWER,
    CHAR_DIM,
    WORD_DIM,
    PRECISION,
    RECALL,
    F1_SCORE,
]

In [4]:
# Display the list to check the constants resolve to the expected column names.
columns

['Training_Round',
 'Epochs',
 'Scenario',
 'Embedding',
 'Tagging_Scheme',
 'Capitalisation_Dim',
 'Lower',
 'Char_LSTM_Dim',
 'Word_LSTM_Dim',
 'Precision',
 'Recall',
 'F1_Score']

In [5]:
# Allow up to 1000 columns to be displayed so wide frames are not elided.
pd.set_option('display.max_columns', 1000)
# Load the experiment results from CSV (path is relative to the notebook's working directory).
# Judging by the file name, outliers were already removed upstream — TODO confirm.
training_data_df = pd.read_csv('resultados_sem_outliers.csv')
# NOTE(review): the displayed frame carries 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# presumably index columns written by earlier to_csv calls — confirm they are not needed.
training_data_df

Unnamed: 0.1,Unnamed: 0,Training_Round,Epochs,Scenario,Embedding,Tagging_Scheme,Capitalisation_Dim,Lower,Char_LSTM_Dim,Word_LSTM_Dim,Precision,Recall,F1_Score
0,320,0,5,selective,FastText,IOB2,False,False,25,100,58.83,56.51,57.64
1,352,1,5,selective,FastText,IOB2,False,False,25,100,64.86,50.77,56.96
2,384,2,5,selective,FastText,IOB2,False,False,25,100,63.22,51.77,56.93
3,416,3,5,selective,FastText,IOB2,False,False,25,100,59.05,56.76,57.88
4,448,4,5,selective,FastText,IOB2,False,False,25,100,58.87,55.11,56.93
5,480,5,5,selective,FastText,IOB2,False,False,25,100,59.32,48.73,53.50
6,512,6,5,selective,FastText,IOB2,False,False,25,100,59.65,50.99,54.98
7,544,7,5,selective,FastText,IOB2,False,False,25,100,63.90,50.59,56.47
8,576,8,5,selective,FastText,IOB2,False,False,25,100,59.73,56.47,58.05
9,608,9,5,selective,FastText,IOB2,False,False,25,100,62.26,55.36,58.61


In [6]:
# Keep only the rows of the 'selective' scenario, then show how many remain.
training_data_df = training_data_df.loc[training_data_df[SCENARIO] == 'selective']
len(training_data_df)

1157

In [7]:
# Descriptive statistics of F1 per tagging scheme, highest mean first.
(
    training_data_df
    .groupby(TAGGING_SCHEME)[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Tagging_Scheme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IOB2,583.0,57.993756,5.794382,44.34,54.47,56.26,63.765,68.26
IOBES,574.0,57.959739,5.84919,42.97,54.56,56.635,63.5975,67.99


In [8]:
# Descriptive statistics of F1 per embedding, highest mean first.
(
    training_data_df
    .groupby(EMBEDDING)[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Wang2Vec,286.0,61.274895,5.270895,51.42,56.26,58.58,66.4775,68.26
FastText,280.0,60.694143,4.48011,53.42,56.495,58.81,65.175,66.9
Glove,287.0,58.839861,3.973881,50.18,55.135,60.31,62.71,64.89
Word2Vec,304.0,51.556678,3.243757,42.97,48.9725,51.92,54.5,56.92


In [9]:
# Descriptive statistics of F1 per capitalisation-dimension setting, highest mean first.
(
    training_data_df
    .groupby(CAP_DIM)[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Capitalisation_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
True,583.0,58.127118,5.861585,43.81,54.575,56.61,63.84,68.26
False,574.0,57.824286,5.776824,42.97,54.4625,56.275,63.43,68.07


In [10]:
# Descriptive statistics of F1 per character-LSTM dimension, highest mean first.
(
    training_data_df
    .groupby(CHAR_DIM)[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Char_LSTM_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
50,571.0,58.048319,5.750252,45.04,54.65,56.53,63.69,68.07
25,586.0,57.90727,5.889573,42.97,54.34,56.425,63.5925,68.26


In [11]:
# Descriptive statistics of F1 per word-LSTM dimension, highest mean first.
(
    training_data_df
    .groupby(WORD_DIM)[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Word_LSTM_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100,582.0,58.150722,5.768651,42.97,54.7075,56.645,63.86,68.26
200,575.0,57.800922,5.869582,43.81,54.28,56.36,63.565,68.07


In [12]:
# Descriptive statistics of F1 per lower-casing setting, highest mean first.
(
    training_data_df
    .groupby(LOWER)[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Lower,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
True,581.0,60.439897,7.293584,42.97,51.5,63.61,65.86,68.26
False,576.0,55.492483,1.451393,50.18,54.57,55.58,56.535,59.56


In [13]:
# Descriptive statistics of F1 for every combination of the six factors,
# ranked so the best-performing configuration (by mean F1) comes first.
(
    training_data_df
    .groupby([EMBEDDING, TAGGING_SCHEME, CAP_DIM, WORD_DIM, CHAR_DIM, LOWER])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count,mean,std,min,25%,50%,75%,max
Embedding,Tagging_Scheme,Capitalisation_Dim,Word_LSTM_Dim,Char_LSTM_Dim,Lower,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Wang2Vec,IOB2,False,100,50,True,10.0,66.901000,0.721641,65.44,66.4250,67.105,67.4925,67.69
Wang2Vec,IOB2,True,100,25,True,9.0,66.890000,1.152801,64.87,66.2000,66.920,67.8600,68.26
Wang2Vec,IOB2,False,200,50,True,6.0,66.818333,1.050950,65.21,66.2075,67.155,67.3600,68.07
Wang2Vec,IOBES,True,200,25,True,10.0,66.788000,0.771432,65.54,66.2750,66.845,67.2000,67.99
Wang2Vec,IOBES,True,100,50,True,9.0,66.758889,0.517650,66.25,66.4200,66.630,66.7300,67.79
Wang2Vec,IOB2,True,100,50,True,10.0,66.668000,0.790974,65.00,66.3300,66.755,67.2925,67.64
Wang2Vec,IOB2,True,200,50,True,9.0,66.595556,0.441902,65.94,66.2800,66.650,66.8200,67.22
Wang2Vec,IOBES,True,100,25,True,10.0,66.550000,1.105039,63.99,66.2625,66.440,67.2775,67.87
Wang2Vec,IOB2,True,200,25,True,7.0,66.380000,0.727805,65.26,66.0700,66.310,66.6900,67.57
Wang2Vec,IOBES,False,100,50,True,7.0,66.371429,0.869127,64.95,65.8950,66.650,66.8150,67.58


In [14]:
# Descriptive statistics of F1 per training round, highest mean first.
(
    training_data_df
    .groupby(TRAINING_ROUND)[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Training_Round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,121.0,58.226942,5.917851,44.08,54.88,56.53,64.27,68.07
9,115.0,58.137391,5.631239,46.94,54.75,56.53,63.51,67.87
5,111.0,58.067748,5.917702,47.59,54.61,56.44,63.99,67.86
6,119.0,58.048824,5.825342,45.04,54.92,56.71,63.575,67.74
4,114.0,58.021404,5.518434,47.28,54.5975,56.72,63.67,68.26
8,116.0,57.981207,5.754974,46.61,54.2375,56.295,63.7225,67.73
1,118.0,57.963644,5.883625,46.26,54.57,56.04,63.405,67.99
3,113.0,57.950973,6.073921,44.33,54.31,56.29,63.9,67.79
0,121.0,57.923636,5.849924,42.97,54.57,56.59,63.18,67.57
7,109.0,57.407982,5.997139,43.81,53.77,56.15,63.04,67.53


In [15]:
# Overall summary of F1 across all remaining rows (N, mean, SD, SE and 95% confidence interval).
rp.summary_cont(training_data_df[F1_SCORE])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,F1_Score,1157.0,57.97688,5.819143,0.171077,57.641223,58.312537


In [16]:
# Same summary computed per embedding group; only the F1 part of the result is kept.
rp.summary_cont(training_data_df.groupby(EMBEDDING))[F1_SCORE]





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FastText,280,60.694143,4.48011,0.267738,60.169377,61.218909
Glove,287,58.839861,3.973881,0.234571,58.380102,59.299619
Wang2Vec,286,61.274895,5.270895,0.311675,60.664013,61.885777
Word2Vec,304,51.556678,3.243757,0.186042,51.192035,51.92132


In [17]:
# Print the raw column names of the response and the six factors; the OLS formula
# in the model cell spells these names out literally, so they must match.
print(F1_SCORE, EMBEDDING, TAGGING_SCHEME, CAP_DIM, CHAR_DIM, WORD_DIM, LOWER)

F1_Score Embedding Tagging_Scheme Capitalisation_Dim Char_LSTM_Dim Word_LSTM_Dim Lower


In [33]:
# Full factorial OLS model of F1: in a formula, `a*b` expands to the main effects
# plus their interaction, so chaining all six categorical factors with `*`
# includes every main effect and every interaction up to the six-way term.
full_factorial_formula = (
    'F1_Score ~ C(Embedding)*C(Tagging_Scheme)*C(Capitalisation_Dim)'
    '*C(Char_LSTM_Dim)*C(Word_LSTM_Dim)*C(Lower)'
)
model = ols(full_factorial_formula, data=training_data_df).fit()

# Overall F-test: is the model as a whole significant?
print(
    f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = "
    f"{model.fvalue: .3f}, p = {model.f_pvalue: .20f}"
)

Overall model F( 127, 1029) =  202.677, p =  0.00000000000000000000


In [34]:
# Mean squares of the model, of the residuals and of the total, followed by the
# p-value of the overall F-test.
model.mse_model, model.mse_resid, model.mse_total, model.f_pvalue

(296.3797916697224, 1.4623246785675723, 33.86242702102144, 0.0)

In [35]:
# Full regression report: fit statistics, per-term coefficients and residual diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,F1_Score,R-squared:,0.962
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,202.7
Date:,"Wed, 10 Apr 2019",Prob (F-statistic):,0.0
Time:,11:59:55,Log-Likelihood:,-1793.7
No. Observations:,1157,AIC:,3843.0
Df Residuals:,1029,BIC:,4490.0
Df Model:,127,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,56.7950,0.382,148.521,0.000,56.045,57.545
C(Embedding)[T.Glove],-1.1490,0.541,-2.125,0.034,-2.210,-0.088
C(Embedding)[T.Wang2Vec],-1.1730,0.541,-2.169,0.030,-2.234,-0.112
C(Embedding)[T.Word2Vec],-2.2372,0.556,-4.027,0.000,-3.327,-1.147
C(Tagging_Scheme)[T.IOBES],-0.8836,0.596,-1.483,0.138,-2.053,0.286
C(Capitalisation_Dim)[T.True],-0.5390,0.541,-0.997,0.319,-1.600,0.522
C(Char_LSTM_Dim)[T.50],-0.4817,0.556,-0.867,0.386,-1.572,0.609
C(Word_LSTM_Dim)[T.200],-1.3887,0.574,-2.421,0.016,-2.514,-0.263
C(Lower)[T.True],8.4640,0.541,15.651,0.000,7.403,9.525

0,1,2,3
Omnibus:,141.377,Durbin-Watson:,2.248
Prob(Omnibus):,0.0,Jarque-Bera (JB):,374.338
Skew:,-0.654,Prob(JB):,5.17e-82
Kurtosis:,5.461,Cond. No.,594.0


In [36]:
# Type II ANOVA table for the fitted factorial model.
res = sm.stats.anova_lm(model, typ=2)

In [37]:
# Show the ANOVA table: sum of squares, degrees of freedom, F and p-value per term.
res

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Embedding),18097.602151,3.0,4125.30414,0.0
C(Tagging_Scheme),1.857111,1.0,1.269972,0.260035
C(Capitalisation_Dim),7.036934,1.0,4.812156,0.028483
C(Char_LSTM_Dim),9.587457,1.0,6.556312,0.010593
C(Word_LSTM_Dim),35.019032,1.0,23.947508,1e-06
C(Lower),7231.789826,1.0,4945.406401,0.0
C(Embedding):C(Tagging_Scheme),9.46423,3.0,2.157348,0.091458
C(Embedding):C(Capitalisation_Dim),9.726866,3.0,2.217215,0.084545
C(Tagging_Scheme):C(Capitalisation_Dim),0.050563,1.0,0.034577,0.852521
C(Embedding):C(Char_LSTM_Dim),20.452179,3.0,4.662024,0.003046


In [23]:
# Effect sizes (eta squared, omega squared) for an ANOVA table
def anova_table(aov):
    """Return an ANOVA table extended with mean squares and effect sizes.

    Parameters
    ----------
    aov : pandas.DataFrame
        ANOVA table with ``sum_sq``, ``df``, ``F`` and ``PR(>F)`` columns, one
        row per term. The LAST row must be the residual (error) term; its mean
        square is used as the error mean square.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``sum_sq, mean_sq, df, F, PR(>F), eta_sq,
        omega_sq`` (in that order) and the same index as ``aov``:

        * ``mean_sq``  = ``sum_sq / df``
        * ``eta_sq``   = ``sum_sq / SS_total``
        * ``omega_sq`` = ``(sum_sq - df * MS_error) / (SS_total + MS_error)``

        ``eta_sq`` and ``omega_sq`` are NaN on the residual row.

    Notes
    -----
    The input frame is not modified; the computation runs on a copy so that
    re-displaying or re-using the original ANOVA table is unaffected.
    """
    table = aov.copy()
    table['mean_sq'] = table['sum_sq'] / table['df']

    # skipna=False keeps NaN propagation: a missing sum of squares makes the
    # total (and therefore every effect size) undefined rather than silently wrong.
    ss_total = table['sum_sq'].sum(skipna=False)
    # Positional access: the index holds term labels, so `[-1]` would be a label
    # lookup (deprecated positional fallback in recent pandas).
    ms_error = table['mean_sq'].iloc[-1]

    # Every row except the trailing residual row; assignment below aligns on the
    # index, which leaves the residual row as NaN.
    effects = table.iloc[:-1]
    table['eta_sq'] = effects['sum_sq'] / ss_total
    table['omega_sq'] = (effects['sum_sq'] - effects['df'] * ms_error) / (ss_total + ms_error)

    cols = ['sum_sq', 'mean_sq', 'df', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    return table[cols]

# Display the ANOVA table with mean squares and effect sizes added.
anova_table(res)

Unnamed: 0,sum_sq,mean_sq,df,F,PR(>F),eta_sq,omega_sq
C(Embedding),18097.602151,6032.534050,3.0,4125.304140,0.000000,4.620814e-01,4.619521e-01
C(Tagging_Scheme),1.857111,1.857111,1.0,1.269972,0.260035,4.741713e-05,1.007960e-05
C(Capitalisation_Dim),7.036934,7.036934,1.0,4.812156,0.028483,1.796722e-04,1.423297e-04
C(Char_LSTM_Dim),9.587457,9.587457,1.0,6.556312,0.010593,2.447941e-04,2.074492e-04
C(Word_LSTM_Dim),35.019032,35.019032,1.0,23.947508,0.000001,8.941318e-04,8.567627e-04
C(Lower),7231.789826,7231.789826,1.0,4945.406401,0.000000,1.846474e-01,1.846032e-01
C(Embedding):C(Tagging_Scheme),9.464230,3.154743,3.0,2.157348,0.091458,2.416477e-04,1.296314e-04
C(Embedding):C(Capitalisation_Dim),9.726866,3.242289,3.0,2.217215,0.084545,2.483535e-04,1.363370e-04
C(Tagging_Scheme):C(Capitalisation_Dim),0.050563,0.050563,1.0,0.034577,0.852521,1.291019e-06,-3.604479e-05
C(Embedding):C(Char_LSTM_Dim),20.452179,6.817393,3.0,4.662024,0.003046,5.222002e-04,4.101734e-04


In [24]:
# Tukey HSD post-hoc test: pairwise differences in mean F1 between embeddings.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[EMBEDDING],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower    upper  reject
--------------------------------------------------
FastText  Glove   -1.8543  -2.7815  -0.9271  True 
FastText Wang2Vec  0.5808  -0.3472   1.5087 False 
FastText Word2Vec -9.1375  -10.0517 -8.2232  True 
 Glove   Wang2Vec  2.435    1.5128   3.3573  True 
 Glove   Word2Vec -7.2832  -8.1916  -6.3747  True 
Wang2Vec Word2Vec -9.7182  -10.6275 -8.8089  True 
--------------------------------------------------


In [25]:
# Tukey HSD post-hoc test: mean F1 difference between capitalisation-dimension settings.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[CAP_DIM],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
False   True   0.3028  -0.3686 0.9742 False 
--------------------------------------------


In [26]:
# Tukey HSD post-hoc test: mean F1 difference between lower-casing settings.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[LOWER],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff lower  upper  reject
-------------------------------------------
False   True   4.9474  4.3396 5.5553  True 
-------------------------------------------


In [27]:
# Tukey HSD post-hoc test: mean F1 difference between tagging schemes.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[TAGGING_SCHEME],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
 IOB2  IOBES   -0.034  -0.7056 0.6376 False 
--------------------------------------------


In [28]:
# Tukey HSD post-hoc test: mean F1 difference between character-LSTM dimensions.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[CHAR_DIM],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
  25     50    0.141   -0.5306 0.8127 False 
--------------------------------------------


In [29]:
# Tukey HSD post-hoc test: mean F1 difference between word-LSTM dimensions.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[WORD_DIM],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
 100    200   -0.3498  -1.0211 0.3215 False 
--------------------------------------------
