In [1]:
import json
import pandas as pd
from pathlib import Path
from pprint import pprint
import datetime
import time
from scipy import stats
import numpy as np
import matplotlib as plt

import pandas
import researchpy as rp
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

%matplotlib inline

In [2]:
# Names of the columns in the results table, held as constants so later cells
# can refer to columns without repeating string literals.

# Run bookkeeping columns.
TRAINING_ROUND = 'Training_Round'
EPOCHS = 'Epochs'
SCENARIO = 'Scenario'
# Per-run configuration columns (used below as filter / grouping keys).
EMBEDDING = 'Embedding'
TAGGING_SCHEME = 'Tagging_Scheme'
CAP_DIM = 'Capitalisation_Dim'
LOWER = 'Lower'
CHAR_DIM = 'Char_LSTM_Dim'
WORD_DIM = 'Word_LSTM_Dim'
# Score columns.
PRECISION = 'Precision'
RECALL = 'Recall'
F1_SCORE = 'F1_Score'

In [3]:
# Every column-name constant, in the same order as the results table.
columns = [
    TRAINING_ROUND,
    EPOCHS,
    SCENARIO,
    EMBEDDING,
    TAGGING_SCHEME,
    CAP_DIM,
    LOWER,
    CHAR_DIM,
    WORD_DIM,
    PRECISION,
    RECALL,
    F1_SCORE,
]

In [4]:
# Display the list to check its contents and order.
columns

['Training_Round',
 'Epochs',
 'Scenario',
 'Embedding',
 'Tagging_Scheme',
 'Capitalisation_Dim',
 'Lower',
 'Char_LSTM_Dim',
 'Word_LSTM_Dim',
 'Precision',
 'Recall',
 'F1_Score']

In [5]:
# Raise the display limit so wide tables are shown without column truncation.
pd.options.display.max_columns = 1000

# Load the results table and show it.
training_data_df = pd.read_csv('resultados.csv')
training_data_df

Unnamed: 0,Training_Round,Epochs,Scenario,Embedding,Tagging_Scheme,Capitalisation_Dim,Lower,Char_LSTM_Dim,Word_LSTM_Dim,Precision,Recall,F1_Score
0,0,5,selective,Wang2Vec,iob,False,False,25,100,58.95,54.79,56.79
1,0,5,selective,Wang2Vec,iob,False,False,25,200,61.60,52.06,56.43
2,0,5,selective,Wang2Vec,iob,False,False,50,100,66.50,47.47,55.40
3,0,5,selective,Wang2Vec,iob,False,False,50,200,58.54,50.74,54.36
4,0,5,selective,Wang2Vec,iob,False,True,25,100,66.28,67.73,67.00
5,0,5,selective,Wang2Vec,iob,False,True,25,200,71.00,62.67,66.58
6,0,5,selective,Wang2Vec,iob,False,True,50,100,70.14,65.18,67.57
7,0,5,selective,Wang2Vec,iob,False,True,50,200,66.93,63.57,65.21
8,0,5,selective,Wang2Vec,iob,True,False,25,100,58.62,57.55,58.08
9,0,5,selective,Wang2Vec,iob,True,False,25,200,64.42,50.45,56.59


In [6]:
# Keep only the rows whose `Lower` flag is True, then report how many remain.
training_data_df = training_data_df.loc[training_data_df[LOWER].eq(True)]
len(training_data_df)

640

In [7]:
# Absolute z-score of every F1 score, printed for inspection.
z = np.abs(stats.zscore(training_data_df[F1_SCORE]))
print(z)

# Drop the rows whose |z| is greater than 3; every other row is kept.
# The mask is positional (a plain boolean array), so it is applied row by row
# regardless of the frame's index labels.
training_data_df = training_data_df[~(np.asarray(z) > 3)]
training_data_df

[0.70267842 0.6721077  0.74416725 0.57238893 0.64444848 0.63789761
 0.68448157 0.66555684 0.59640878 0.629891   0.67865857 4.03723865
 0.6495436  0.70195055 0.68302582 0.65027148 3.79995165 0.6975833
 0.65391085 0.70558992 0.77255435 0.01629301 0.68957669 0.67720282
 0.66555684 0.69248818 1.14612216 0.7710986  0.72815402 0.77473797
 0.64808785 0.60223178 4.17407901 0.66701258 0.71359654 0.78056097
 0.69685543 0.74416725 0.73907213 0.71359654 0.59422516 0.74635088
 0.67720282 0.70558992 0.48358827 0.75799686 0.67574707 0.67647495
 0.61387776 0.59131366 0.66191746 0.62479588 0.54764121 0.6975833
 0.65318297 0.68957669 0.64444848 4.10493096 3.43091941 0.64153699
 0.66482896 0.72233103 0.76018049 0.66846833 0.65682234 3.71697398
 0.70704567 3.38360758 0.79439058 1.49186242 0.73543276 0.68229794
 4.05325188 0.60951052 0.63716974 0.5789398  0.70704567 0.66410109
 0.4573848  0.66191746 0.7034063  0.65755022 0.75290174 4.13404593
 0.76527561 0.6524551  0.63571399 0.71869165 0.69248818 0.650999

Unnamed: 0,Training_Round,Epochs,Scenario,Embedding,Tagging_Scheme,Capitalisation_Dim,Lower,Char_LSTM_Dim,Word_LSTM_Dim,Precision,Recall,F1_Score
4,0,5,selective,Wang2Vec,iob,False,True,25,100,66.28,67.73,67.00
5,0,5,selective,Wang2Vec,iob,False,True,25,200,71.00,62.67,66.58
6,0,5,selective,Wang2Vec,iob,False,True,50,100,70.14,65.18,67.57
7,0,5,selective,Wang2Vec,iob,False,True,50,200,66.93,63.57,65.21
12,0,5,selective,Wang2Vec,iob,True,True,25,100,70.32,62.53,66.20
13,0,5,selective,Wang2Vec,iob,True,True,25,200,69.64,62.93,66.11
14,0,5,selective,Wang2Vec,iob,True,True,50,100,69.42,64.29,66.75
15,0,5,selective,Wang2Vec,iob,True,True,50,200,68.37,64.72,66.49
20,0,5,selective,Wang2Vec,iobes,False,True,25,100,67.04,64.11,65.54
21,0,5,selective,Wang2Vec,iobes,False,True,25,200,69.78,62.60,66.00


In [8]:
# F1 descriptive statistics per tagging scheme, highest mean first.
(
    training_data_df
    .groupby([TAGGING_SCHEME])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Tagging_Scheme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
iobes,307.0,59.534723,8.387969,22.55,50.21,63.39,65.74,67.99
iob,308.0,59.473766,9.106533,16.37,50.8175,63.395,65.705,68.26


In [9]:
# F1 descriptive statistics per embedding, highest mean first.
(
    training_data_df
    .groupby([EMBEDDING])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Wang2Vec,150.0,65.047333,6.762089,17.74,65.945,66.445,66.9825,68.26
FastText,151.0,63.687483,6.203814,22.55,64.29,65.07,65.72,67.05
Glove,154.0,61.499545,5.865458,16.37,61.69,62.64,63.285,64.89
Word2Vec,160.0,48.439,3.254962,16.37,47.49,48.95,49.945,52.42


In [10]:
# F1 descriptive statistics per capitalisation setting, highest mean first.
(
    training_data_df
    .groupby([CAP_DIM])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Capitalisation_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
True,314.0,59.734713,8.585693,16.37,50.325,63.55,66.0275,68.26
False,301.0,59.263721,8.922388,16.37,50.67,63.18,65.6,68.07


In [11]:
# F1 descriptive statistics per Char_LSTM_Dim value, highest mean first.
(
    training_data_df
    .groupby([CHAR_DIM])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Char_LSTM_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
25,307.0,59.880391,8.112384,16.37,50.63,63.39,65.725,68.26
50,308.0,59.129221,9.337059,16.37,50.3325,63.395,65.74,68.07


In [12]:
# F1 descriptive statistics per Word_LSTM_Dim value, highest mean first.
(
    training_data_df
    .groupby([WORD_DIM])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Word_LSTM_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100,310.0,59.744903,8.432518,22.55,50.56,63.46,65.925,68.26
200,305.0,59.259541,9.064992,16.37,50.43,63.39,65.61,68.07


In [13]:
# F1 descriptive statistics for every combination of the five factors,
# highest mean first.
(
    training_data_df
    .groupby([EMBEDDING, TAGGING_SCHEME, CAP_DIM, WORD_DIM, CHAR_DIM])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean,std,min,25%,50%,75%,max
Embedding,Tagging_Scheme,Capitalisation_Dim,Word_LSTM_Dim,Char_LSTM_Dim,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Wang2Vec,iob,False,100,50,10.0,66.901000,0.721641,65.44,66.4250,67.105,67.4925,67.69
Wang2Vec,iobes,True,200,25,10.0,66.788000,0.771432,65.54,66.2750,66.845,67.2000,67.99
Wang2Vec,iob,True,100,50,10.0,66.668000,0.790974,65.00,66.3300,66.755,67.2925,67.64
Wang2Vec,iobes,True,100,25,10.0,66.550000,1.105039,63.99,66.2625,66.440,67.2775,67.87
Wang2Vec,iobes,True,100,50,10.0,66.446000,1.103260,63.63,66.3150,66.630,66.7175,67.79
Wang2Vec,iobes,True,200,50,10.0,66.328000,0.419571,65.60,66.2125,66.460,66.6125,66.78
Wang2Vec,iobes,False,200,50,9.0,66.315556,0.910647,64.87,66.1600,66.180,66.8500,67.94
Wang2Vec,iobes,False,100,25,9.0,66.250000,0.560134,65.51,65.6300,66.470,66.6300,66.92
Wang2Vec,iob,False,200,25,9.0,66.227778,0.446760,65.47,65.8600,66.250,66.5100,66.93
Wang2Vec,iob,False,100,25,8.0,66.218750,1.454701,63.18,65.6825,66.685,67.0775,67.74


In [14]:
# F1 descriptive statistics per training round, highest mean first.
(
    training_data_df
    .groupby([TRAINING_ROUND])[F1_SCORE]
    .describe()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Training_Round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,62.0,60.24629,7.522781,41.6,51.29,63.39,66.1475,67.99
2,62.0,60.017419,9.290978,16.37,52.24,64.155,66.1575,68.07
3,61.0,59.762295,8.852868,24.32,50.19,63.61,65.93,67.79
0,62.0,59.738065,8.352787,29.93,51.02,63.175,65.8275,67.57
6,62.0,59.645645,8.484412,27.58,50.205,63.415,65.58,67.74
5,60.0,59.601167,8.766373,24.06,49.7725,63.35,66.0975,67.86
7,60.0,59.4485,7.581877,43.81,50.4725,62.75,65.62,67.53
8,64.0,59.151562,9.073109,22.55,50.57,63.19,65.4825,67.73
4,59.0,58.812034,10.009711,16.37,50.545,63.63,65.58,68.26
9,63.0,58.616667,9.716235,17.74,50.22,63.12,65.76,67.87


In [15]:
import pandas
import researchpy as rp
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

In [16]:
# researchpy summary of the F1 column across all rows still in the frame.
rp.summary_cont(training_data_df[F1_SCORE])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,F1_Score,615.0,59.504195,8.748133,0.352759,58.811435,60.196955


In [17]:
# researchpy summary per embedding group: summarise the grouped frame, then
# select the F1 column from the result.
rp.summary_cont(training_data_df.groupby(EMBEDDING))[F1_SCORE]





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FastText,151,63.687483,6.203814,0.504859,62.697959,64.677008
Glove,154,61.499545,5.865458,0.472652,60.573147,62.425943
Wang2Vec,150,65.047333,6.762089,0.552122,63.965174,66.129493
Word2Vec,160,48.439,3.254962,0.257327,47.934638,48.943362


In [18]:
# Print the column-name constants to show the literal column names they hold.
print(F1_SCORE, EMBEDDING, TAGGING_SCHEME, CAP_DIM, CHAR_DIM, WORD_DIM)

F1_Score Embedding Tagging_Scheme Capitalisation_Dim Char_LSTM_Dim Word_LSTM_Dim


In [19]:
# Fit a full-factorial OLS model of the F1 score on the five factors.
# Each factor is wrapped in C(...) so it is treated as categorical, and the
# `*` operator expands to every main effect plus all of their interactions.
# The formula is assembled from the column-name constants so it cannot drift
# from the names used everywhere else in the notebook; the resulting string is
# 'F1_Score ~ C(Embedding)*C(Tagging_Scheme)*C(Capitalisation_Dim)*C(Char_LSTM_Dim)*C(Word_LSTM_Dim)'.
factor_terms = '*'.join(
    f'C({factor})'
    for factor in (EMBEDDING, TAGGING_SCHEME, CAP_DIM, CHAR_DIM, WORD_DIM)
)
model = ols(f'{F1_SCORE} ~ {factor_terms}', data=training_data_df).fit()

# Seeing if the overall model is significant
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .20f}")

Overall model F( 63, 551) =  14.395, p =  0.00000000000000000000


In [20]:
# Full regression report for the fitted model (coefficients and diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,F1_Score,R-squared:,0.622
Model:,OLS,Adj. R-squared:,0.579
Method:,Least Squares,F-statistic:,14.39
Date:,"Tue, 09 Apr 2019",Prob (F-statistic):,2.05e-81
Time:,20:42:35,Log-Likelihood:,-1906.8
No. Observations:,615,AIC:,3942.0
Df Residuals:,551,BIC:,4225.0
Df Model:,63,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,65.2590,1.795,36.350,0.000,61.733,68.785
C(Embedding)[T.Glove],-2.5534,2.609,-0.979,0.328,-7.677,2.570
C(Embedding)[T.Wang2Vec],0.9598,2.693,0.356,0.722,-4.330,6.249
C(Embedding)[T.Word2Vec],-15.6220,2.539,-6.153,0.000,-20.609,-10.635
C(Tagging_Scheme)[T.iobes],-0.8910,2.539,-0.351,0.726,-5.878,4.096
C(Capitalisation_Dim)[T.True],0.5770,2.539,0.227,0.820,-4.410,5.564
C(Char_LSTM_Dim)[T.50],-1.8868,2.609,-0.723,0.470,-7.011,3.237
C(Word_LSTM_Dim)[T.200],-3.1368,2.609,-1.203,0.230,-8.261,1.987
C(Embedding)[T.Glove]:C(Tagging_Scheme)[T.iobes],-0.5446,3.689,-0.148,0.883,-7.791,6.702

0,1,2,3
Omnibus:,628.102,Durbin-Watson:,2.095
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23205.834
Skew:,-4.794,Prob(JB):,0.0
Kurtosis:,31.525,Cond. No.,227.0


In [21]:
# ANOVA table for the fitted model; typ=2 requests Type II sums of squares.
res = sm.stats.anova_lm(model, typ= 2)

In [22]:
# Display the ANOVA table.
res

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Embedding),27459.467777,3.0,283.982058,2.201742e-111
C(Tagging_Scheme),0.760611,1.0,0.023598,0.8779674
C(Capitalisation_Dim),12.855036,1.0,0.398835,0.5279534
C(Char_LSTM_Dim),87.003661,1.0,2.69934,0.1009603
C(Word_LSTM_Dim),27.854516,1.0,0.864203,0.3529723
C(Embedding):C(Tagging_Scheme),117.904354,3.0,1.219351,0.3019124
C(Embedding):C(Capitalisation_Dim),1.966871,3.0,0.020341,0.9960584
C(Tagging_Scheme):C(Capitalisation_Dim),80.739923,1.0,2.505004,0.114059
C(Embedding):C(Char_LSTM_Dim),155.475297,3.0,1.607904,0.1864818
C(Tagging_Scheme):C(Char_LSTM_Dim),8.241803,1.0,0.255707,0.6132873


In [23]:
# Calculating effect size
def anova_table(aov):
    """Return an ANOVA table extended with mean squares and effect sizes.

    Parameters
    ----------
    aov : pandas.DataFrame
        ANOVA table with the columns 'sum_sq', 'df', 'F' and 'PR(>F)'.
        The LAST row is treated as the residual (error) row -- this is assumed
        to match the layout of the table passed in; verify if the table comes
        from a different source.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns
        ['sum_sq', 'mean_sq', 'df', 'F', 'PR(>F)', 'eta_sq', 'omega_sq'] where

        * mean_sq  = sum_sq / df
        * eta_sq   = sum_sq / SS_total
        * omega_sq = (sum_sq - df * MS_error) / (SS_total + MS_error)

        SS_total is the sum of the whole 'sum_sq' column and MS_error is the
        mean square of the last row.  'eta_sq' and 'omega_sq' are NaN for the
        last row itself.

    Notes
    -----
    The input frame is not modified; all new columns are added to a copy.
    """
    # Work on a copy so the caller's table does not silently gain columns.
    table = aov.copy()
    table['mean_sq'] = table['sum_sq'] / table['df']

    # The last row is addressed by POSITION with .iloc: the index holds string
    # labels, and label-based `series[-1]` is deprecated / unsupported in
    # recent pandas versions.
    ms_error = table['mean_sq'].iloc[-1]
    ss_total = table['sum_sq'].sum()

    # Effect sizes are defined for every row except the last one; assigning the
    # shorter Series aligns on the index and leaves NaN in the last row.
    effects = table.iloc[:-1]
    table['eta_sq'] = effects['sum_sq'] / ss_total
    table['omega_sq'] = (
        (effects['sum_sq'] - effects['df'] * ms_error) / (ss_total + ms_error)
    )

    cols = ['sum_sq', 'mean_sq', 'df', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    return table[cols]

# Show the ANOVA table with the mean-square and effect-size columns added.
anova_table(res)

Unnamed: 0,sum_sq,mean_sq,df,F,PR(>F),eta_sq,omega_sq
C(Embedding),27459.467777,9153.155926,3.0,283.982058,2.201742e-111,0.584314,0.581857
C(Tagging_Scheme),0.760611,0.760611,1.0,0.023598,0.8779674,1.6e-05,-0.000669
C(Capitalisation_Dim),12.855036,12.855036,1.0,0.398835,0.5279534,0.000274,-0.000412
C(Char_LSTM_Dim),87.003661,87.003661,1.0,2.69934,0.1009603,0.001851,0.001165
C(Word_LSTM_Dim),27.854516,27.854516,1.0,0.864203,0.3529723,0.000593,-9.3e-05
C(Embedding):C(Tagging_Scheme),117.904354,39.301451,3.0,1.219351,0.3019124,0.002509,0.000451
C(Embedding):C(Capitalisation_Dim),1.966871,0.655624,3.0,0.020341,0.9960584,4.2e-05,-0.002014
C(Tagging_Scheme):C(Capitalisation_Dim),80.739923,80.739923,1.0,2.505004,0.114059,0.001718,0.001032
C(Embedding):C(Char_LSTM_Dim),155.475297,51.825099,3.0,1.607904,0.1864818,0.003308,0.00125
C(Tagging_Scheme):C(Char_LSTM_Dim),8.241803,8.241803,1.0,0.255707,0.6132873,0.000175,-0.00051


In [24]:
# Tukey HSD pairwise comparison of mean F1 score between embeddings.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[EMBEDDING],
)
mc_results = mc.tukeyhsd()
print(mc_results)

 Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower    upper   reject
---------------------------------------------------
FastText  Glove   -2.1879  -3.8562  -0.5197   True 
FastText Wang2Vec  1.3598  -0.3194   3.0391  False 
FastText Word2Vec -15.2485 -16.9012 -13.5958  True 
 Glove   Wang2Vec  3.5478   1.8768   5.2188   True 
 Glove   Word2Vec -13.0605 -14.7049 -11.4162  True 
Wang2Vec Word2Vec -16.6083 -18.2638 -14.9528  True 
---------------------------------------------------


In [25]:
# Tukey HSD comparison of mean F1 score between capitalisation settings.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[CAP_DIM],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
False   True   0.471   -0.9155 1.8575 False 
--------------------------------------------


In [26]:
# Tukey HSD comparison of mean F1 score between tagging schemes.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[TAGGING_SCHEME],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
 iob   iobes   0.061   -1.3257 1.4476 False 
--------------------------------------------


In [27]:
# Tukey HSD comparison of mean F1 score between Char_LSTM_Dim values.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[CHAR_DIM],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
  25     50   -0.7512  -2.1365 0.6342 False 
--------------------------------------------


In [28]:
# Tukey HSD comparison of mean F1 score between Word_LSTM_Dim values.
mc = statsmodels.stats.multicomp.MultiComparison(
    data=training_data_df[F1_SCORE],
    groups=training_data_df[WORD_DIM],
)
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
 100    200   -0.4854  -1.8715 0.9008 False 
--------------------------------------------
