In [1]:
import json
import pandas as pd
from pathlib import Path
from pprint import pprint
import datetime
import time
from scipy import stats
import numpy as np
import matplotlib as plt

import pandas
import researchpy as rp
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

%matplotlib inline

In [2]:
TRAINING_ROUND = 'Training_Round'
EPOCHS = 'Epochs'
SCENARIO = 'Scenario'
EMBEDDING = 'Embedding'
TAGGING_SCHEME = 'Tagging_Scheme'
CAP_DIM = 'Capitalisation_Dim'
LOWER = 'Lower'
CHAR_DIM = 'Char_LSTM_Dim'
WORD_DIM = 'Word_LSTM_Dim'
PRECISION = 'Precision'
RECALL = 'Recall'
F1_SCORE = 'F1_Score'

In [3]:
columns = [TRAINING_ROUND, EPOCHS, SCENARIO, EMBEDDING, TAGGING_SCHEME, CAP_DIM, LOWER, CHAR_DIM, WORD_DIM, PRECISION, RECALL, F1_SCORE]

In [4]:
columns

['Training_Round',
 'Epochs',
 'Scenario',
 'Embedding',
 'Tagging_Scheme',
 'Capitalisation_Dim',
 'Lower',
 'Char_LSTM_Dim',
 'Word_LSTM_Dim',
 'Precision',
 'Recall',
 'F1_Score']

In [5]:
pd.set_option('display.max_columns', 1000)
training_data_df = pd.read_csv('resultados_sem_outliers.csv')
training_data_df

Unnamed: 0.1,Unnamed: 0,Training_Round,Epochs,Scenario,Embedding,Tagging_Scheme,Capitalisation_Dim,Lower,Char_LSTM_Dim,Word_LSTM_Dim,Precision,Recall,F1_Score
0,0,0,5,selective,Wang2Vec,IOB2,False,False,25,100,58.95,54.79,56.79
1,1,0,5,selective,Wang2Vec,IOB2,False,False,25,200,61.60,52.06,56.43
2,2,0,5,selective,Wang2Vec,IOB2,False,False,50,100,66.50,47.47,55.40
3,3,0,5,selective,Wang2Vec,IOB2,False,False,50,200,58.54,50.74,54.36
4,4,0,5,selective,Wang2Vec,IOB2,False,True,25,100,66.28,67.73,67.00
5,5,0,5,selective,Wang2Vec,IOB2,False,True,25,200,71.00,62.67,66.58
6,6,0,5,selective,Wang2Vec,IOB2,False,True,50,100,70.14,65.18,67.57
7,7,0,5,selective,Wang2Vec,IOB2,False,True,50,200,66.93,63.57,65.21
8,8,0,5,selective,Wang2Vec,IOB2,True,False,25,100,58.62,57.55,58.08
9,9,0,5,selective,Wang2Vec,IOB2,True,False,25,200,64.42,50.45,56.59


In [6]:
training_data_df = training_data_df[training_data_df.Scenario == 'selective']
len(training_data_df)

1071

In [7]:
training_data_df.groupby([TAGGING_SCHEME]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Tagging_Scheme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IOBES,527.0,59.136546,4.942672,50.13,55.35,57.03,64.03,67.99
IOB2,544.0,58.837868,5.1267,50.03,54.895,56.66,64.05,68.26


In [8]:
training_data_df.groupby([EMBEDDING]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Wang2Vec,292.0,61.176096,5.294541,50.13,56.21,58.33,66.47,68.26
FastText,290.0,60.546793,4.594724,50.69,56.46,58.395,65.16,67.05
Glove,300.0,58.657,4.039428,50.18,55.0,57.345,62.6625,64.89
Word2Vec,189.0,53.723122,1.836376,50.03,52.38,54.05,55.29,56.92


In [9]:
training_data_df.groupby([CAP_DIM]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Capitalisation_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
True,541.0,59.178521,5.051205,50.03,55.3,57.02,64.32,68.26
False,530.0,58.787132,5.019212,50.06,54.965,56.71,63.845,68.07


In [10]:
training_data_df.groupby([CHAR_DIM]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Char_LSTM_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
25,539.0,58.987514,5.062034,50.06,54.97,56.84,64.14,68.26
50,532.0,58.982124,5.015971,50.03,55.3275,56.86,64.015,68.07


In [11]:
training_data_df.groupby([WORD_DIM]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Word_LSTM_Dim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100,545.0,59.088165,5.058041,50.06,55.26,56.96,64.27,68.26
200,526.0,58.877776,5.017367,50.03,54.96,56.71,63.895,68.07


In [12]:
training_data_df.groupby([LOWER]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Lower,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
True,477.0,63.394486,4.356872,50.03,62.66,64.55,66.17,68.26
False,594.0,55.443754,1.52913,50.13,54.53,55.55,56.53,59.82


In [13]:
training_data_df.groupby([EMBEDDING, TAGGING_SCHEME, CAP_DIM, WORD_DIM, CHAR_DIM, LOWER]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count,mean,std,min,25%,50%,75%,max
Embedding,Tagging_Scheme,Capitalisation_Dim,Word_LSTM_Dim,Char_LSTM_Dim,Lower,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Wang2Vec,IOB2,False,100,50,True,10.0,66.901000,0.721641,65.44,66.4250,67.105,67.4925,67.69
Wang2Vec,IOB2,True,100,25,True,9.0,66.890000,1.152801,64.87,66.2000,66.920,67.8600,68.26
Wang2Vec,IOB2,False,200,50,True,6.0,66.818333,1.050950,65.21,66.2075,67.155,67.3600,68.07
Wang2Vec,IOBES,True,200,25,True,10.0,66.788000,0.771432,65.54,66.2750,66.845,67.2000,67.99
Wang2Vec,IOB2,True,100,50,True,10.0,66.668000,0.790974,65.00,66.3300,66.755,67.2925,67.64
Wang2Vec,IOB2,True,200,50,True,9.0,66.595556,0.441902,65.94,66.2800,66.650,66.8200,67.22
Wang2Vec,IOBES,True,100,25,True,10.0,66.550000,1.105039,63.99,66.2625,66.440,67.2775,67.87
Wang2Vec,IOBES,True,100,50,True,10.0,66.446000,1.103260,63.63,66.3150,66.630,66.7175,67.79
Wang2Vec,IOBES,False,100,50,True,7.0,66.371429,0.869127,64.95,65.8950,66.650,66.8150,67.58
Wang2Vec,IOBES,True,200,50,True,10.0,66.328000,0.419571,65.60,66.2125,66.460,66.6125,66.78


In [14]:
training_data_df.groupby([TRAINING_ROUND]).describe()[F1_SCORE].sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Training_Round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,103.0,59.312524,5.067188,50.09,55.105,56.93,64.475,67.79
2,110.0,59.282727,5.085964,50.31,55.2975,56.835,64.4375,68.07
5,100.0,59.2507,5.076504,50.19,55.2225,56.975,64.3025,67.86
6,107.0,59.241028,4.861291,50.13,55.585,57.16,63.97,67.74
1,108.0,58.971944,5.177517,50.34,54.8925,56.8,63.8425,67.99
9,109.0,58.924404,4.925725,50.47,55.43,56.8,63.69,67.87
8,108.0,58.760185,5.18975,50.06,54.6525,56.555,64.1425,67.73
0,114.0,58.75114,5.055364,50.03,54.98,56.735,63.5525,67.57
7,103.0,58.717282,5.023926,50.13,54.875,56.84,63.475,67.53
4,109.0,58.672202,5.048796,50.16,54.91,56.8,63.85,68.26


In [15]:
rp.summary_cont(training_data_df[F1_SCORE])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,F1_Score,1071.0,58.984837,5.036851,0.153909,58.682839,59.286835


In [16]:
rp.summary_cont(training_data_df.groupby(EMBEDDING))[F1_SCORE]





Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
Embedding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FastText,290,60.546793,4.594724,0.269811,60.017963,61.075624
Glove,300,58.657,4.039428,0.233216,58.199896,59.114104
Wang2Vec,292,61.176096,5.294541,0.30984,60.56881,61.783381
Word2Vec,189,53.723122,1.836376,0.133577,53.461311,53.984932


In [17]:
print(F1_SCORE, EMBEDDING, TAGGING_SCHEME, CAP_DIM, CHAR_DIM, WORD_DIM, LOWER)

F1_Score Embedding Tagging_Scheme Capitalisation_Dim Char_LSTM_Dim Word_LSTM_Dim Lower


In [18]:
# Fits the model with the interaction term
# This will also automatically include the main effects for each factor
model = ols('F1_Score ~ C(Embedding)*C(Tagging_Scheme)*C(Capitalisation_Dim)*C(Char_LSTM_Dim)*C(Word_LSTM_Dim)*C(Lower)', training_data_df).fit()

# Seeing if the overall model is significant
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .20f}")

Overall model F( 126, 944) =  88.612, p =  0.00000000000000000000


In [30]:
model.mse_model, model.mse_resid, model.mse_total, model.f_pvalue

(198.64709406727505, 2.241765989331115, 25.369870043369374, 0.0)

In [20]:
model.summary()

0,1,2,3
Dep. Variable:,F1_Score,R-squared:,0.922
Model:,OLS,Adj. R-squared:,0.912
Method:,Least Squares,F-statistic:,88.61
Date:,"Tue, 09 Apr 2019",Prob (F-statistic):,0.0
Time:,23:10:12,Log-Likelihood:,-1884.4
No. Observations:,1071,AIC:,4023.0
Df Residuals:,944,BIC:,4655.0
Df Model:,126,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,56.7950,0.473,119.954,0.000,55.866,57.724
C(Embedding)[T.Glove],-1.1490,0.670,-1.716,0.086,-2.463,0.165
C(Embedding)[T.Wang2Vec],-1.1730,0.670,-1.752,0.080,-2.487,0.141
C(Embedding)[T.Word2Vec],-2.2372,0.688,-3.252,0.001,-3.587,-0.887
C(Tagging_Scheme)[T.IOBES],-0.8836,0.738,-1.197,0.231,-2.332,0.564
C(Capitalisation_Dim)[T.True],-0.5390,0.670,-0.805,0.421,-1.853,0.775
C(Char_LSTM_Dim)[T.50],-0.4817,0.688,-0.700,0.484,-1.832,0.868
C(Word_LSTM_Dim)[T.200],-1.3887,0.710,-1.955,0.051,-2.783,0.005
C(Lower)[T.True],8.4640,0.670,12.641,0.000,7.150,9.778

0,1,2,3
Omnibus:,743.591,Durbin-Watson:,2.045
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17390.618
Skew:,-2.863,Prob(JB):,0.0
Kurtosis:,21.892,Cond. No.,1.34e+16


In [21]:
res = sm.stats.anova_lm(model, typ= 2)



In [22]:
res

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Embedding),3588.131294,3.0,533.527482,1.059626e-202
C(Tagging_Scheme),5.795578,1.0,2.585274,1.081962e-01
C(Capitalisation_Dim),7.771892,1.0,3.466862,6.292056e-02
C(Char_LSTM_Dim),2.359806,1.0,1.052655,3.051592e-01
C(Word_LSTM_Dim),26.735871,1.0,11.926254,5.780777e-04
C(Lower),13126.350994,1.0,5855.361825,0.000000e+00
C(Embedding):C(Tagging_Scheme),8.064801,3.0,1.199174,3.089513e-01
C(Embedding):C(Capitalisation_Dim),10.898361,3.0,1.620502,1.829920e-01
C(Tagging_Scheme):C(Capitalisation_Dim),1.162554,1.0,0.518588,4.716228e-01
C(Embedding):C(Char_LSTM_Dim),19.829510,3.0,2.948495,3.194173e-02


In [23]:
# Calculating effect size
def anova_table(aov):
    aov['mean_sq'] = aov[:]['sum_sq']/aov[:]['df']
    
    aov['eta_sq'] = aov[:-1]['sum_sq']/sum(aov['sum_sq'])
    
    aov['omega_sq'] = (aov[:-1]['sum_sq']-(aov[:-1]['df']*aov['mean_sq'][-1]))/(sum(aov['sum_sq'])+aov['mean_sq'][-1])
    
    cols = ['sum_sq', 'mean_sq', 'df', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    aov = aov[cols]
    return aov

anova_table(res)

Unnamed: 0,sum_sq,mean_sq,df,F,PR(>F),eta_sq,omega_sq
C(Embedding),3588.131294,1196.043765,3.0,533.527482,1.059626e-202,0.155234,0.154928
C(Tagging_Scheme),5.795578,5.795578,1.0,2.585274,1.081962e-01,0.000251,0.000154
C(Capitalisation_Dim),7.771892,7.771892,1.0,3.466862,6.292056e-02,0.000336,0.000239
C(Char_LSTM_Dim),2.359806,2.359806,1.0,1.052655,3.051592e-01,0.000102,0.000005
C(Word_LSTM_Dim),26.735871,26.735871,1.0,11.926254,5.780777e-04,0.001157,0.001060
C(Lower),13126.350994,13126.350994,1.0,5855.361825,0.000000e+00,0.567888,0.567736
C(Embedding):C(Tagging_Scheme),8.064801,2.688267,3.0,1.199174,3.089513e-01,0.000349,0.000058
C(Embedding):C(Capitalisation_Dim),10.898361,3.632787,3.0,1.620502,1.829920e-01,0.000471,0.000181
C(Tagging_Scheme):C(Capitalisation_Dim),1.162554,1.162554,1.0,0.518588,4.716228e-01,0.000050,-0.000047
C(Embedding):C(Char_LSTM_Dim),19.829510,6.609837,3.0,2.948495,3.194173e-02,0.000858,0.000567


In [24]:
mc = statsmodels.stats.multicomp.MultiComparison(training_data_df[F1_SCORE], training_data_df[EMBEDDING])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
 group1   group2  meandiff  lower   upper  reject
-------------------------------------------------
FastText  Glove   -1.8898  -2.8019 -0.9777  True 
FastText Wang2Vec  0.6293   -0.289  1.5476 False 
FastText Word2Vec -6.8237  -7.8591 -5.7882  True 
 Glove   Wang2Vec  2.5191   1.6085  3.4296  True 
 Glove   Word2Vec -4.9339  -5.9625 -3.9053  True 
Wang2Vec Word2Vec  -7.453   -8.487 -6.4189  True 
-------------------------------------------------


In [25]:
mc = statsmodels.stats.multicomp.MultiComparison(training_data_df[F1_SCORE], training_data_df[CAP_DIM])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
False   True   0.3914  -0.2125 0.9952 False 
--------------------------------------------


In [26]:
mc = statsmodels.stats.multicomp.MultiComparison(training_data_df[F1_SCORE], training_data_df[LOWER])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff lower  upper  reject
-------------------------------------------
False   True   7.9507  7.5741 8.3274  True 
-------------------------------------------


In [27]:
mc = statsmodels.stats.multicomp.MultiComparison(training_data_df[F1_SCORE], training_data_df[TAGGING_SCHEME])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
 IOB2  IOBES   0.2987  -0.3054 0.9028 False 
--------------------------------------------


In [28]:
mc = statsmodels.stats.multicomp.MultiComparison(training_data_df[F1_SCORE], training_data_df[CHAR_DIM])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
  25     50   -0.0054  -0.6097 0.5989 False 
--------------------------------------------


In [29]:
mc = statsmodels.stats.multicomp.MultiComparison(training_data_df[F1_SCORE], training_data_df[WORD_DIM])
mc_results = mc.tukeyhsd()
print(mc_results)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
 100    200   -0.2104  -0.8146 0.3939 False 
--------------------------------------------
