# Capstone 1 Statistical Data Analysis

### Recode the data set to prep for this round

In [1]:
#import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

In [2]:
# Read the SkillCraft1 CSV straight from the UCI archive into `base`.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00272/SkillCraft1_Dataset.csv'
base = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Age, HoursPerWeek and TotalHours are meant to be numeric: convert each one
# with pd.to_numeric, letting errors='coerce' turn unparseable entries into NaN.
for numeric_col in ('Age', 'HoursPerWeek', 'TotalHours'):
    base[numeric_col] = pd.to_numeric(base[numeric_col], errors='coerce')

In [4]:
#clean only outliers, keeping the NaNs of the pro subset
# Boolean masks replace the set_index/drop/reset_index round trip: they leave
# the column order untouched and do not raise KeyError when the outlier value
# is absent (for example when this cell is re-run on already-cleaned data).
# NaN != value evaluates to True, so rows with missing values are retained.
TOTAL_HOURS_OUTLIER = 1000000
GAP_BETWEEN_PACS_OUTLIER = 237.1429

base2 = base[base['TotalHours'] != TOTAL_HOURS_OUTLIER]
#remove Gap Between PACs outlier
base2 = base2[base2['GapBetweenPACs'] != GAP_BETWEEN_PACS_OUTLIER]
# Renumber the rows 0..n-1, as the original reset_index() did.
base2 = base2.reset_index(drop=True)

## Begin systematically working through the data set using ANOVA, comparing the pros to the rest of the data set on a per-variable basis

#### Pros vs. All: APM

In [5]:
# One-way ANOVA of APM across the LeagueIndex groups 1 through 8.
f_APM, p_APM = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'APM'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_APM, p_APM

(438.7621400179222, 0.0)

since it is significant, will perform tukey range test

In [6]:
# Tukey HSD post-hoc test: pairwise comparison of mean APM between leagues.
mc_APM = MultiComparison(data=base2['APM'], groups=base2['LeagueIndex'])
result_APM = mc_APM.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_APM, mc_APM.groupsunique, sep='\n')

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj  lower    upper   reject
-----------------------------------------------------
     1      2  15.1078 0.001   4.3482  25.8673   True
     1      3  30.2981 0.001  20.2078  40.3884   True
     1      4   46.174 0.001  36.4614  55.8867   True
     1      5  71.6613 0.001  61.9425  81.3801   True
     1      6  99.0101 0.001  89.0481  108.972   True
     1      7 129.8825 0.001 108.6762 151.0888   True
     1      8 207.6681 0.001 189.9296 225.4065   True
     2      3  15.1903 0.001   7.3822  22.9985   True
     2      4  31.0662 0.001  23.7526  38.3799   True
     2      5  56.5536 0.001  49.2318  63.8754   True
     2      6  83.9023 0.001  76.2607  91.5439   True
     2      7 114.7748 0.001  94.5545 134.9951   True
     2      8 192.5603 0.001 176.0132 209.1074   True
     3      4  15.8759 0.001   9.5882  22.1636   True
     3      5  41.3632 0.001  35.0661  47.6604   True
     3      6   68.712 0.001

APM is different across all of the leagues at a significance of 0.05

#### Pros vs. All: SelectByHotkeys

In [7]:
# One-way ANOVA of SelectByHotkeys across the LeagueIndex groups 1 through 8.
f_SBH, p_SBH = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'SelectByHotkeys'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_SBH, p_SBH

(231.7572826491868, 3.3676138216433242e-282)

Small p-value, do Tukey

In [8]:
# Tukey HSD post-hoc test: pairwise comparison of mean SelectByHotkeys between leagues.
mc_SBH = MultiComparison(data=base2['SelectByHotkeys'], groups=base2['LeagueIndex'])
result_SBH = mc_SBH.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_SBH, mc_SBH.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2   0.0004    0.9 -0.0008 0.0017  False
     1      3   0.0011 0.0801 -0.0001 0.0023  False
     1      4   0.0021  0.001  0.0009 0.0032   True
     1      5   0.0039  0.001  0.0027  0.005   True
     1      6   0.0064  0.001  0.0052 0.0075   True
     1      7   0.0083  0.001  0.0059 0.0108   True
     1      8   0.0199  0.001  0.0178 0.0219   True
     2      3   0.0007 0.3562 -0.0002 0.0016  False
     2      4   0.0016  0.001  0.0008 0.0025   True
     2      5   0.0034  0.001  0.0026 0.0043   True
     2      6   0.0059  0.001   0.005 0.0068   True
     2      7   0.0079  0.001  0.0055 0.0102   True
     2      8   0.0194  0.001  0.0175 0.0214   True
     3      4    0.001 0.0015  0.0002 0.0017   True
     3      5   0.0028  0.001   0.002 0.0035   True
     3      6   0.0052  0.001  0.0045  0.006   True
     3      

In Select By Hotkeys, there are some cases where the mean does not significantly change between adjacent groups (Bronze-Silver, Silver-Gold, and Master-Grandmaster), as well as one two-league jump (Bronze-Gold).

#### Pros vs. All: AssignToHotkeys

In [9]:
# One-way ANOVA of AssignToHotkeys across the LeagueIndex groups 1 through 8.
f_ATH, p_ATH = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'AssignToHotkeys'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_ATH, p_ATH

(223.88940721002388, 4.279421546449828e-274)

Continue to Tukeys

In [10]:
# Tukey HSD post-hoc test: pairwise comparison of mean AssignToHotkeys between leagues.
mc_ATH = MultiComparison(data=base2['AssignToHotkeys'], groups=base2['LeagueIndex'])
result_ATH = mc_ATH.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_ATH, mc_ATH.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
     1      2      0.0 0.4646   -0.0 0.0001  False
     1      3   0.0001  0.001    0.0 0.0001   True
     1      4   0.0002  0.001 0.0001 0.0002   True
     1      5   0.0002  0.001 0.0002 0.0003   True
     1      6   0.0003  0.001 0.0003 0.0004   True
     1      7   0.0005  0.001 0.0004 0.0006   True
     1      8   0.0008  0.001 0.0007 0.0008   True
     2      3   0.0001  0.001    0.0 0.0001   True
     2      4   0.0001  0.001 0.0001 0.0002   True
     2      5   0.0002  0.001 0.0002 0.0002   True
     2      6   0.0003  0.001 0.0003 0.0003   True
     2      7   0.0005  0.001 0.0004 0.0006   True
     2      8   0.0007  0.001 0.0006 0.0008   True
     3      4   0.0001  0.001    0.0 0.0001   True
     3      5   0.0001  0.001 0.0001 0.0002   True
     3      6   0.0002  0.001 0.0002 0.0003   True
     3      7   0.0004  0.001 

Only keep the null hypothesis between Bronze and Silver

#### Pros vs. All: UniqueHotkeys

In [11]:
# One-way ANOVA of UniqueHotkeys across the LeagueIndex groups 1 through 8.
f_UH, p_UH = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'UniqueHotkeys'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_UH, p_UH

(75.81603914894723, 2.1424126074003407e-102)

Move to Tukey

In [12]:
# Tukey HSD post-hoc test: pairwise comparison of mean UniqueHotkeys between leagues.
mc_UH = MultiComparison(data=base2['UniqueHotkeys'], groups=base2['LeagueIndex'])
result_UH = mc_UH.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_UH, mc_UH.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2   0.1166    0.9  -0.512 0.7453  False
     1      3   0.4522 0.2794 -0.1373 1.0417  False
     1      4   0.7367 0.0021  0.1693 1.3041   True
     1      5   1.4644  0.001  0.8966 2.0322   True
     1      6   2.2868  0.001  1.7048 2.8688   True
     1      7   3.5365  0.001  2.2976 4.7754   True
     1      8   4.1105  0.001  3.0742 5.1468   True
     2      3   0.3356 0.3333 -0.1206 0.7918  False
     2      4   0.6201  0.001  0.1928 1.0473   True
     2      5   1.3478  0.001    0.92 1.7756   True
     2      6   2.1702  0.001  1.7237 2.6166   True
     2      7   3.4198  0.001  2.2385 4.6012   True
     2      8   3.9939  0.001  3.0271 4.9606   True
     3      4   0.2845 0.2676 -0.0829 0.6518  False
     3      5   1.0122  0.001  0.6443 1.3801   True
     3      6   1.8346  0.001  1.4451  2.224   True
     3      

Keep the null on: Bronze-Silver, Bronze-Gold, Silver-Gold, Gold-Platinum, and GM-Pro

#### Pros vs All: MinimapAttacks

In [13]:
# One-way ANOVA of MinimapAttacks across the LeagueIndex groups 1 through 8.
f_MA, p_MA = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'MinimapAttacks'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_MA, p_MA

(70.07493831956016, 6.915349471701886e-95)

Largest p value so far (still very small)

In [14]:
# Tukey HSD post-hoc test: pairwise comparison of mean MinimapAttacks between leagues.
mc_MA = MultiComparison(data=base2['MinimapAttacks'], groups=base2['LeagueIndex'])
result_MA = mc_MA.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_MA, mc_MA.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2      0.0    0.9    -0.0 0.0001  False
     1      3      0.0 0.4787    -0.0 0.0001  False
     1      4      0.0  0.011     0.0 0.0001   True
     1      5   0.0001  0.001     0.0 0.0001   True
     1      6   0.0001  0.001  0.0001 0.0002   True
     1      7   0.0003  0.001  0.0002 0.0004   True
     1      8   0.0003  0.001  0.0003 0.0004   True
     2      3      0.0    0.9    -0.0    0.0  False
     2      4      0.0 0.0501    -0.0 0.0001  False
     2      5   0.0001  0.001     0.0 0.0001   True
     2      6   0.0001  0.001  0.0001 0.0001   True
     2      7   0.0003  0.001  0.0002 0.0004   True
     2      8   0.0003  0.001  0.0003 0.0004   True
     3      4      0.0 0.3585    -0.0    0.0  False
     3      5   0.0001  0.001     0.0 0.0001   True
     3      6   0.0001  0.001  0.0001 0.0001   True
     3      

Keep the null hypothesis on: Bronze-Silver, Bronze-Gold, Silver-Gold, Silver-Platinum, Gold-Platinum and GM-Pro

#### Pros vs All: MinimapRightClicks

In [15]:
# One-way ANOVA of MinimapRightClicks across the LeagueIndex groups 1 through 8.
f_MRC, p_MRC = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'MinimapRightClicks'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_MRC, p_MRC

(31.51732206476627, 1.3080364877901262e-42)

Very 'large', I'm expecting a lot of null hypothesis keeps

In [16]:
# Tukey HSD post-hoc test: pairwise comparison of mean MinimapRightClicks between leagues.
mc_MRC = MultiComparison(data=base2['MinimapRightClicks'], groups=base2['LeagueIndex'])
result_MRC = mc_MRC.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_MRC, mc_MRC.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2   0.0001 0.4983    -0.0 0.0002  False
     1      3   0.0001 0.0031     0.0 0.0002   True
     1      4   0.0002  0.001  0.0001 0.0003   True
     1      5   0.0002  0.001  0.0001 0.0003   True
     1      6   0.0003  0.001  0.0002 0.0004   True
     1      7   0.0004  0.001  0.0002 0.0006   True
     1      8   0.0006  0.001  0.0005 0.0008   True
     2      3   0.0001 0.3227    -0.0 0.0001  False
     2      4   0.0001 0.0021     0.0 0.0002   True
     2      5   0.0002  0.001  0.0001 0.0002   True
     2      6   0.0002  0.001  0.0001 0.0003   True
     2      7   0.0003  0.001  0.0001 0.0005   True
     2      8   0.0006  0.001  0.0004 0.0007   True
     3      4      0.0 0.6108    -0.0 0.0001  False
     3      5   0.0001  0.001     0.0 0.0002   True
     3      6   0.0001  0.001  0.0001 0.0002   True
     3      

Keep the null hypothesis for: Bronze-Silver, Silver-Gold, Gold-Platinum, Diamond-Master, Diamond-GM, Master-GM

#### Pros vs. All: NumberOfPACs

In [17]:
# One-way ANOVA of NumberOfPACs across the LeagueIndex groups 1 through 8.
f_NOP, p_NOP = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'NumberOfPACs'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_NOP, p_NOP

(299.4261346149908, 0.0)

Lots of significance here

In [18]:
# Tukey HSD post-hoc test: pairwise comparison of mean NumberOfPACs between leagues.
mc_NOP = MultiComparison(data=base2['NumberOfPACs'], groups=base2['LeagueIndex'])
result_NOP = mc_NOP.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_NOP, mc_NOP.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
     1      2   0.0003 0.004  0.0001 0.0005   True
     1      3   0.0006 0.001  0.0004 0.0008   True
     1      4   0.0009 0.001  0.0007 0.0011   True
     1      5   0.0014 0.001  0.0012 0.0016   True
     1      6   0.0019 0.001  0.0017 0.0021   True
     1      7   0.0027 0.001  0.0023 0.0032   True
     1      8   0.0029 0.001  0.0025 0.0032   True
     2      3   0.0003 0.001  0.0002 0.0005   True
     2      4   0.0007 0.001  0.0005 0.0008   True
     2      5   0.0011 0.001  0.0009 0.0012   True
     2      6   0.0016 0.001  0.0014 0.0017   True
     2      7   0.0025 0.001   0.002 0.0029   True
     2      8   0.0026 0.001  0.0022 0.0029   True
     3      4   0.0003 0.001  0.0002 0.0005   True
     3      5   0.0008 0.001  0.0006 0.0009   True
     3      6   0.0013 0.001  0.0011 0.0014   True
     3      7   0.0021 0.001  

Keep the null hypothesis only between GM and Pro

#### Pros vs All: GapBetweenPACs

In [19]:
# One-way ANOVA of GapBetweenPACs across the LeagueIndex groups 1 through 8.
f_GBP, p_GBP = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'GapBetweenPACs'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_GBP, p_GBP

(224.57134884198777, 8.422650923558125e-275)

Onto Tukey

In [20]:
# Tukey HSD post-hoc test: pairwise comparison of mean GapBetweenPACs between leagues.
mc_GBP = MultiComparison(data=base2['GapBetweenPACs'], groups=base2['LeagueIndex'])
result_GBP = mc_GBP.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_GBP, mc_GBP.groupsunique, sep='\n')

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
     1      2  -10.833  0.001 -14.8155  -6.8505   True
     1      3  -18.478  0.001 -22.2128 -14.7433   True
     1      4 -23.5805  0.001 -27.1755 -19.9855   True
     1      5 -29.8872  0.001 -33.4845   -26.29   True
     1      6  -34.414  0.001 -38.1013 -30.7268   True
     1      7 -41.6438  0.001  -49.493 -33.7946   True
     1      8 -45.6522  0.001 -52.2178 -39.0865   True
     2      3   -7.645  0.001 -10.5351  -4.7549   True
     2      4 -12.7475  0.001 -15.4545 -10.0404   True
     2      5 -19.0542  0.001 -21.7643 -16.3441   True
     2      6  -23.581  0.001 -26.4094 -20.7526   True
     2      7 -30.8107  0.001  -38.295 -23.3265   True
     2      8 -34.8191  0.001 -40.9438 -28.6944   True
     3      4  -5.1025  0.001  -7.4297  -2.7752   True
     3      5 -11.4092  0.001   -13.74  -9.0784   True
     3    

Keep the null hypothesis on: Master-GM, GM-Pro

#### Pros vs. All: ActionLatency

In [21]:
# One-way ANOVA of ActionLatency across the LeagueIndex groups 1 through 8.
f_AL, p_AL = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'ActionLatency'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_AL, p_AL

(408.8008889057102, 0.0)

Will reject most nulls

In [22]:
# Tukey HSD post-hoc test: pairwise comparison of mean ActionLatency between leagues.
mc_AL = MultiComparison(data=base2['ActionLatency'], groups=base2['LeagueIndex'])
result_AL = mc_AL.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_AL, mc_AL.groupsunique, sep='\n')

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
     1      2 -13.7359  0.001 -17.7779  -9.6939   True
     1      3 -21.3087  0.001 -25.0993 -17.5181   True
     1      4 -30.2122  0.001 -33.8609 -26.5635   True
     1      5 -38.8962  0.001 -42.5472 -35.2452   True
     1      6 -46.0582  0.001 -49.8005 -42.3158   True
     1      7 -54.6631  0.001 -62.6296 -46.6966   True
     1      8 -59.6158  0.001 -66.2796 -52.9521   True
     2      3  -7.5728  0.001  -10.506  -4.6395   True
     2      4 -16.4763  0.001 -19.2238 -13.7288   True
     2      5 -25.1603  0.001 -27.9108 -22.4097   True
     2      6 -32.3222  0.001 -35.1929 -29.4515   True
     2      7 -40.9271  0.001 -48.5232 -33.3311   True
     2      8 -45.8799  0.001 -52.0961 -39.6637   True
     3      4  -8.9035  0.001 -11.2656  -6.5415   True
     3      5 -17.5875  0.001 -19.9531 -15.2219   True
     3    

Only keeps the null on GM-Pro (common kept null)

#### Pros vs. All: ActionsInPAC

In [23]:
# One-way ANOVA of ActionsInPAC across the LeagueIndex groups 1 through 8.
f_AIP, p_AIP = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'ActionsInPAC'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_AIP, p_AIP

(12.42927863873358, 8.30207214228753e-16)

really big probability, I'm expecting a lot of kept null hypotheses

In [24]:
# Tukey HSD post-hoc test: pairwise comparison of mean ActionsInPAC between leagues.
mc_AIP = MultiComparison(data=base2['ActionsInPAC'], groups=base2['LeagueIndex'])
result_AIP = mc_AIP.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_AIP, mc_AIP.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2    0.479 0.0139  0.0562 0.9018   True
     1      3   0.6363  0.001  0.2399 1.0328   True
     1      4     0.71  0.001  0.3284 1.0917   True
     1      5   0.9406  0.001  0.5587 1.3225   True
     1      6   0.9486  0.001  0.5572   1.34   True
     1      7   0.6991 0.1769 -0.1341 1.5324  False
     1      8   1.1498  0.001  0.4528 1.8468   True
     2      3   0.1573 0.7496 -0.1495 0.4641  False
     2      4    0.231 0.2233 -0.0563 0.5184  False
     2      5   0.4616  0.001  0.1739 0.7493   True
     2      6   0.4696  0.001  0.1694 0.7699   True
     2      7   0.2201    0.9 -0.5744 1.0146  False
     2      8   0.6708 0.0376  0.0206 1.3209   True
     3      4   0.0737    0.9 -0.1734 0.3207  False
     3      5   0.3043 0.0048  0.0568 0.5517   True
     3      6   0.3123 0.0073  0.0503 0.5742   True
     3      

Kept the null hypothesis on: Bronze-GM, Silver-Gold, Silver-Platinum, Silver-GM, Gold-Platinum, Gold-GM, Gold-Pro, any combination of Platinum and higher. Basically, unless you are trying to tell apart big jumps, ActionsInPAC is not helpful

#### Pros vs All: TotalMapExplored

In [25]:
# One-way ANOVA of TotalMapExplored across the LeagueIndex groups 1 through 8.
f_TME, p_TME = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'TotalMapExplored'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_TME, p_TME

(27.62398096583538, 3.6314759706155436e-37)

Should be a fair number of keeps

In [26]:
# Tukey HSD post-hoc test: pairwise comparison of mean TotalMapExplored between leagues.
mc_TME = MultiComparison(data=base2['TotalMapExplored'], groups=base2['LeagueIndex'])
result_TME = mc_TME.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_TME, mc_TME.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     1      2   1.0893 0.7258 -0.9819  3.1604  False
     1      3   1.7714 0.1041 -0.1709  3.7137  False
     1      4   3.2848  0.001  1.4152  5.1544   True
     1      5   4.3334  0.001  2.4626  6.2042   True
     1      6   5.5726  0.001   3.655  7.4902   True
     1      7   9.2936  0.001  5.2116 13.3757   True
     1      8   4.1196 0.0063  0.7051  7.5341   True
     2      3   0.6821 0.8564 -0.8209  2.1851  False
     2      4   2.1955  0.001  0.7877  3.6033   True
     2      5   3.2441  0.001  1.8347  4.6535   True
     2      6   4.4834  0.001  3.0124  5.9543   True
     2      7   8.2044  0.001  4.3121 12.0966   True
     2      8   3.0303 0.0762 -0.1548  6.2155  False
     3      4   1.5134 0.0038  0.3031  2.7237   True
     3      5    2.562  0.001  1.3498  3.7741   True
     3      6   3.8012  0.001   2.518  5.0844 

Keep null on: Bronze-Silver, Bronze-Gold, Silver-Gold, Silver-Pro, Gold-Pro, Platinum-Diamond, Platinum-Pro, Diamond-Pro, Master-GM, Master-Pro

#### Pros vs. All: WorkersMade

In [27]:
# One-way ANOVA of WorkersMade across the LeagueIndex groups 1 through 8.
f_WM, p_WM = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'WorkersMade'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_WM, p_WM

(52.474324635241764, 2.037604077119934e-71)

Middle of the road ish, will probably reject quite a few

In [28]:
# Tukey HSD post-hoc test: pairwise comparison of mean WorkersMade between leagues.
mc_WM = MultiComparison(data=base2['WorkersMade'], groups=base2['LeagueIndex'])
result_WM = mc_WM.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_WM, mc_WM.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2   0.0002 0.0058     0.0 0.0003   True
     1      3   0.0003  0.001  0.0002 0.0004   True
     1      4   0.0004  0.001  0.0002 0.0005   True
     1      5   0.0005  0.001  0.0004 0.0007   True
     1      6   0.0006  0.001  0.0005 0.0007   True
     1      7   0.0006  0.001  0.0003 0.0009   True
     1      8   0.0004  0.001  0.0002 0.0007   True
     2      3   0.0001 0.0119     0.0 0.0002   True
     2      4   0.0002  0.001  0.0001 0.0003   True
     2      5   0.0004  0.001  0.0003 0.0005   True
     2      6   0.0004  0.001  0.0003 0.0005   True
     2      7   0.0004  0.001  0.0002 0.0007   True
     2      8   0.0003  0.003  0.0001 0.0005   True
     3      4   0.0001 0.0294     0.0 0.0002   True
     3      5   0.0002  0.001  0.0002 0.0003   True
     3      6   0.0003  0.001  0.0002 0.0004   True
     3      

Keep null on: Gold-Pro, any combination of Platinum and higher

#### Pro vs. All: UniqueUnitsMade

In [29]:
# One-way ANOVA of UniqueUnitsMade across the LeagueIndex groups 1 through 8.
f_UUM, p_UUM = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'UniqueUnitsMade'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_UUM, p_UUM

(11.742727910512512, 7.561419617136392e-15)

large probability, lots of null keeps

In [30]:
# Tukey HSD post-hoc test: pairwise comparison of mean UniqueUnitsMade between leagues.
mc_UUM = MultiComparison(data=base2['UniqueUnitsMade'], groups=base2['LeagueIndex'])
result_UUM = mc_UUM.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_UUM, mc_UUM.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2   0.2255    0.9    -0.3 0.7511  False
     1      3   0.5195 0.0305  0.0266 1.0124   True
     1      4    0.724  0.001  0.2495 1.1984   True
     1      5   0.8512  0.001  0.3765  1.326   True
     1      6   1.0202  0.001  0.5336 1.5068   True
     1      7   1.1792 0.0131  0.1433  2.215   True
     1      8   0.2779    0.9 -0.5886 1.1443  False
     2      3    0.294 0.2735 -0.0874 0.6754  False
     2      4   0.4984  0.001  0.1412 0.8557   True
     2      5   0.6257  0.001   0.268 0.9833   True
     2      6   0.7946  0.001  0.4214 1.1679   True
     2      7   0.9536 0.0676 -0.0341 1.9413  False
     2      8   0.0523    0.9 -0.7559 0.8606  False
     3      4   0.2045 0.4708 -0.1027 0.5116  False
     3      5   0.3317  0.024  0.0241 0.6393   True
     3      6   0.5007  0.001   0.175 0.8263   True
     3      

Keep null on: Bronze-Silver, Bronze-Pro, Silver-Gold, Silver-GM, Silver-Pro, Gold-Platinum, Gold-GM, Gold-Pro, all combinations from Platinum and higher

#### Pros vs. All: ComplexUnitsMade

In [31]:
# One-way ANOVA of ComplexUnitsMade across the LeagueIndex groups 1 through 8.
f_CoUM, p_CoUM = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'ComplexUnitsMade'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_CoUM, p_CoUM

(16.600544880285657, 1.1540968028190996e-21)

expect lots of keeps

In [32]:
# Tukey HSD post-hoc test: pairwise comparison of mean ComplexUnitsMade between leagues.
mc_CoUM = MultiComparison(data=base2['ComplexUnitsMade'], groups=base2['LeagueIndex'])
result_CoUM = mc_CoUM.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_CoUM, mc_CoUM.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2      0.0    0.9    -0.0    0.0  False
     1      3      0.0 0.0379     0.0 0.0001   True
     1      4   0.0001  0.001     0.0 0.0001   True
     1      5   0.0001  0.001     0.0 0.0001   True
     1      6   0.0001  0.001     0.0 0.0001   True
     1      7   0.0001 0.0222     0.0 0.0001   True
     1      8      0.0    0.9    -0.0 0.0001  False
     2      3      0.0 0.1062    -0.0    0.0  False
     2      4      0.0  0.001     0.0 0.0001   True
     2      5   0.0001  0.001     0.0 0.0001   True
     2      6   0.0001  0.001     0.0 0.0001   True
     2      7   0.0001 0.0616    -0.0 0.0001  False
     2      8      0.0    0.9    -0.0    0.0  False
     3      4      0.0 0.0212     0.0    0.0   True
     3      5      0.0  0.001     0.0    0.0   True
     3      6      0.0  0.001     0.0 0.0001   True
     3      

Keep null on: Bronze-Silver, Bronze-Pro, Silver-Gold, Silver-GM, Silver-Pro, Gold-GM through Diamond-GM, Master-GM, GM-Pro

#### Pros vs. All: ComplexAbilitiesUsed

In [33]:
# One-way ANOVA of ComplexAbilitiesUsed across the LeagueIndex groups 1 through 8.
f_CAU, p_CAU = stats.f_oneway(
    *[base2.loc[base2['LeagueIndex'] == league, 'ComplexAbilitiesUsed'] for league in range(1, 9)]
)
# Display the F statistic and p-value as the cell output.
f_CAU, p_CAU

(12.556734905093624, 5.50643511344855e-16)

here comes lots of nulls!

In [34]:
# Tukey HSD post-hoc test: pairwise comparison of mean ComplexAbilitiesUsed between leagues.
mc_CAU = MultiComparison(data=base2['ComplexAbilitiesUsed'], groups=base2['LeagueIndex'])
result_CAU = mc_CAU.tukeyhsd(alpha=0.05)

# Summary table first, then the unique group labels on the following line.
print(result_CAU, mc_CAU.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      2      0.0 0.8672    -0.0 0.0001  False
     1      3   0.0001 0.0277     0.0 0.0001   True
     1      4   0.0001  0.001     0.0 0.0002   True
     1      5   0.0001  0.001  0.0001 0.0002   True
     1      6   0.0001  0.001  0.0001 0.0002   True
     1      7   0.0002  0.001  0.0001 0.0004   True
     1      8   0.0001 0.3002    -0.0 0.0002  False
     2      3      0.0 0.2954    -0.0 0.0001  False
     2      4   0.0001 0.0049     0.0 0.0001   True
     2      5   0.0001  0.001     0.0 0.0002   True
     2      6   0.0001  0.001  0.0001 0.0002   True
     2      7   0.0002  0.001  0.0001 0.0003   True
     2      8   0.0001 0.7406 -0.0001 0.0002  False
     3      4      0.0 0.7962    -0.0 0.0001  False
     3      5   0.0001 0.0011     0.0 0.0001   True
     3      6   0.0001  0.001     0.0 0.0001   True
     3      

Keep null on: Bronze-Silver, Bronze-Pro, Silver-Gold, Silver-Pro, Gold-Platinum, Platinum-Diamond, all combinations of Diamond-GM or higher

### Look at comparisons of 3 non-pro features

In [35]:
# Keep only rows with LeagueIndex below 8, then discard every row that still
# contains a missing value in any column.
base3 = base2.loc[base2['LeagueIndex'] < 8].dropna(axis=0)

#### All on Age

In [36]:
# One-way ANOVA of Age across the LeagueIndex groups 1 through 7 of base3.
f_Age, p_Age = stats.f_oneway(
    *[base3.loc[base3['LeagueIndex'] == league, 'Age'] for league in range(1, 8)]
)
# Display the F statistic and p-value as the cell output.
f_Age, p_Age

(10.814915963158274, 5.984201730651382e-12)

lots of null hypotheses will be kept

In [37]:
# Tukey HSD post-hoc test: pairwise comparison of Age between the
# LeagueIndex groups of base3
mc_Age = MultiComparison(data=base3['Age'],
                         groups=base3['LeagueIndex'])
result_Age = mc_Age.tukeyhsd()

# show the pairwise summary table, followed by the group labels that were compared
print(result_Age, mc_Age.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     1      2  -0.6094 0.6887 -1.7702  0.5513  False
     1      3  -0.7144 0.4583  -1.803  0.3741  False
     1      4  -0.7836 0.2924 -1.8314  0.2643  False
     1      5   -1.389 0.0018 -2.4377 -0.3403   True
     1      6  -2.0871  0.001 -3.1618 -1.0124   True
     1      7  -1.5936 0.3816 -3.8814  0.6942  False
     2      3   -0.105    0.9 -0.9473  0.7374  False
     2      4  -0.1741    0.9 -0.9631  0.6149  False
     2      5  -0.7795 0.0561 -1.5697  0.0107  False
     2      6  -1.4777  0.001 -2.3021 -0.6533   True
     2      7  -0.9842 0.8168 -3.1656  1.1972  False
     3      4  -0.0691    0.9 -0.7475  0.6092  False
     3      5  -0.6745 0.0533 -1.3542  0.0052  False
     3      6  -1.3727  0.001 -2.0919 -0.6535   True
     3      7  -0.8792  0.888 -3.0231  1.2647  False
     4      5  -0.6054  0.055 -1.2177  0.0069 

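Only reject on: Bronze-Diamond, Bronze-Master, Silver-Master, Gold-Master, Platinum-Master and Diamond-Master. Silver-Diamond, Gold-Diamond and Platinum-Diamond are borderline (p ≈ 0.05–0.06, confidence intervals just including 0), so their null hypotheses are kept.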

#### All on HoursPerWeek

In [38]:
# one-way ANOVA on HoursPerWeek: one sample per LeagueIndex value 1 through 7, in order
f_HPW, p_HPW = stats.f_oneway(
    *(base3.loc[base3['LeagueIndex'] == league, 'HoursPerWeek'] for league in range(1, 8))
)
# display the F statistic and its p-value
f_HPW, p_HPW

(42.70960655643759, 1.7764947538317148e-50)

Some nulls kept

In [39]:
# Tukey HSD post-hoc test: pairwise comparison of HoursPerWeek between the
# LeagueIndex groups of base3
mc_HPW = MultiComparison(data=base3['HoursPerWeek'],
                         groups=base3['LeagueIndex'])
result_HPW = mc_HPW.tukeyhsd()

# show the pairwise summary table, followed by the group labels that were compared
print(result_HPW, mc_HPW.groupsunique, sep='\n')

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
     1      2   0.1884    0.9  -3.025  3.4017  False
     1      3   0.8409    0.9 -2.1725  3.8544  False
     1      4   0.9138    0.9 -1.9869  3.8145  False
     1      5   3.0609 0.0311  0.1578  5.9641   True
     1      6   7.9801  0.001   5.005 10.9553   True
     1      7  18.6059  0.001 12.2726 24.9391   True
     2      3   0.6525    0.9 -1.6794  2.9845  False
     2      4   0.7254    0.9 -1.4589  2.9096  False
     2      5   2.8725 0.0021  0.6851    5.06   True
     2      6   7.7917  0.001  5.5096 10.0739   True
     2      7  18.4175  0.001 12.3786 24.4563   True
     3      4   0.0728    0.9  -1.805  1.9506  False
     3      5     2.22 0.0091  0.3384  4.1016   True
     3      6   7.1392  0.001  5.1483  9.1301   True
     3      7  17.7649  0.001 11.8301 23.6998   True
     4      5   2.1472 0.0036   0.452  3.8423 

Keep null on: Bronze-Silver, Bronze-Gold, Bronze-Platinum, Silver-Gold, Silver-Platinum, Gold-Platinum (i.e. no significant difference among the first 4 levels)

#### All on TotalHours

In [40]:
# one-way ANOVA on TotalHours: one sample per LeagueIndex value 1 through 7, in order
f_TH, p_TH = stats.f_oneway(
    *(base3.loc[base3['LeagueIndex'] == league, 'TotalHours'] for league in range(1, 8))
)
# display the F statistic and its p-value
f_TH, p_TH

(47.088944399386186, 1.1188011029115706e-55)

Expect some null hypotheses to be kept.

In [41]:
# Tukey HSD post-hoc test: pairwise comparison of TotalHours between the
# LeagueIndex groups of base3
mc_TH = MultiComparison(data=base3['TotalHours'],
                        groups=base3['LeagueIndex'])
result_TH = mc_TH.tukeyhsd()

# show the pairwise summary table, followed by the group labels that were compared
print(result_TH, mc_TH.groupsunique, sep='\n')

   Multiple Comparison of Means - Tukey HSD, FWER=0.05   
group1 group2  meandiff p-adj    lower     upper   reject
---------------------------------------------------------
     1      2    66.108    0.9 -163.0124  295.2285  False
     1      3  228.4908 0.0286   13.6228  443.3588   True
     1      4   322.705  0.001  115.8782  529.5317   True
     1      5  516.3837  0.001   309.382  723.3855   True
     1      6  723.1046  0.001    510.97  935.2392   True
     1      7 1315.7274  0.001  864.1482 1767.3065   True
     2      3  162.3828 0.0609   -3.8887  328.6543  False
     2      4  256.5969  0.001  100.8559   412.338   True
     2      5  450.2757  0.001  294.3024   606.249   True
     2      6  656.9966  0.001  494.2728  819.7204   True
     2      7 1249.6193  0.001  819.0364 1680.2023   True
     3      4   94.2141 0.3684   -39.679  228.1072  False
     3      5  287.8929  0.001  153.7297  422.0561   True
     3      6  494.6138  0.001  352.6591  636.5684   True
     3      7 

Keep null on: Bronze-Silver, Silver-Gold, Gold-Platinum