In [3]:
import pandas as pd
import numpy as np

from scipy import stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [19]:
data = pd.read_table('brain_size.csv', delimiter=';', index_col=0)

In [20]:
data.head()

Unnamed: 0,Gender,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
1,Female,133,132,124,118,64.5,816932
2,Male,140,150,124,.,72.5,1001121
3,Male,139,123,150,143,73.3,1038437
4,Male,133,129,128,172,68.8,965353
5,Female,137,132,134,147,65.0,951545


In [21]:
data['VIQ'].describe()

count     40.000000
mean     112.350000
std       23.616107
min       71.000000
25%       90.000000
50%      113.000000
75%      129.750000
max      150.000000
Name: VIQ, dtype: float64

# 1. t-test

Assumption:
* Gaussian errors

## 1.1. One sample t-test

In [22]:
t, pvalue = stats.ttest_1samp(data['VIQ'], 100)
pvalue

0.0020301174047818219

## 1.2. Two sample t-test (independent case)

In [27]:
female_viq = data[data['Gender'] == 'Female']['VIQ']
male_viq = data[data['Gender'] == 'Male']['VIQ']
t, pvalue = stats.ttest_ind(female_viq, male_viq)
pvalue

0.44452876778583217

## 1.3. Two sample t-test (dependent case)

In [29]:
t, pvalue = stats.ttest_rel(data['FSIQ'], data['PIQ'])
pvalue

0.082172638183642358

In [30]:
t, pvalue = stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)
pvalue

0.082172638183642358

# 2. Wilcoxon signed-rank test
No assumption on Gaussian

In [31]:
t, p = stats.wilcoxon(data['FSIQ'], data['PIQ'])
p

0.10659492713506856

In [35]:
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
results.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.348
Model:,OLS,Adj. R-squared:,0.333
Method:,Least Squares,F-statistic:,22.2
Date:,"Mon, 28 Nov 2016",Prob (F-statistic):,1.9e-08
Time:,21:57:41,Log-Likelihood:,-379.82
No. Observations:,86,AIC:,765.6
Df Residuals:,83,BIC:,773.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,246.4341,35.233,6.995,0.000,176.358 316.510
Literacy,-0.4889,0.128,-3.832,0.000,-0.743 -0.235
np.log(Pop1831),-31.3114,5.977,-5.239,0.000,-43.199 -19.424

0,1,2,3
Omnibus:,3.713,Durbin-Watson:,2.019
Prob(Omnibus):,0.156,Jarque-Bera (JB):,3.394
Skew:,-0.487,Prob(JB):,0.183
Kurtosis:,3.003,Cond. No.,702.0


In [36]:
df2 = dat = sm.datasets.get_rdataset("Titanic", "datasets").data

In [39]:
df2.sample(10)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
972,"Lobb, Mr William Arthur",3rd,,male,0,0
555,"Siukonnen, Miss Anna",2nd,30.0,female,1,1
1028,"Moran, Miss Bertha",3rd,,female,1,1
1025,"Moor, Mrs Beila",3rd,,female,1,1
465,"Karnes, Mrs J Frank (Claire Bennett)",2nd,22.0,female,0,1
1127,"Peter (Joseph), Mrs Catherine",3rd,,female,0,1
811,"Fox, Mr Patrick",3rd,,male,0,0
20,"Behr, Mr Karl Howell",1st,26.0,male,1,0
694,"Burke, Mr Jeremiah",3rd,19.0,male,0,0
268,"Van Derhoef, Mr Wyckoff",1st,61.0,male,0,0


In [40]:
df2.shape

(1313, 6)

# ANOVA

In [8]:
data = sm.datasets.get_rdataset("Moore", "car", cache=True).data
data = data.rename(columns={"partner.status" : "partner_status"})
moore_lm = smf.ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)', data=data).fit()
table = sm.stats.anova_lm(moore_lm, typ=2)
table

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(fcategory, Sum)",11.6147,2.0,0.276958,0.759564
"C(partner_status, Sum)",212.213778,1.0,10.120692,0.002874
"C(fcategory, Sum):C(partner_status, Sum)",175.488928,2.0,4.184623,0.022572
Residual,817.763961,39.0,,
