# _Statistics Tutorial in Python_
***

In [6]:
#usual imports
import numpy as np
import math
#visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

## _1. Normality Test_ 
Check whether the data has a Gaussian Distribution
***
- Shapiro Wilk Test
- D Augustiono K^2 Test
- Anderson Darling Test


### _Shapiro Wilk Test_
- H0: the sample has a Gaussian distribution.
- H1: the sample does not have a Gaussian distribution.

Assumptions:
- Observations in each sample are independent and identically distributed. 

In [7]:
#example of the shapiro wilk test for normality
from scipy.stats import shapiro
data = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
stat, p = shapiro(data)

In [8]:
print('stat:%.3f p:%.3f'%(stat,p))
if p > 0.05:
    print('Probably data is Gaussian')
else:
    print('Probably data is not Gaussian')

stat:0.895 p:0.193
Probably data is Gaussian


### _D Agostino K^2 Test_
- H0: the sample has a Gaussian distribution.
- H1: the sample does not have a Gaussian distribution.

Assumptions:
- Observations in each sample are independent and identically distributed. 

In [9]:
#example of the D'Agostino K^2 test
from scipy.stats import normaltest
data = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
stat,p = normaltest(data)
print('stat:%.3f p:%.3f'%(stat,p))
if p > 0.05:
    print('Probably data is Gaussian')
else:
    print('Probably data is not Gaussian')

stat:3.392 p:0.183
Probably data is Gaussian


### _Anderson Darling Test_
- H0: the sample has a Gaussian distribution.
- H1: the sample does not have a Gaussian distribution.

Assumptions:
- Observations in each sample are independent and identically distributed. 

In [12]:
#example of the anderson darling test for normality
from scipy.stats import anderson
data = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
result = anderson(data)

In [13]:
#print the result of the anderson test 
print (result)

AndersonResult(statistic=0.4239737141854807, critical_values=array([0.501, 0.57 , 0.684, 0.798, 0.95 ]), significance_level=array([15. , 10. ,  5. ,  2.5,  1. ]))


In [16]:
#print the stat
print(f'stat {result.statistic}')

stat 0.4239737141854807


In [17]:
#print the critical values
print(f'critical values {result.critical_values}')

critical values [0.501 0.57  0.684 0.798 0.95 ]


In [19]:
#print the significance levels
print(f'significance levels {result.significance_level}')

significance levels [15.  10.   5.   2.5  1. ]


In [21]:
for i in range(len(result.critical_values)):
    sl,cv = result.significance_level[i], result.critical_values[i]
    if result.statistic < result.critical_values[i]:
        print('the data looks normal (fail to reject H0)')
    else:
        print('the data does not look normal (reject H0)')

the data looks normal (fail to reject H0)
the data looks normal (fail to reject H0)
the data looks normal (fail to reject H0)
the data looks normal (fail to reject H0)
the data looks normal (fail to reject H0)


## _2. Correlation Test_


### _Pearson Correlation Coefficient_
- H0: the two samples are independent
- H1: there is a dependency between the samples

Assumptions:
- Observations in each sample are independent and identically distributed. 
- Obseravations in each sample are normally distributed
- Observations in each sample have the same variance

In [22]:
#example of the pearson correlation coefficient
from scipy.stats import pearsonr
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [0.353, 3.517, 0.125, -7.545, -0.555, -1.536, 3.350, -1.578, -3.537, -1.579]

stat,p = pearsonr(data1,data2)
if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The two samples are independent')
    
else:
    print('Reject the Null Hypothesis')
    print('The two samples are dependent')
    

Reject the Null Hypothesis
The two samples are dependent


### _Spearman's Rank Correlation_
- H0: the two samples are independent
- H1: there is a dependency between the samples

Assumptions:
- Observations in each sample are independent and identically distributed. 
- Obseravations in each sample can be ranked



In [23]:
#example of the spearman correlation
from scipy.stats import spearmanr
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [0.353, 3.517, 0.125, -7.545, -0.555, -1.536, 3.350, -1.578, -3.537, -1.579]

stat,p = spearmanr(data1,data2)

if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The two samples are independent')
    
else:
    print('Reject the Null Hypothesis')
    print('The two samples are dependent')

Reject the Null Hypothesis
The two samples are dependent


### _Kendall's Rank Correlation_
- H0: the two samples are independent
- H1: there is a dependency between the samples

Assumptions:
- Observations in each sample are independent and identically distributed. 
- Obseravations in each sample can be ranked


In [24]:
#example of the kendal rank correlation
from scipy.stats import kendalltau
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [0.353, 3.517, 0.125, -7.545, -0.555, -1.536, 3.350, -1.578, -3.537, -1.579]
stat,p = kendalltau(data1,data2)

if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The two samples are independent')
    
else:
    print('Reject the Null Hypothesis')
    print('The two samples are dependent')

Reject the Null Hypothesis
The two samples are dependent


### _Chi-Squared Test_
Test whether the two categorical variable are related or independent
- H0: the two samples are independent
- H1: there is a dependency between the samples

Assumptions:
- Observations used in the calculation of the contingency table are independent
- 25 or more examples in each cell of the contingency table 

In [28]:
#example of the chi-squared test
from scipy.stats import chi2_contingency
table = [[10, 20, 30],[6,  9,  17]]

stat,p,dof,expected = chi2_contingency(table)

if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The two samples are independent')
    
else:
    print('Reject the Null Hypothesis')
    print('The two samples are dependent')

Fail to reject the Null Hypothesis 
The two samples are independent


## _3. Stationary Tests_
_Test whether the time series is stationary or not_

## _Augmented Dickey Fuller Unit Root Test_
_Tests whether the time series has a unit roor or more generally whether the time series has trend or is autoregressive_

Assumptions:
- Observations are temporally ordered

Interpretation
- H0: a unit root is present and the series is non stationary
- H1: a unit root is not present and the series is stationary


In [29]:
#example of the augmented dickey fuller test .. 
from statsmodels.tsa.stattools import adfuller
data = [1,2,3,4,5,6,7,8,9]

stat,p,lags,obs,crit,t = adfuller(data)
if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The time series has unit root and is non stationary')
    
else:
    print('Reject the Null Hypothesis')
    print('The times series has no unit root and is stationary')

Fail to reject the Null Hypothesis 
The time series has unit root and is non stationary


## _Kwiatowski Schmidt Shin Test_
_Tests whether the time series is trend stationary or not_

Assumptions:
- Observations are temporally ordered

Interpretation
- H0: the time series is not trend stationary
- H1: the time series is trend stationary


In [30]:
#example of the kpss test .. 
from statsmodels.tsa.stattools import kpss
data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
stat,p,lags,crit =  kpss(data)

if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The time series is non stationary')
    
else:
    print('Reject the Null Hypothesis')
    print('The times series is stationary')

Fail to reject the Null Hypothesis 
The time series is non stationary


## _4. Parametric Statistical Hypothesis Tests_
_The statistical tests that can be used to compare the data sample_

## _Student t-test_
_Test whether the means of two independent sample are significantly different_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample are normally distributed
- Observations in each sample have the same variance


Interpretation
- H0: the means of the sample are equal
- H1: the means of the sample are unequal

In [32]:
#example of the student t test
from scipy.stats import ttest_ind
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat,p = ttest_ind(data1,data2)

if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The samples have same mean and probably they are from same distribution')
    
else:
    print('Reject the Null Hypothesis')
    print('The samples have unequal means and probably they are from different distributions')

Fail to reject the Null Hypothesis 
The samples have same mean and probably they are from same distribution


## _Paired Student t-test_
_Test whether the means of two paired samples are significantly different_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample are normally distributed
- Observations in each sample have the same variance
- Observations across each sample are paired


Interpretation
- H0: the means of the sample are equal
- H1: the means of the sample are unequal

In [34]:
#example of the paired t-test
from scipy.stats import ttest_rel
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat,p = ttest_rel(data1,data2)

if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The samples have same mean and probably they are from same distribution')
    
else:
    print('Reject the Null Hypothesis')
    print('The samples have unequal means and probably they are from different distributions')

Fail to reject the Null Hypothesis 
The samples have same mean and probably they are from same distribution


## _ANOVA or Analysis of Variance_
_tests whether the means of two or more independent samples are equal or different_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample are normally distributed
- Observations in each sample have same variance

Interpretation
- H0: the means of the sample are equal
- H1: the means of the sample are unequal

In [35]:
#example of the one way ANOVA
from scipy.stats import f_oneway
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]

stat,p = f_oneway(data1,data2,data3)
if p > 0.05:
    print('Fail to reject the Null Hypothesis ')
    print('The samples have same mean and probably they are from same distribution')
    
else:
    print('Reject the Null Hypothesis')
    print('The samples have unequal means and probably they are from different distributions')

Fail to reject the Null Hypothesis 
The samples have same mean and probably they are from same distribution


## _Repeated Measures ANOVA Test_
_Test whether the means of two or more paired samples are significantly different_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample are normally distributed
- Observations in each sample have the same variance
- Observations across each sample are paired


Interpretation
- H0: the means of the sample are equal
- H1: the means of the sample are unequal

Python Code
- Currently this test is not supported in Python

## _5. Non Parametric Statistical Hypothesis Tests_


## _Mann Whitney U Test_
_Test whether the distributions of two independent samples are equal or not_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample can be ranked

Interpretation
- H0: the distributions of both the samples are equal
- H1: the distributions of both the samples are not equal

In [37]:
#example of the implementation of the Mann Whitney U Test
from scipy.stats import mannwhitneyu
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = mannwhitneyu(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=40.000, p=0.236
Probably the same distribution


## _Wilcoxon Signed-Rank Test_
_Test whether the distributions of two paired samples are equal or not_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample can be ranked
- Observations across each sample are paired

Interpretation
- H0: the distributions of both the samples are equal
- H1: the distributions of both the samples are not equal


In [38]:
#example of the wilcoxon test
from scipy.stats import wilcoxon
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = wilcoxon(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=21.000, p=0.508
Probably the same distribution


## _Krushkal Wallis H Test_
_Test whether the distributions of two or more independent samples are equal or not_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample can be ranked

Interpretation
- H0: the distributions of both the samples are equal
- H1: the distributions of both the samples are not equal

In [40]:
#python example of the kruskal test
from scipy.stats import kruskal
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = kruskal(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=0.571, p=0.450
Probably the same distribution


## _Friedman Test_
_Test whether the distributions of two or more paired samples are equal or not_

Assumptions:
- Observations in each sample are independent and identically distributed
- Observations in each sample can be ranked
- Observations in each sample are paired

Interpretation
- H0: the distributions of all the samples are equal
- H1: the distributions of one or more of samples are not equal

In [41]:
#example of the friedman test
from scipy.stats import friedmanchisquare
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]
stat, p = friedmanchisquare(data1, data2, data3)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=0.800, p=0.670
Probably the same distribution
