Unlike the other "scratch" code, this notebook is meant to clarify the use cases of common statistical tests.

In [1]:
import random
from scipy.stats import ttest_1samp

# ONE-SAMPLE T-TEST
# Checks whether the mean of a single sample (taken from a normally
# distributed interval variable) is significantly different from a
# hypothesized population mean.
popmean = 0  # mean assumed under the null hypothesis
a = []
for _ in range(100):
    # draws come from a normal distribution with mean 0.25 and std 1
    a.append(random.gauss(0.25, 1))
ttest_1samp(a, popmean=popmean)

Ttest_1sampResult(statistic=2.2228421603740935, pvalue=0.02850124526269893)

In [9]:
import random
from scipy.stats import binomtest

# BINOMIAL TEST
# Tests whether the proportion of successes on a two-level categorical
# dependent variable differs significantly from a hypothesized value.
# In `data`, 1 could be 'heads' and 0 could be 'tails'.
# NOTE: `scipy.stats.binom_test` was deprecated in SciPy 1.10 and removed in
# SciPy 1.12; `binomtest` (available since SciPy 1.7) replaces it and returns
# a result object instead of a bare p-value.
data = [1 if random.uniform(0, 1) < 0.59 else 0 for _ in range(1000)]
result = binomtest(sum(data), n=len(data), p=0.5)
result.pvalue

3.288289474277995e-11

In [17]:
import random
import numpy as np
from scipy.stats import chisquare

# CHI-SQUARE GOODNESS OF FIT
# Checks whether the observed category frequencies of a categorical variable
# depart from hypothesized proportions. No expected frequencies are passed to
# `chisquare`, so its default (all categories equally likely) is the hypothesis.
population = [1, 2, 2, 3, 3, 3]  # categories 1, 2, 3 are drawn with weights 1:2:3
draws = []
for _ in range(100):
    draws.append(random.choice(population))
data = np.array(draws)
unique_elements, counts_elements = np.unique(data, return_counts=True)
chisquare(f_obs=counts_elements)

Power_divergenceResult(statistic=26.419999999999995, pvalue=1.8321874080019842e-06)

In [18]:
import random
from scipy.stats import ttest_ind

# TWO INDEPENDENT SAMPLES T-TEST
# Compares the means of a normally distributed interval dependent variable
# between two independent groups.
a = []
for _ in range(100):
    a.append(random.gauss(0.25, 1))  # first group: mean 0.25, std 1
b = []
for _ in range(80):
    b.append(random.gauss(0, 1))  # second group: mean 0, std 1
ttest_ind(a, b)

Ttest_indResult(statistic=2.6568816123428745, pvalue=0.00860374034623511)

In [20]:
import random
from scipy.stats import mannwhitneyu

# WILCOXON-MANN-WHITNEY TEST
# Non-parametric analog of the independent samples t-test.
# Use it when you do not assume that the dependent variable is a normally
# distributed interval variable; the only assumption is that the variable
# is at least ordinal.
a = []
for _ in range(1000):
    a.append(random.gauss(0.25, 1))  # first group: mean 0.25, std 1
b = []
for _ in range(800):
    b.append(random.gauss(0, 1))  # second group: mean 0, std 1
mannwhitneyu(a, b)

MannwhitneyuResult(statistic=322823.0, pvalue=9.389522668291871e-13)

In [30]:
import random
from scipy.stats import chi2_contingency

# CHI-SQUARE TEST OF INDEPENDENCE
# Used to see whether there is a relationship between two categorical variables.
# The test assumes an expected count of 5 or higher in every cell; when that
# assumption does not hold, use Fisher's exact test instead.
#
# Table layout: each column is one group of 50 trials. In the first group a
# trial lands in the bottom row (c) with probability 0.2 and otherwise in the
# top row (a); in the second group it lands in the top row (b) with
# probability 0.2 and otherwise in the bottom row (d).
n_trials = 50
off_diagonal_prob = 0.2

c = sum(random.uniform(0, 1) < off_diagonal_prob for _ in range(n_trials))
a = n_trials - c
b = sum(random.uniform(0, 1) < off_diagonal_prob for _ in range(n_trials))
d = n_trials - b

obs = np.array([[a, b], [c, d]])
print(obs)

# correction=False turns off Yates' continuity correction
chi, pval, df, exp = chi2_contingency(obs, correction=False)
chi, pval

[[40 14]
 [10 36]]


(27.214170692431566, 1.8211894358376534e-07)

In [40]:
import random
from scipy.stats import f_oneway

# ONE-WAY ANALYSIS OF VARIANCE (ANOVA)
# Used when you have a categorical independent variable (two or more
# categories) and a normally distributed interval dependent variable, and you
# want to test whether the mean of the dependent variable differs across the
# levels of the independent variable.
std = 3
# (mean, sample size) for each level of the categorical variable
level_specs = [(0, 100), (0.5, 80), (0.75, 90)]
category_level_1, category_level_2, category_level_3 = (
    [random.gauss(mean, std) for _ in range(size)] for mean, size in level_specs
)
Fstat, pval = f_oneway(category_level_1, category_level_2, category_level_3)
Fstat, pval

(4.503918732180759, 0.011919130431089074)

In [41]:
# Analysis of covariance (ANCOVA) is like ANOVA, except that in addition to the categorical predictors 
# you also have continuous predictors (covariates)

# MANOVA (multivariate analysis of variance) is like ANOVA, except that there are two or more dependent variables

# Kruskal-Wallis test is used when you have one independent variable with two or more levels 
# and an ordinal dependent variable
# it is the non-parametric version of ANOVA

In [42]:
# paired (samples) t-test is used when you have two related observations (i.e., two observations per subject) 
# and you want to see if the means on these two normally distributed interval variables differ from one another

from scipy.stats import ttest_rel

# Wilcoxon signed-rank test is the non-parametric version of the paired samples t-test
# use the Wilcoxon signed-rank test when you do not wish to assume that 
# the difference between the two variables is interval and normally distributed 
# (but you do assume the difference is ordinal)

from scipy.stats import wilcoxon

In [43]:
# McNemar's test compares the marginal frequencies of two paired binary outcomes; these could be
# the same outcome & same group, e.g., before and after some treatment
# the same outcome & different groups / matched pairs, e.g., case-control
# two outcome variables & a single group, e.g., accepted at college X / accepted at college Y
from statsmodels.stats.contingency_tables import mcnemar


# one-way repeated measures analysis of variance is used when you have one categorical independent variable 
# and a normally distributed interval dependent variable that was measured at least twice for each subject
# it is the equivalent of the paired samples t-test, but allows for two or more levels of the categorical variable
# it tests whether the mean of the dependent variable differs by the categorical variable
from statsmodels.stats.anova import AnovaRM

# factorial ANOVA has two or more categorical independent variables (either with or without the interactions) 
# and a single normally distributed interval dependent variable


In [None]:

# CORRELATION
# Useful when you want to see the relationship between two (or more)
# normally distributed interval variables.
# x and y are pandas Series; sample data is generated here so the cell runs
# on its own (y depends linearly on x, plus Gaussian noise).
import random
import pandas as pd

x = pd.Series([random.gauss(0, 1) for _ in range(100)])
y = 0.5 * x + pd.Series([random.gauss(0, 1) for _ in range(100)])

pearson_r = x.corr(y, method='pearson')    # linear correlation
spearman_r = x.corr(y, method='spearman')  # correlation of the ranks of the data
kendall_tau = x.corr(y, method='kendall')  # rank agreement (concordant vs. discordant pairs)
pearson_r, spearman_r, kendall_tau

# Spearman correlation is used when one or both of the variables are not assumed to be 
# normally distributed and interval (but are assumed to be ordinal)

# simple linear regression models the linear relationship between one normally distributed interval predictor and 
# one normally distributed interval outcome variable
from sklearn.linear_model import LinearRegression