## Chi square test for population variance

In [3]:
import pandas as pd
import os
import stats_distributions as stds
import statsmodels.stats.weightstats as stsm
import numpy as np
from numpy import sqrt, abs, round
from scipy.stats import norm

from scipy import stats


In [4]:
### function to  get chi2 statistic and corresponding p value for the test ###
# DO NOT USE P_VALUE FOR TWO TAILED TEST
def chi2_test(n, var_sam, var_pop, tail):
    
    stat = (n - 1) * var_sam/var_pop
    
    p_val = stds.chi2_to_pval(stat, n-1, tail)
            
    
    return (stat, p_val)
    
    
    

In [11]:
def chi2_CI(alpha, n, var_sam):
    crit_val_low = stds.pval_to_chi2(alpha/2, n-1, "left")
    crit_val_up = stds.pval_to_chi2(alpha/2, n-1, "right")
    
    ci_low = (n-1) * var_sam / crit_val_up
    ci_up = (n-1) * var_sam / crit_val_low
    
    print( "critical value low: %.5f ; critical value low: %.5f ; CI value low: %.5f ;CI value low: %.5f ;\
          "%((crit_val_low, crit_val_up, ci_low, ci_up)))
    return (crit_val_low, crit_val_up, ci_low, ci_up)

In [12]:
chi2_CI(0.01, 12, 44.9)

critical value low: 2.60322 ; critical value low: 26.75685 ; CI value low: 18.45883 ;CI value low: 189.72643 ;          


(2.603221890515113, 26.756848916469636, 18.4588253101803, 189.72643161903858)

In [17]:
# eg:
alpha = 0.05
n = 25
var_sam = 175
var_hyp = 156
tail = "right"

test_stat, p_value = chi2_test(n, var_sam, var_hyp, tail)

# For two tailed test compare with alpha
# change to t if needed
print("test critical: %.3f ; alpha: %.3f" %(stds.pval_to_chi2(alpha,n-1, tail), alpha))

print("test stat: %.3f ; p_value: %.3f" %(test_stat, p_value))

### FOR TWO TAIL TEST DON'T USE P VALUE. USE chi2_CI() FUNCTION AND COMPARE STAT WITH CRITICAL VALUES
# TODO: if stat is closer to lower critical val, pval = left area else right area

# the function outputs a p_value and chi2-score corresponding to that value, we compare the 
# p-value with alpha, if it is greater than alpha then we do not null hypothesis 
# else we reject it.
  
if(p_value <  alpha):
    print("Reject Null Hypothesis")
else:
    print("Fail to Reject NUll Hypothesis")

test critical: 36.415 ; alpha: 0.050
test stat: 26.923 ; p_value: 0.308
Fail to Reject NUll Hypothesis


## GOODNESS OF FIT TEST

In [None]:
# H0: Unbiased or equally likely i.e. observed and expected are same, good fit
# H1: Biased is unfair, observed and expected are different
# Goodness-of-fit hypothesis tests are always right-tailed

In [20]:
import os
os.chdir("C:\\Users\\satish\\Desktop")

df = pd.read_excel("MSE_data.xlsx", sheet_name = "CHI2 Goodness of fit")

In [21]:
# Total frequency
total_freq = df['observed'].sum()
print('Total Frequency : ', total_freq)
# Expected frequency

expected_freq = total_freq / df.shape[0]
print('Expected Frequency : ', expected_freq) 

Total Frequency :  90
Expected Frequency :  15.0


In [22]:
df['expected'] = expected_freq
df 

Unnamed: 0,face,observed,expected
0,1,17,15.0
1,2,11,15.0
2,3,18,15.0
3,4,12,15.0
4,5,15,15.0
5,6,17,15.0


In [36]:
### using built-in scipy

# expected is optional as default is equally likely
test_stat, p_value = stats.chisquare(df['observed'], df['expected']) 
n = 6

print("test critical: %.5f ; alpha: %.5f" %(stds.pval_to_chi2(alpha, n-1, "right"), alpha))

print("test stat: %.3f ; p_value: %.3f" %(test_stat, p_value))

if(p_value <  alpha):
    print("Reject Null Hypothesis")
else:
    print("Fail to Reject Null Hypothesis")

test critical: 11.07050 ; alpha: 0.05000
test stat: 2.800 ; p_value: 0.731
Fail to Reject Null Hypothesis


In [None]:
# Implies dice is unbiased since obs = exp

In [37]:
stats.chisquare([47,3,1,0,3,11,3,3,1,1])

Power_divergenceResult(statistic=251.52054794520552, pvalue=4.76346501215303e-49)

In [None]:
## For Benford's law: P(d) = log10(d+1) – log10(d) -> only for digits 1 to 9
## Exp freq = sum(obs_freq * p(d)). Proceed as above
 

## Independence test Contingency table

In [None]:
# H0: The variables are independent
# H1: Variables are dependent with some relation

In [43]:
# generate a df with columns voter race and party denoting who voted for which party

np.random.seed(10)

# Sample data randomly at fixed probabilities
voter_race = np.random.choice(a= ["asian","black","hispanic","other","white"],
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                              size=1000)

# Sample data randomly at fixed probabilities
voter_party = np.random.choice(a= ["democrat","independent","republican"],
                              p = [0.4, 0.2, 0.4],
                              size=1000)

voters = pd.DataFrame({"race":voter_race, 
                       "party":voter_party})


In [44]:
voters.describe()

Unnamed: 0,race,party
count,1000,1000
unique,5,3
top,white,republican
freq,497,417


In [45]:
# form contingency table
voter_tab = pd.crosstab(voters.race, voters.party, margins = True)

voter_tab.columns = ["democrat","independent","republican","row_totals"]

voter_tab.index = ["asian","black","hispanic","other","white","col_totals"]

observed = voter_tab.iloc[0:5,0:3]   # Get table without totals for later use
voter_tab

Unnamed: 0,democrat,independent,republican,row_totals
asian,21,7,32,60
black,65,25,64,154
hispanic,107,50,94,251
other,15,8,15,38
white,189,96,212,497
col_totals,397,186,417,1000


In [46]:
observed

Unnamed: 0,democrat,independent,republican
asian,21,7,32
black,65,25,64
hispanic,107,50,94
other,15,8,15
white,189,96,212


In [47]:
# The output shows the chi-square statistic, the p-value and the degrees of freedom followed by the expected counts.

test_stat, p_value, df, expected_counts = stats.chi2_contingency(observed = observed)

(7.169321280162059,
 0.518479392948842,
 8,
 array([[ 23.82 ,  11.16 ,  25.02 ],
        [ 61.138,  28.644,  64.218],
        [ 99.647,  46.686, 104.667],
        [ 15.086,   7.068,  15.846],
        [197.309,  92.442, 207.249]]))

In [48]:
print("test critical: %.5f ; alpha: %.5f" %(stds.pval_to_chi2(alpha, n-1, "right"), alpha))

print("test stat: %.3f ; p_value: %.3f" %(test_stat, p_value))

if(p_value <  alpha):
    print("Reject Null Hypothesis")
else:
    print("Fail to Reject Null Hypothesis")

test critical: 11.07050 ; alpha: 0.05000
test stat: 2.800 ; p_value: 0.731
Fail to Reject Null Hypothesis


In [49]:
# No relation between voters race and party

# Also used for test of homogeneity
# Also can use matrix directly:

stats.chi2_contingency([[560,308],[240, 92]])



(6.184241574593305,
 0.012889293542101093,
 1,
 array([[578.66666667, 289.33333333],
        [221.33333333, 110.66666667]]))