# Chi-Square Test-
The test is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant association between the two variables.

In [1]:
import scipy.stats as stats
import numpy as np
import pandas as pd
import seaborn as sns


In [3]:
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
#Rship between sex and smoker

# Use pd.crosstab for establishing the chisquare test rship
# STEP 1 : Pick up 2 categorical features 
# Step 2 : Pick up inbuilt function - cross tab creates a matrix 

df_table= pd.crosstab(df['sex'],df['smoker'])
df_table

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [5]:
df_table.values

array([[60, 97],
       [33, 54]], dtype=int64)

In [6]:
type(df_table)

pandas.core.frame.DataFrame

In [19]:
Observed_Values=df_table.values
print ("Observed_values: \n" ,Observed_Values)

Observed_values: 
 [[60 97]
 [33 54]]


### stats.chi2_contingency - 
This function computes the chi-square statistic and p-value for the
hypothesis test of independence of the observed frequencies in the
contingency table [1]_ `observed`'''



In [20]:

value=stats.chi2_contingency(df_table)
value

(0.008763290531773594, 0.925417020494423, 1, array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [25]:
Expected_Values=value[3]

In [26]:
rows=len(df_table.iloc[0:2,0])
columns= len(df_table.iloc[0,0:2])
dof=(rows-1) * (columns-1)
print( "Degree of freedom :" , dof)
alpha=0.5

Degree of freedom : 1


In [27]:
# formula for chisquare X^2 = Sum [(observed-expected)^2 / expected]

In [29]:
from scipy.stats import chi2
chi_square=sum([(o-e)**2./e for o,e in zip(Observed_Values,Expected_Values)])
print (chi_square)
chi_square_statistic=chi_square[0]+chi_square[1]
print("chi-square statistic:",chi_square_statistic)

[0.00119737 0.00073745]
chi-square statistic: 0.001934818536627623


In [31]:
critical_value=chi2.ppf(q=1-alpha,df=dof)
print('critical_value:',critical_value)

critical_value: 0.4549364231195725


In [32]:
if chi_square_statistic>=critical_value:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")

Retain H0,There is no relationship between 2 categorical variables


In [34]:
#p-value - 2nd way of computing
p_value=1-chi2.cdf(x=chi_square_statistic,df=dof)
print('p-value:',p_value)
print('Significance level: ',alpha)
print('Degree of Freedom: ',dof)


if p_value<=alpha:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")



p-value: 0.964915107315732
Significance level:  0.5
Degree of Freedom:  1
Retain H0,There is no relationship between 2 categorical variables
