<a href="https://colab.research.google.com/github/rahulsait/ExcelR-codes/blob/main/CHI_SQUARE_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **The chi-square test of independence is applied when you have two categorical variables from a single population.**
## **It is used to determine whether there is a significant association between the two variables.**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
from scipy import stats

In [2]:
# Load seaborn's "tips" example dataset (restaurant bills, tips and diner attributes)
ds = sns.load_dataset("tips")

In [3]:
# Preview the first rows; sex, smoker, day and time are the categorical columns
ds.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Contingency table of observed counts for the two categorical columns:
# one row per value of "sex", one column per value of "smoker".
ds_table = pd.crosstab(index=ds["sex"], columns=ds["smoker"])
print(ds_table)

smoker  Yes  No
sex            
Male     60  97
Female   33  54


In [5]:
# The observed counts as a bare NumPy array (row/column labels dropped)
ds_table.values

array([[60, 97],
       [33, 54]])

In [8]:
# Keep the observed frequencies as an array for the manual chi-square computation
observed_values = ds_table.values
print("Observed Values : \n", observed_values)

Observed Values : 
 [[60 97]
 [33 54]]


In [9]:
# chi2_contingency returns (statistic, p-value, dof, expected frequencies).
# NOTE: for a 2x2 table (dof == 1) SciPy applies Yates' continuity correction
# by default (correction=True). That is why the statistic and p-value it
# reports for this table (0.0 and 1.0) differ from the uncorrected values
# computed by hand later in this notebook. Passing correction=False would
# reproduce the manual result; the expected frequencies are the same either way.
val = stats.chi2_contingency(ds_table)

In [10]:
# Inspect the result: (chi-square statistic, p-value, degrees of freedom, expected frequencies)
val

(0.0, 1.0, 1, array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [12]:
# Element 3 of the chi2_contingency result is the array of expected frequencies
expected_values = val[3]
print("Expected Values : \n", expected_values)

Expected Values : 
 [[59.84016393 97.15983607]
 [33.15983607 53.84016393]]


In [13]:
# Number of rows and columns of the contingency table.
# Read them from the table's shape instead of hard-coded 0:2 slices, which
# would cap both counts at 2 and give a wrong degrees-of-freedom value for
# any table larger than 2x2.
no_of_rows, no_of_columns = ds_table.shape

In [15]:
# Show the row count and the column count, one per line.
print(no_of_rows, no_of_columns, sep="\n")

2
2


In [16]:
# Degrees of freedom for a test of independence:
# (number of rows - 1) * (number of columns - 1)
row_dof = no_of_rows - 1
column_dof = no_of_columns - 1
dof = row_dof * column_dof
print("Degrees of Freedom : \n", dof)

Degrees of Freedom : 
 1


In [17]:
alpha = 0.05 # significance level used for the critical value and the p-value comparison


In [19]:
from scipy.stats import chi2

# Contribution of every cell: (observed - expected)^2 / expected.
# Summing down the rows leaves one partial sum per table column; those
# partial sums are added together afterwards to form the test statistic.
cell_contributions = (observed_values - expected_values) ** 2. / expected_values
chi_square = cell_contributions.sum(axis=0)

In [20]:
# Per-column partial sums of the chi-square contributions
chi_square

array([0.00119737, 0.00073745])

In [23]:
# chi_square holds one partial sum per table column (not per row); adding
# them all up gives the chi-square test statistic. Summing the whole array,
# rather than indexing elements 0 and 1, keeps the statistic correct for
# tables with more than two columns.
chi_square_statistic = chi_square.sum()
print("Chi-square statistic : \n", chi_square_statistic)

Chi-square statistic : 
 0.001934818536627623


In [25]:
# Critical value: the chi-square quantile at probability (1 - alpha)
# for the given degrees of freedom.
confidence_level = 1 - alpha
critical_value = chi2.ppf(confidence_level, dof)
print("Critical Value : \n", critical_value)

Critical Value : 
 3.841458820694124


In [27]:
# p-value: probability that a chi-square variable with `dof` degrees of
# freedom is at least as large as the observed statistic.
# The survival function sf(x) equals 1 - cdf(x) but is evaluated directly,
# so it does not lose precision through subtraction when the p-value is tiny.
p_value = chi2.sf(chi_square_statistic, dof)
print("p-value : \n", p_value)

p-value : 
 0.964915107315732


In [28]:
# Summary of the quantities that drive the decision, one per line.
summary_rows = (
    ("Chi-Square Test Statistic : ", chi_square_statistic),
    ("Critical value : ", critical_value),
    ("p-value : ", p_value),
    ("Significance value : ", alpha),
)
for label, value in summary_rows:
    print(label, value)

Chi-Square Test Statistic :  0.001934818536627623
Critical value :  3.841458820694124
p-value :  0.964915107315732
Significance value :  0.05


In [29]:
# Method 1 : critical-value approach.
# Ho is rejected only when the test statistic reaches the critical value;
# otherwise it is kept.
critical_value_verdict = (
    "Reject Ho - There is a significant relation between the variables"
    if chi_square_statistic >= critical_value
    else "Accept Ho - No significant relation between the variables"
)
print(critical_value_verdict)

Accept Ho - No significant relation between the variables


In [30]:
# Method 2 : p-value approach.
# Ho is kept whenever the p-value is at least the significance value;
# otherwise it is rejected.
p_value_verdict = (
    "Accept Ho - No significant relation between the variables"
    if p_value >= alpha
    else "Reject Ho - There is a significant relation between the variables"
)
print(p_value_verdict)

Accept Ho - No significant relation between the variables
