# Test of Independence / Test of Association / Chi-Square Test

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

## Titanic 

In [2]:
# Read the Titanic passenger records from the CSV that sits next to this
# notebook, then echo the frame as the cell output.
titanic = pd.read_csv(filepath_or_buffer='TitanicSurvival.csv')
titanic

Unnamed: 0,Name,survived,gender,passengerClass
0,"Allen, Miss. Elisabeth Walton",yes,female,1st
1,"Allison, Master. Hudson Trevor",yes,male,1st
2,"Allison, Miss. Helen Loraine",no,female,1st
3,"Allison, Mr. Hudson Joshua Crei",no,male,1st
4,"Allison, Mrs. Hudson J C (Bessi",no,female,1st
...,...,...,...,...
1304,"Zabour, Miss. Hileni",no,female,3rd
1305,"Zabour, Miss. Thamine",no,female,3rd
1306,"Zakarian, Mr. Mapriededer",no,male,3rd
1307,"Zakarian, Mr. Ortin",no,male,3rd


In [3]:
# Cross-tabulate passenger counts: gender on the rows, survival on the columns.
pd.crosstab(index=titanic['gender'], columns=titanic['survived'])

survived,no,yes
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,127,339
male,682,161


In [4]:
# Build the gender (rows) x survived (columns) contingency table of passenger
# counts. Counting the 'Name' column tallies the rows with a non-null name in
# each gender/outcome cell.
# fill_value=0 keeps a combination with no passengers as a count of 0 rather
# than NaN, which would otherwise propagate into the chi-square test.
df = pd.pivot_table(
    data=titanic,
    index='gender',
    columns='survived',
    values='Name',
    aggfunc='count',
    fill_value=0,
)

In [5]:
# Display the gender x survived count table held in `df`.
df

survived,no,yes
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,127,339
male,682,161


In [7]:
# Chi-square test of independence on the contingency table in `df`.
# chi2_contingency returns (statistic, p-value, degrees of freedom, expected
# frequencies). The degrees of freedom go into `dof`, not `df`, so the table is
# not overwritten by an integer and this cell can be re-run safely.
chi2, pvalue, dof, expfreq = chi2_contingency(df)

In [8]:
# Display the p-value unpacked from the chi2_contingency result.
pvalue

4.589924936952945e-81

In [None]:
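# The p-value (~4.6e-81) is far below any conventional significance level,
# so we reject the null hypothesis that gender and survival are independent.
# Conclusion: gender and survived are associated.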

In [9]:
# Build the contingency table of passenger counts with gender on the rows and
# every (survived, passengerClass) combination on the columns. Counting the
# 'Name' column tallies the rows with a non-null name in each cell.
# fill_value=0 keeps a combination with no passengers as a count of 0 rather
# than NaN, which would otherwise propagate into the chi-square test.
df = pd.pivot_table(
    data=titanic,
    index='gender',
    columns=['survived', 'passengerClass'],
    values='Name',
    aggfunc='count',
    fill_value=0,
)

In [10]:
# Display the gender x (survived, passengerClass) count table held in `df`.
df

survived,no,no,no,yes,yes,yes
passengerClass,1st,2nd,3rd,1st,2nd,3rd
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,5,12,110,139,94,106
male,118,146,418,61,25,75


In [11]:
# Show the full chi-square result for the table in `df`: statistic, p-value,
# degrees of freedom and the expected-frequency array.
chi2_contingency(observed=df)

(397.2343394203484,
 1.1703719501263788e-83,
 5,
 array([[ 43.78762414,  56.24751719, 187.96638655,  71.19938885,
          42.36363636,  64.43544691],
        [ 79.21237586, 101.75248281, 340.03361345, 128.80061115,
          76.63636364, 116.56455309]]))

In [12]:
# Chi-square test of independence on the contingency table in `df`.
# chi2_contingency returns (statistic, p-value, degrees of freedom, expected
# frequencies). The degrees of freedom go into `dof`, not `df`, so the table is
# not overwritten by an integer and this cell can be re-run safely.
chi2, pvalue, dof, expfreq = chi2_contingency(df)

In [13]:
# Display the p-value unpacked from the chi2_contingency result.
pvalue

1.1703719501263788e-83

In [14]:
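# The p-value (~1.2e-83) is far below any conventional significance level,
# so we reject the null hypothesis of independence.
# Conclusion: gender is associated with the combined (survived, passengerClass) outcome.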

## Ice cream 

In [15]:
# Read the ice-cream preference records from the CSV that sits next to this
# notebook, then echo the frame as the cell output.
icecream = pd.read_csv(filepath_or_buffer='Ice_cream.csv')
icecream

Unnamed: 0,Gender,Flavor
0,Male,Chocolate
1,Female,Vanilla
2,Male,Strawberry
3,Male,Strawberry
4,Male,Vanilla
...,...,...
195,Female,Vanilla
196,Female,Vanilla
197,Female,Vanilla
198,Female,Vanilla


In [16]:
# Contingency table of respondent counts: Gender on the rows, Flavor on the columns.
df = pd.crosstab(index=icecream['Gender'], columns=icecream['Flavor'])

In [17]:
# Display the Gender x Flavor count table held in `df`.
df

Flavor,Chocolate,Strawberry,Vanilla
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,32,29,48
Male,15,29,47


In [18]:
# Chi-square test of independence on the contingency table in `df`.
# chi2_contingency returns (statistic, p-value, degrees of freedom, expected
# frequencies). The degrees of freedom go into `dof`, not `df`, so the table is
# not overwritten by an integer and this cell can be re-run safely.
chi2, pvalue, dof, expfreq = chi2_contingency(df)

In [19]:
# Display the p-value unpacked from the chi2_contingency result.
pvalue

0.10144219001975929

In [20]:
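# The p-value (~0.101) is above the conventional 0.05 significance level,
# so there is not enough evidence to reject the null hypothesis of independence.
# No significant association between Gender and Flavor was detected
# (this does not prove that the two variables are independent).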

In [21]:
# The End!