In [1]:
import pandas as pd
from scipy import stats
from pydataset import data

# Load the fuel-economy dataset and derive a coarse transmission label.
mpg = data('mpg')

# `trans` values look like 'auto(l5)' or 'manual(m6)' in the rows displayed below;
# dropping the last four characters leaves just 'auto' / 'manual'.
# NOTE(review): assumes every value ends in a 4-character '(xx)' suffix -- confirm.
mpg['transmission'] = mpg['trans'].str.slice(stop=-4)
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,transmission
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,auto
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,manual
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,manual
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,auto
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,auto


In [2]:
# Total number of observations (rows) in the dataset.
n = len(mpg)

# Share of observations in each transmission category: category counts divided by n.
transmission_proportions = mpg['transmission'].value_counts().div(n)
transmission_proportions


auto      0.67094
manual    0.32906
Name: transmission, dtype: float64

In [4]:
# Share of observations in each drive-type category: category counts divided by n.
drive_proportions = mpg['drv'].value_counts().div(n)
drive_proportions


f    0.452991
4    0.440171
r    0.106838
Name: drv, dtype: float64

In [5]:
# Expected joint proportion of each (drive, transmission) pair if the two variables
# were independent: the product of the two marginal proportions.
#
# Series.iteritems() was removed in pandas 2.0, so iterate with .items(). Each
# column is built with one vectorised multiplication rather than by enlarging an
# empty frame one cell at a time through .loc.
expected = pd.DataFrame({
    transmission_group: drive_proportions * t_prop
    for transmission_group, t_prop in transmission_proportions.items()
})

# Order the rows by drive label; reassign instead of mutating in place.
expected = expected.sort_index()
expected


Unnamed: 0,auto,manual
4,0.295328,0.144843
f,0.30393,0.149061
r,0.071682,0.035156


In [6]:
# Convert expected proportions into expected counts by scaling by the sample size.
# NOTE: re-running this cell scales `expected` again, so run it exactly once after
# the cell that builds the proportions.
expected = expected.mul(n)
expected


Unnamed: 0,auto,manual
4,69.106838,33.893162
f,71.119658,34.880342
r,16.773504,8.226496


In [7]:
# Observed counts: contingency table of drive type (rows) by transmission (columns).
observed = pd.crosstab(index=mpg['drv'], columns=mpg['transmission'])
observed


transmission,auto,manual
drv,Unnamed: 1_level_1,Unnamed: 2_level_1
4,75,28
f,65,41
r,17,8


In [8]:
# Chi-squared statistic: sum over every cell of (observed - expected)^2 / expected.
deviation = observed - expected
chi2 = (deviation ** 2 / expected).values.sum()
chi2


3.136769245971112

In [9]:
# Degrees of freedom for the contingency table: (rows - 1) * (columns - 1).
nrows = observed.shape[0]
ncols = observed.shape[1]
degrees_of_freedom = (nrows - 1) * (ncols - 1)


In [10]:
# p-value: survival function (upper-tail probability) of the chi-squared
# distribution with `degrees_of_freedom`, evaluated at the computed statistic.
stats.chi2.sf(chi2, degrees_of_freedom)


0.20838152534979645

In [11]:
# Worked example: manual chi-squared computation on a hand-built 2x2 table of
# churn status (rows) by product (columns).
index = ['Churn', 'No Churn']
columns = ['Product A', 'Product B']

observed = pd.DataFrame(
    [[100, 50],
     [120, 28]],
    index=index,
    columns=columns,
)
n = observed.values.sum()  # grand total of all observed counts

# Expected joint proportions are hard-coded; they equal the products of this
# table's marginal proportions rounded to three decimals. Scaling by n turns
# them into expected counts.
expected_proportions = pd.DataFrame(
    [[.372, .132],
     [.367, .130]],
    index=index,
    columns=columns,
)
expected = expected_proportions * n

# Sum over every cell of (observed - expected)^2 / expected.
squared_gap = (observed - expected) ** 2
chi2 = (squared_gap / expected).values.sum()

nrows, ncols = observed.shape
degrees_of_freedom = (nrows - 1) * (ncols - 1)

# Upper-tail probability of the chi-squared distribution at the statistic.
p = stats.chi2.sf(chi2, degrees_of_freedom)

print('Observed')
print(observed)
print('---\nExpected')
print(expected)
print('---\n')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')


Observed
          Product A  Product B
Churn           100         50
No Churn        120         28
---
Expected
          Product A  Product B
Churn       110.856     39.336
No Churn    109.366     38.740
---

chi^2 = 7.9656
p     = 0.0048


In [12]:
# Rebuild the observed drive-by-transmission table from the mpg data; the name
# `observed` was reused for the churn example in the previous cell.
observed = pd.crosstab(index=mpg['drv'], columns=mpg['transmission'])

In [14]:
### THE EASY WAY

In [13]:
# Let scipy do the whole test: it returns the statistic, the p-value, the degrees
# of freedom and the table of expected counts for the observed contingency table.
test_result = stats.chi2_contingency(observed)
chi2, p, degf, expected = test_result

# Report the observed counts, the expected counts, and the test summary.
print('Observed\n')
print(observed.values)
print('---\nExpected\n')
print(expected)
print('---\n')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')


Observed

[[75 28]
 [65 41]
 [17  8]]
---
Expected

[[69.10683761 33.89316239]
 [71.11965812 34.88034188]
 [16.77350427  8.22649573]]
---

chi^2 = 3.1368
p     = 0.2084


# CHI SQUARED TEST FOR INDEPENDENCE
- WE'LL FEED PD.CROSSTAB TWO SERIES, THEN PASS THE RESULTING TABLE TO OUR CHI2 FUNCTION
- OUTPUT FROM CHI2 FUNCTION THAT WE CARE ABOUT IS THE 'P' VALUE

## PROCESS
1. STATE OUR NULL HYPOTHESIS. THE CHI2 NULL HYPOTHESIS IS:
        - THERE IS NO RELATIONSHIP BETWEEN A AND B
        - A AND B CATEGORIES ARE INDEPENDENT.
        
2. RUN 'OBSERVED = PD.CROSSTAB(DF.A, DF.B)', THEN PASS OBSERVED TO STATS.CHI2_CONTINGENCY TO GET THE P VALUE

3. COMPARE YOUR P TO ALPHA. IF P < ALPHA THEN WE REJECT THE NULL; WE HAVE EVIDENCE SUPPORTING THE ALTERNATIVE HYPOTHESIS.

In [15]:
import pandas as pd
from scipy import stats
from pydataset import data

# Load the restaurant tips dataset and preview its first five rows.
df = data("tips")
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [16]:
# Let's investigate smoking status and time of day (Lunch vs Dinner)
# The null hypothesis is that they are independent

# Step 1, make a crosstab of the two variables we care about investigating:
# smoker status on the rows, meal time on the columns

observed = pd.crosstab(index=df['smoker'], columns=df['time'])
observed

time,Dinner,Lunch
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,106,45
Yes,70,23


In [18]:
# Significance level for the tests below: reject the null hypothesis when p < alpha.
# 0.05 is the conventional 5% threshold; the previous value of 0.5 would have
# rejected the null on very weak evidence (e.g. p of about 0.48).
alpha = 0.05

In [19]:
# Run the chi-squared test of independence on the current `observed` table and
# show its p-value. For a table with one degree of freedom, scipy applies Yates'
# continuity correction by default.
test_result = stats.chi2_contingency(observed)
chi2, p, degf, expected = test_result
p

0.4771485672079724

In [20]:
### what about gender and day?

In [21]:
# Step 1 : set your alpha and define your null hypothesis
# null = gender and day are independent

In [22]:
# Step 2: calculate the observed values with a crosstab
# (sex on the rows, day of the week on the columns)
observed = pd.crosstab(index=df['sex'], columns=df['day'])
observed

day,Fri,Sat,Sun,Thur
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,9,28,18,32
Male,10,59,58,30


In [23]:
# Step 3, run chi2_contingency on the observed table to get the p value
test_result = stats.chi2_contingency(observed)
chi2, p, degf, expected = test_result
p

0.004180302092822257

### What about time of day and which day?
    - null hypothesis: time and day are independent

In [25]:
# Observed counts: meal time (rows) by day of the week (columns).
observed = pd.crosstab(index=df['time'], columns=df['day'])
observed

day,Fri,Sat,Sun,Thur
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,12,87,76,1
Lunch,7,0,0,61


In [26]:
# Chi-squared test of independence for the time-by-day table; display the p-value.
test_result = stats.chi2_contingency(observed)
chi2, p, degf, expected = test_result
p

8.449897551777147e-47