In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
from scipy import stats

# Chi$^2$ ($\chi^2$) Test for Independence

aka Pearson's Chi$^2$ test. Pronounced as 'Ki' as in kite.

- Lets us test the hypothesis that one group is independent of another
- $H_0$ is always that there is independence between the groups
- $H_0$ is that there is no dependence


The null hypothesis assumes that the observed frequencies for a categorical variable match the expected frequencies for the categorical variable

## The Quick Way To Run a Chi$^2$ Test

In [2]:
# get data from pydataset
df = data('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


- $H_0$ There is independence between the smoker and time of the
day
- $H_a$ is that there is a dependence

In [6]:
#make a crosstab of the variables that you want to compare
observed = pd.crosstab(df.time, df.smoker, margins = True) #margins provides the totals of rowas and columns
observed

smoker,No,Yes,All
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,106,70,176
Lunch,45,23,68
All,151,93,244


In [10]:
# pandas crosstab to make a 'contingency' table
observed = pd.crosstab(df.time, df.smoker, margins = True, normalize = 'columns')
observed

smoker,No,Yes,All
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,0.701987,0.752688,0.721311
Lunch,0.298013,0.247312,0.278689


In [14]:
# Set our alpha
alpha = .01

In [12]:
# chi2_contingency returns 4 different values
chi2, p, degf, expected = stats.chi2_contingency(observed)
chi2, p, degf, expected

(0.00657303303193494,
 0.9967188781679133,
 2,
 array([[0.7253288, 0.7253288, 0.7253288],
        [0.2746712, 0.2746712, 0.2746712]]))

In [16]:
if p < alpha:
    print("we reject the null")
else:
    print("we fail to reject the null")

we fail to reject the null


## Attrition Data

In [17]:
# get your data
df = pd.read_csv("https://gist.githubusercontent.com/ryanorsinger/6ba2dd985c9aa92f5598fc0f7c359f6a/raw/b20a508cee46e6ac69eb1e228b167d6f42d665d8/attrition.csv")

In [18]:
df.head(3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0


In [19]:
# check shape of the dataframe
df.shape

(1470, 35)

In [20]:
# Check for which columns are discrete
df.nunique()

Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeCount                  1
EmployeeNumber              1470
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBa

In [None]:
# Question we want to answer:

# 1. Is Attrition independent from Business Travel amount?
# 2. Is Attrition independent from Department?
# 3. Is Attrition indpendent from WorkLife balance

In [21]:
df.BusinessTravel.value_counts()

Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: BusinessTravel, dtype: int64

Form hypothesis:

$H_0$: Attrition and Travel Frequency are independent (not dependent)

$H_a$: Attrition and Travel Frequency are dependent

In [22]:
# cross tab Attrition vs Business Travel
observed = pd.crosstab(df.Attrition, df.BusinessTravel)

In [23]:
observed

BusinessTravel,Non-Travel,Travel_Frequently,Travel_Rarely
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,138,208,887
Yes,12,69,156


In [24]:
# Set our alpha
alpha = .01

In [25]:
# .chi2_contingency returns 4 different values
chi2, p, degf, expected = stats.chi2_contingency(observed)

In [26]:
chi2, p, degf

(24.182413685655174, 5.608614476449931e-06, 2)

In [27]:
expected 

array([[125.81632653, 232.34081633, 874.84285714],
       [ 24.18367347,  44.65918367, 168.15714286]])

In [28]:
null_hypothesis = "Attrition and Business Travel are independent"

if p < alpha:
    print("We reject the null hypothesis")
    print("We reject the hypothesis that", null_hypothesis)
else:
    print("We fail to reject the null hypothesis")

print(p)

We reject the null hypothesis
We reject the hypothesis that Attrition and Business Travel are independent
5.608614476449931e-06


In [30]:
#Normalized crosstab
observed = pd.crosstab(df.Attrition, df.BusinessTravel, normalize = 'columns')

In [32]:
# make a heatmap
sns.heatmap.observed

AttributeError: 'function' object has no attribute 'observed'

## Let's Test for Independence of Attrition and Deparment
- $H_0$: There is no relationship between them, Attrition and Deparment are independent
- $H_a$: There is a relationship

In [33]:
df.Department.value_counts

<bound method IndexOpsMixin.value_counts of 0                        Sales
1       Research & Development
2       Research & Development
3       Research & Development
4       Research & Development
                 ...          
1465    Research & Development
1466    Research & Development
1467    Research & Development
1468                     Sales
1469    Research & Development
Name: Department, Length: 1470, dtype: object>

In [35]:
#crosstab for observed values between Attrition and Depts
observed = pd.crosstab(df.Attrition, df.Department)
observed

Department,Human Resources,Research & Development,Sales
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,51,828,354
Yes,12,133,92


In [38]:
# Let's get the p value from a chi2 test for independence
chi2, p, degf, expected = stats.chi2_contingency(observed)
chi2, p, degf, expected

(10.79600732241067,
 0.004525606574479633,
 2,
 array([[ 52.84285714, 806.06326531, 374.09387755],
        [ 10.15714286, 154.93673469,  71.90612245]]))

In [37]:
if p < alpha:
    print("We reject the null")
else:
    print("We fail to reject the null")



We reject the null


In [None]:
# Is attrition and being in sales related?

In [39]:
df["in_sales"] = df.Department == "Sales"

In [40]:
df.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,in_sales
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,80,0,8,0,1,6,4,0,5,True
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,80,1,10,3,3,10,7,1,7,False


In [43]:
# crosstab between Attrition and in_sales column
observed = pd.crosstab(df.Attrition, df.in_sales)
observed

in_sales,False,True
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
No,879,354
Yes,145,92


#### $H_0$: Attrition and Being in Sales or Not are independent
#### $H_a$: There is a relationship

In [44]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null")
else:
    print("We fail to reject the null")

p

We reject the null


0.0025036788527795267

## Let's test for independence between WorkLifeBalance and Attrition
- $H_0$: WorkLifeBalance and Attrition are independent, no relationship
- $H_a$: They are dependent - there is a relationship

WorkLifeBalance   
1 'Bad'  
2 'Good'  
3 'Better'  
4 'Best'  

In [45]:
#look at value counts
df.WorkLifeBalance.value_counts()

3    893
2    344
4    153
1     80
Name: WorkLifeBalance, dtype: int64

In [47]:
# Crosstab for Attrition and WorklifeBalance
observed = pd.crosstab(df.Attrition, df.WorkLifeBalance)
observed

WorkLifeBalance,1,2,3,4
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,55,286,766,126
Yes,25,58,127,27


In [52]:
chi2, p, degf, expected = stats.chi2_contingency(observed)
print("Chi: ", chi2, "p-value: ", p, "DOF: ", degf, "expected: ", expected)
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null")

Chi:  16.3250970916474 p-value:  0.0009725698845348824 DOF:  3 expected:  [[ 67.10204082 288.53877551 749.02653061 128.33265306]
 [ 12.89795918  55.46122449 143.97346939  24.66734694]]
We reject the null hypothesis


In [48]:
p

0.0025036788527795267

In [None]:
# Now, let's control for Department

In [53]:
# df.Department.value_counts()
df.Department.value_counts()

Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64

In [54]:
# make new dataframes for each dept
rnd = df[df.Department == 'Research & Development']
sales = df[df.Department == 'Sales']
human = df[df.Department == 'Human Resources']

In [55]:
# Run the chi squared test for independence on only RND
observed = pd.crosstab(rnd.Attrition, rnd.WorkLifeBalance)
observed

WorkLifeBalance,1,2,3,4
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,41,203,507,77
Yes,19,32,68,14


In [56]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null")
p

We reject the null hypothesis


0.0004119601633396577

In [58]:
# how about for sales?
observed = pd.crosstab(sales.Attrition, sales.WorkLifeBalance)
observed

WorkLifeBalance,1,2,3,4
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,10,78,226,40
Yes,6,24,50,12


In [59]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null")
p

We fail to reject the null


0.20695513054029363

In [61]:
# How about for HR?
observed = pd.crosstab(human.Attrition, human.WorkLifeBalance)
observed

WorkLifeBalance,1,2,3,4
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,4,5,33,9
Yes,0,2,9,1


In [62]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null")
p

We fail to reject the null


0.5645088718158451

## Findings So Far:
$H_0$ is that there is no relationship. Worklife and Attrition are indpendent

- Research and Development, we reject the null hypothesis
- Sales, we fail to reject the null. This could be due to small population size.
- HR, we fail to reject the null. This could be due to small population size

## A very small chi square test statistic means that your observed data fits your expected data extremely well. In other words, there is a relationship.

## A very large chi square test statistic means that the data does not fit very well. In other words, there isn’t a relationship.

In [None]:
# If you're 8 feet away from a door and with each move you advance half the distance to the door. 
# How many moves will it take to reach the door?

In [84]:
def reach_door(x):
    if x < 0:
        print("you don't want to move further from the door, try positive number")
    elif x > 0:
        print("the door will always be half the distance away...")
    return "You are ", x * (1/2), "ft away from the door..."

In [85]:
reach_door(.125)

the door will always be half the distance away...


('You are ', 0.0625, 'ft away from the door...')