In [1]:
import pandas as pd 
from scipy.stats import chi2_contingency, mannwhitneyu, ttest_ind

In [2]:
dset = pd.read_csv('general_data.csv')
dset.Attrition = dset.Attrition.replace(['Yes','No'],[0,1])

In [3]:
dset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,1,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,0,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,1,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,1,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,1,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


## Non-Parametric test

# Chi_sqr test 
# bet categorical variable 'attrition' and 'Gender' below:
# H0 :There is no significant relation between male and female with Attrition rates
# Ha : There is a significant relation.

In [4]:
chi_table = pd.crosstab(dset.Attrition,dset.Gender)

In [5]:
chi_table

Gender,Female,Male
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,270,441
1,1494,2205


In [6]:
stat,p,dof,expected = chi2_contingency(chi_table)
print(stat,p)

1.349904410246582 0.24529482862926827


## Inference:
Since the p > .05, we accept the null hypothesis(H0), which says both male and female Attrition rate s are almost equal.

# bet categorical variable 'attrition' and 'Department' below:
## H0 :There is no significant relation with Attrition rates between employess of diff department.
## Ha : There is a significant relation.

In [7]:
chi_table = pd.crosstab(dset.Attrition,dset.Department)

In [8]:
stat,p,dof,expected = chi2_contingency(chi_table)
print(stat,p)

29.090274924488263 4.820888218170407e-07


In [9]:
chi_table

Department,Human Resources,Research & Development,Sales
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,57,453,201
1,132,2430,1137


## Inference:
Since the p < .05, we reject the null hypothesis(H0), and we can see from the chi_table that the attrition rate id different in different department.

# Below we will split the dataset in two indepent dataset based on attrition values 0 or 1

In [10]:
a = dset.groupby('Attrition')

In [11]:
Attr_Yes = a.get_group(1)
Attr_No = a.get_group(0)

# Mann whitney test :
# 1. Monthly income                                                                                                               
H0: There is no significant difference between monthly Income between the Attr_yes and Attr_No.                                  
Ha: There is a significant difference between the two groups

In [12]:
stat,p = mannwhitneyu(Attr_Yes.MonthlyIncome,Attr_No.MonthlyIncome)
print(stat,p)

1264900.5 0.053577283839938566


# Inference:
Since the p > .05, we accept the null hypothesis(H0), So there is no significant difference between the two group when it comes to monthly income

# 2.Years At company                                                                                                                
H0: There is no significant difference between the Attr_yes and Attr_No.                                  
Ha: There is a significant difference between the two groups

In [13]:
stat,p = mannwhitneyu(Attr_Yes.YearsAtCompany,Attr_No.YearsAtCompany)
print(stat,p)

923238.0 6.047598261693028e-37


# Inference:
Since the p<.05, we reject the null hypothesis(H0), So there is significant difference between the two group when it comes
to Years at the company

# 2.Total working years                                                                                                                
H0: There is no significant difference between the Attr_yes and Attr_No.                                  
Ha: There is a significant difference between the two groups

In [14]:

stat,p = mannwhitneyu(Attr_Yes.TotalWorkingYears,Attr_No.TotalWorkingYears)
print(stat,p)

909958.0 2.894653260702919e-39


# Inference:
Since the p<.05, we reject the null hypothesis(H0), So there is significant difference between the two group when it comes
to total working years.

## Parametric test
# 2 sample Independent test :
1. # 1.Distance from home                                                                                                                
H0: There is no significant difference in the mean of Distance from home between the Attr_yes and Attr_No.                                  
Ha: There is a significant difference between the two groups

In [16]:
stat,p = ttest_ind(Attr_Yes.DistanceFromHome,Attr_No.DistanceFromHome)
print(stat,p)

0.6460416038042738 0.518286042805572


# Inference:
Since the p>.05, we accept the null hypothesis(H0), So there no significant difference in the mean values between the two group.

1. # 2.Percent salary hike                                                                                                               
H0: There is no significant difference in the mean Percent salary hike between the Attr_yes and Attr_No.                                  
Ha: There is a significant difference between the two groups

In [17]:
stat,p = ttest_ind(Attr_Yes.PercentSalaryHike,Attr_No.PercentSalaryHike)
print(stat,p)

-2.1610730224641777 0.03074338643339195


# Inference:
Since the p<.05, we reject the null hypothesis(H0), So there is a significant difference in the mean values between the two group' Percent salary hike.