# Correlation analysis on the Attrition dataset

In [41]:
# Imports used throughout: pandas for the table, pearsonr for the correlation tests

import pandas as pd
from scipy.stats import pearsonr

# Load the data set; the path is relative to the current working directory
dataset = pd.read_csv(filepath_or_buffer="general_data.csv")

In [42]:
# Preview the first five rows to sanity-check the load
dataset.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


Convert the Attrition column to numeric (Yes = 1, No = 0) so that the correlation can be computed

In [43]:
# Encode Attrition as an integer flag: "Yes" -> 1, "No" -> 0.
# A single dict-based replace handles both labels in one pass. The explicit
# astype("int64") pins the dtype instead of relying on replace() silently
# downcasting the object column (that implicit downcast is deprecated in
# recent pandas), and it fails loudly if any unexpected label is present.
# Re-running the cell is safe: an already-encoded 0/1 column passes through
# unchanged.
dataset["Attrition"] = (
    dataset["Attrition"]
    .replace({"Yes": 1, "No": 0})
    .astype("int64")
)

Verify the change by checking the unique values in the Attrition column

In [44]:
# After the conversion only the values 0 and 1 should remain
pd.unique(dataset["Attrition"])

array([0, 1], dtype=int64)

Check the change in the column datatype

In [45]:
# Summarise every column (non-null count and dtype); Attrition should now be numeric
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   int64  
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

Combination 1 : Attrition  - Age

In [46]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs Age
stat, p = pearsonr(dataset["Attrition"], dataset["Age"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.159205006865775 
Probability : 1.996801615893625e-26


Combination 2 : Attrition - DistanceFromHome

In [47]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs DistanceFromHome
stat, p = pearsonr(dataset["Attrition"], dataset["DistanceFromHome"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.009730141010179435 
Probability : 0.5182860428049617


Combination 3 : Attrition - MonthlyIncome

In [48]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs MonthlyIncome
stat, p = pearsonr(dataset["Attrition"], dataset["MonthlyIncome"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.031176281698114025 
Probability : 0.0384274849060192


Combination 4 : Attrition - NumCompaniesWorked [NumCompaniesWorked contains NaN values, which must be filled before the correlation can be computed]

In [49]:
# Replace missing NumCompaniesWorked entries with 0 so pearsonr receives no NaN.
# NOTE(review): 0 also occurs as a genuine value in this column (see the preview
# rows), so filled entries become indistinguishable from real zeros - confirm
# this imputation is intended.
dataset["NumCompaniesWorked"] = dataset["NumCompaniesWorked"].fillna(value=0)

In [50]:
# List the distinct values to confirm that no NaN is left after the fill
dataset["NumCompaniesWorked"].unique()

array([1., 0., 3., 4., 2., 7., 9., 5., 6., 8.])

In [51]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs NumCompaniesWorked
stat, p = pearsonr(dataset["Attrition"], dataset["NumCompaniesWorked"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : 0.0415033297119007 
Probability : 0.00584142424757897


Combination 5 : Attrition - PercentSalaryHike

In [52]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs PercentSalaryHike
stat, p = pearsonr(dataset["Attrition"], dataset["PercentSalaryHike"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : 0.03253259489105223 
Probability : 0.030743386433369824


Combination 6 : Attrition - StockOptionLevel

In [53]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs StockOptionLevel
stat, p = pearsonr(dataset["Attrition"], dataset["StockOptionLevel"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.006838852403261037 
Probability : 0.6498072937477383


Combination 7 : Attrition - TotalWorkingYears [TotalWorkingYears contains null values, which are filled with 0]

In [55]:
# Replace missing TotalWorkingYears entries with 0 so pearsonr receives no NaN.
# NOTE(review): this treats a missing value as zero years of experience -
# confirm the imputation is intended (median fill or dropping the rows are
# alternatives).
dataset["TotalWorkingYears"] = dataset["TotalWorkingYears"].fillna(value=0)

In [56]:
# List the distinct values to confirm that no NaN is left after the fill
dataset["TotalWorkingYears"].unique()

array([ 1.,  6.,  5., 13.,  9., 28., 10., 21., 16., 37.,  7.,  3., 15.,
        8.,  0., 12., 17., 19., 22.,  2.,  4., 23., 11., 24., 25., 20.,
       14., 26., 18., 30., 36., 31., 33., 32., 34., 40., 29., 35., 27.,
       38.])

In [59]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs TotalWorkingYears
stat, p = pearsonr(dataset["Attrition"], dataset["TotalWorkingYears"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.17023794049181762 
Probability : 4.959687383358696e-30


Combination 8 : Attrition - YearsAtCompany

In [61]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs YearsAtCompany
stat, p = pearsonr(dataset["Attrition"], dataset["YearsAtCompany"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.13439221398997386 
Probability : 3.163883122493571e-19


Combination 9 : Attrition - YearsSinceLastPromotion

In [62]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs YearsSinceLastPromotion
stat, p = pearsonr(dataset["Attrition"], dataset["YearsSinceLastPromotion"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.03301877514258329 
Probability : 0.02833033618939086


Combination 10 : Attrition - YearsWithCurrManager

In [63]:
# Pearson correlation coefficient and p-value: Attrition (0/1) vs YearsWithCurrManager
stat, p = pearsonr(dataset["Attrition"], dataset["YearsWithCurrManager"])
print("Correlation :", stat, "\nProbability :", p)

Correlation : -0.1561993159016244 
Probability : 1.7339322652951965e-25
