## Loading Data

In [1]:
import pandas as pd
from scipy.stats import pointbiserialr

In [2]:
# Load the employee dataset from the working directory and preview the first rows.
csv_path = "general_data.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
# Encode the Attrition labels as integers (No -> 0, Yes -> 1).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on a column selection: that chained in-place
# form is deprecated and leaves `df` untouched under pandas Copy-on-Write.
# The explicit cast keeps the column numeric on pandas versions that no longer
# downcast object columns automatically; it raises if any label other than
# "No"/"Yes" (or a missing value) is present. Re-running the cell is a no-op.
df["Attrition"] = df["Attrition"].replace({"No": 0, "Yes": 1}).astype("int64")

In [4]:
# Preview the frame again to check that Attrition now holds 0/1 instead of No/Yes.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,1,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,0,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,0,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,0,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


## Correlation Matrix

In [24]:

# Pairwise Pearson correlation of the numeric columns only.
# The frame still contains string columns (BusinessTravel, Department, ...);
# recent pandas versions raise on them instead of silently dropping them, so
# the numeric/boolean columns are selected explicitly before calling corr().
df.select_dtypes(include=["number", "bool"]).corr()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeCount,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.159205,0.006963,-0.035706,,0.008649,-0.002884,-0.044314,0.299243,-0.033137,,-0.031753,0.680661,-0.027308,0.311309,0.216513,0.202089
Attrition,-0.159205,1.0,-0.00973,-0.015111,,-0.004729,-0.01029,-0.031176,0.042345,0.032533,,-0.006839,-0.170338,-0.049431,-0.134392,-0.033019,-0.156199
DistanceFromHome,0.006963,-0.00973,1.0,-0.008638,,-0.001097,-0.037329,-0.021607,-0.013843,0.038125,,0.011169,0.009374,-0.009001,0.031684,0.00229,0.021584
Education,-0.035706,-0.015111,-0.008638,1.0,,-0.00968,0.045746,0.00641,-0.01625,-0.040531,,0.001261,-0.010717,0.010472,0.00608,0.02249,0.005358
EmployeeCount,,,,,,,,,,,,,,,,,
EmployeeID,0.008649,-0.004729,-0.001097,-0.00968,,1.0,-0.003303,0.007338,-0.001117,-0.004456,,-0.014254,-0.001063,-0.010191,0.004086,0.000256,0.008579
JobLevel,-0.002884,-0.01029,-0.037329,0.045746,,-0.003303,1.0,0.047316,-0.009875,0.010973,,0.000993,-0.036957,-0.0325,-0.064219,-0.060811,-0.055251
MonthlyIncome,-0.044314,-0.031176,-0.021607,0.00641,,0.007338,0.047316,1.0,-0.020726,0.004325,,0.02693,-0.033787,0.050112,0.000995,0.065219,0.024304
NumCompaniesWorked,0.299243,0.042345,-0.013843,-0.01625,,-0.001117,-0.009875,-0.020726,1.0,0.03086,,0.017163,0.238807,-0.032286,-0.117959,-0.036656,-0.109667
PercentSalaryHike,-0.033137,0.032533,0.038125,-0.040531,,-0.004456,0.010973,0.004325,0.03086,1.0,,0.012548,-0.018736,-0.037392,-0.029707,-0.029542,-0.040864


In [5]:
# 0/1 attrition indicator, reused as the second variable in every test below.
att = df.Attrition

## Correlation Between Monthly_income and Attrition

In [7]:
# Monthly income column, tested against attrition in the next cell.
mon_income = df["MonthlyIncome"]

In [8]:
# Point-biserial correlation between MonthlyIncome and Attrition;
# the result unpacks to (coefficient, p-value).
pbc = pointbiserialr(x=mon_income, y=att)

In [9]:
# Report the coefficient and p-value for MonthlyIncome vs. Attrition.
income_r, income_p = pbc
print(f'PointSerial Corelation B/W Monthly_income and Attrition : {income_r} and p_value : {income_p}')

PointSerial Corelation B/W Monthly_income and Attrition : -0.031176281698114025 and p_value : 0.0384274849060192


<ol>
<li>Correlation: negative, but very weak (r ≈ -0.031)</li>
<li>p_value < 0.05, so H0 (no correlation) is rejected</li>
</ol>

## Correlation Between Distance_from_Home and Attrition

In [10]:
# Distance-from-home column, tested against attrition in the next cell.
dis_from_home = df["DistanceFromHome"]

In [11]:
# Point-biserial correlation between DistanceFromHome and Attrition;
# the result unpacks to (coefficient, p-value).
pbc1 = pointbiserialr(x=dis_from_home, y=att)

In [12]:
# Report the coefficient and p-value for DistanceFromHome vs. Attrition.
distance_r, distance_p = pbc1
print(f'PointSerial Corelation B/W DistanceFromHome and Attrition : {distance_r} and p_value : {distance_p}')

PointSerial Corelation B/W DistanceFromHome and Attrition : -0.009730141010179438 and p_value : 0.5182860428049617


<ol>
<li>Correlation: negative, but very weak (r ≈ -0.010)</li>
<li>p_value > 0.05, so H0 (no correlation) is not rejected</li>
</ol>

## Correlation Between Education and Attrition

In [13]:
# Education level column, tested against attrition in the next cell.
edu_caution = df["Education"]

In [14]:
# Point-biserial correlation between Education and Attrition;
# the result unpacks to (coefficient, p-value).
pbc2 = pointbiserialr(x=edu_caution, y=att)

In [15]:
# Report the coefficient and p-value for Education vs. Attrition.
education_r, education_p = pbc2
print(f'PointSerial Corelation B/W Education and Attrition : {education_r} and p_value : {education_p}')

PointSerial Corelation B/W Education and Attrition : -0.015111167710968737 and p_value : 0.3157293177118575


<ol>
<li>Correlation: negative, but very weak (r ≈ -0.015)</li>
<li>p_value ≈ 0.316 > 0.05, so H0 (no correlation) is not rejected</li>
</ol>

## Correlation Between Job_Level and Attrition

In [16]:
# Job level column, tested against attrition in the next cell.
job_level = df["JobLevel"]

In [17]:
# Point-biserial correlation between JobLevel and Attrition;
# the result unpacks to (coefficient, p-value).
pbc3 = pointbiserialr(x=job_level, y=att)

In [18]:
# Report the coefficient and p-value for JobLevel vs. Attrition.
job_level_r, job_level_p = pbc3
print(f'PointSerial Corelation B/W Job_Level and Attrition : {job_level_r} and p_value : {job_level_p}')

PointSerial Corelation B/W Job_Level and Attrition : -0.010289713287495117 and p_value : 0.49451717271828405


<ol>
<li>Correlation: negative, but very weak (r ≈ -0.010)</li>
<li>p_value ≈ 0.495 > 0.05, so H0 (no correlation) is not rejected</li>
</ol>