# Data Dictionary
**Variable**: Meaning (levels, where applicable)<br>
**Age**: Age of the employee<br>
**Attrition**: Whether the employee left in the previous year or not<br>
**BusinessTravel**: How frequently the employee travelled for business purposes in the last year<br>
**Department**: Department in company<br>
**DistanceFromHome**: Distance from home in km<br>
**Education**: Education level<br>
        1 'Below College'
        2 'College'
        3 'Bachelor'
        4 'Master'
        5 'Doctor'<br>
**EducationField**: Field of education<br>
**EmployeeCount**: Employee count<br>
**EmployeeNumber**: Employee number/id<br>
**EnvironmentSatisfaction**: Work environment satisfaction level<br>
        1 'Low'
        2 'Medium'
        3 'High'
        4 'Very High'<br>
**Gender**: Gender of employee<br>
**JobInvolvement**: Job involvement level<br>
        1 'Low'
        2 'Medium'
        3 'High'
        4 'Very High'<br>
**JobLevel**: Job level at company on a scale of 1 to 5<br>
**JobRole**: Name of job role in company<br>
**JobSatisfaction**: Job satisfaction level<br>
        1 'Low'
        2 'Medium'
        3 'High'
        4 'Very High'<br>
**MaritalStatus**: Marital status of the employee<br>
**MonthlyIncome**: Monthly income in rupees<br>
**NumCompaniesWorked**: Total number of companies the employee has worked for<br>
**Over18**: Whether the employee is above 18 years of age or not<br>
**PercentSalaryHike**: Percent salary hike for last year<br>
**PerformanceRating**: Performance rating for last year<br>
        1 'Low'
        2 'Good'
        3 'Excellent'
        4 'Outstanding'<br>
**RelationshipSatisfaction**: Relationship satisfaction level<br>
        1 'Low'
        2 'Medium'
        3 'High'
        4 'Very High'<br>
**StandardHours**: Standard hours of work for the employee<br>
**StockOptionLevel**: Stock option level of the employee<br>
**TotalWorkingYears**: Total number of years the employee has worked so far<br>
**TrainingTimesLastYear**: Number of times training was conducted for this employee last year<br>
**WorkLifeBalance**: Work-life balance level<br>
        1 'Bad'
        2 'Good'
        3 'Better'
        4 'Best'<br>
**YearsAtCompany**: Total number of years spent at the company by the employee<br>
**YearsSinceLastPromotion**: Number of years since last promotion<br>
**YearsWithCurrManager**: Number of years under current manager

In [1]:
# Common imports
import sqlite3
import pandas as pd
import numpy as np
import os
import seaborn as sns
# Apply seaborn's default plotting theme to the figures drawn below.
sns.set()
# to make this notebook's output stable across runs
# (seeds NumPy's global random number generator)
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for every figure: axis labels 14, tick labels 12.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
# `message` is a regular expression matched against the start of the warning
# text, so only warnings beginning with "internal gelsd" are silenced.
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
def createFolder(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    This is a best-effort helper: a failure is reported on stdout instead of
    being raised, so the caller continues either way.

    Args:
        directory: Path of the folder to create, as a string.

    Returns:
        None.
    """
    try:
        # exist_ok=True makes the call idempotent without a separate
        # os.path.exists() check. That check could race with another process
        # creating the directory, and it also hid the case where the path
        # exists but is a regular file; that case is now reported below.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

In [3]:
# Open the SQLite database that holds the HR tables.
# NOTE(review): `conn` is never closed in the cells visible here -- confirm
# whether later cells still need it before adding a conn.close().
conn = sqlite3.connect("data/hr_data.db")

# One query builds the modelling table:
#   * joins general_data, employee_survey_data and manager_survey_data on
#     EmployeeID (so `select *` returns EmployeeID once per joined table),
#   * adds `Attr`, which is 1 when Attrition = 'Yes' and 0 otherwise,
#   * keeps only rows where EnvironmentSatisfaction, JobSatisfaction,
#     WorkLifeBalance, TotalWorkingYears and NumCompaniesWorked are not 'NA'.
# Earlier drafts of this cell also read each table on its own (including
# in_time and out_time) and ran the complementary query to inspect the rows
# removed by the 'NA' filter.
attrition_query = "select *, Case when Attrition ='Yes' then 1 else 0 end as 'Attr' from general_data g join employee_survey_data e on g.EmployeeID=e.EmployeeID join manager_survey_data m on g.EmployeeID=m.EmployeeID Where EnvironmentSatisfaction Not Like 'NA' AND JobSatisfaction Not Like 'NA' AND WorkLifeBalance Not Like 'NA' AND TotalWorkingYears Not Like 'NA' AND NumCompaniesWorked Not Like 'NA'"
all_data = pd.read_sql_query(sql=attrition_query, con=conn)


In [4]:
# Cast the three employee-survey ratings to 64-bit integers.
# Presumably they arrive as text because missing answers are stored as the
# string 'NA' in the database -- verify against the table schema.
for survey_column in ('JobSatisfaction', 'WorkLifeBalance', 'EnvironmentSatisfaction'):
    all_data[survey_column] = all_data[survey_column].astype(np.int64)

In [5]:
# Remove the columns that are not used by the analysis. Dropping by label
# removes every column carrying that name.
unused_columns = ['EmployeeCount', 'EmployeeID', 'StandardHours']
all_data = all_data.drop(columns=unused_columns)

In [6]:
# Keep the first occurrence of each fully identical row. The surviving rows
# keep their original index labels (the index is not reset).
all_data = all_data.drop_duplicates(keep='first')

## Get Dummies

In [7]:
# Columns that are already numeric (counts, amounts and ordinal ratings)
# plus the 0/1 target `Attr`; these need no encoding.
numeric_columns = [
    'Attr',
    'Age',
    'DistanceFromHome',
    'Education',
    'JobLevel',
    'MonthlyIncome',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'WorkLifeBalance',
    'JobInvolvement',
    'PerformanceRating',
]
# .copy() gives an independent frame, so later edits do not touch all_data.
hr_data_uc = all_data[numeric_columns].copy()
hr_data_uc.head()

Unnamed: 0,Attr,Age,DistanceFromHome,Education,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating
0,0,51,6,2,1,131160,1,11,0,1,6,1,0,0,3,4,2,3,3
1,1,31,10,1,1,41890,0,23,1,6,3,5,1,4,3,2,4,2,4
2,0,32,17,4,4,193280,1,15,3,5,2,5,0,3,2,2,1,3,3
3,0,38,2,5,3,83210,3,11,3,13,5,8,7,5,4,4,3,2,3
4,0,32,10,1,1,23420,4,12,2,9,2,6,0,4,4,1,3,3,3


In [8]:
# Text-valued columns that must be encoded before modelling.
categorical_columns = [
    'Attrition',
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
]
# Independent copy, so the encoding steps below leave all_data untouched.
hr_data_cat = all_data[categorical_columns].copy()
hr_data_cat.head()


Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus
0,No,Travel_Rarely,Sales,Life Sciences,Female,Healthcare Representative,Married
1,Yes,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Single
2,No,Travel_Frequently,Research & Development,Other,Male,Sales Executive,Married
3,No,Non-Travel,Research & Development,Life Sciences,Male,Human Resources,Married
4,No,Travel_Rarely,Research & Development,Medical,Male,Sales Executive,Single


In [9]:
# Replace the Yes/No attrition label with 1/0. The direct dict lookup means
# any value other than 'Yes' or 'No' raises KeyError, so re-running this
# cell on the already-encoded column fails.
Num_val = {'Yes': 1, 'No': 0}
hr_data_cat['Attrition'] = hr_data_cat["Attrition"].map(lambda answer: Num_val[answer])
hr_data_cat.head()

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus
0,0,Travel_Rarely,Sales,Life Sciences,Female,Healthcare Representative,Married
1,1,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Single
2,0,Travel_Frequently,Research & Development,Other,Male,Sales Executive,Married
3,0,Non-Travel,Research & Development,Life Sciences,Male,Human Resources,Married
4,0,Travel_Rarely,Research & Development,Medical,Male,Sales Executive,Single


In [10]:
# One-hot encode the remaining text columns; each becomes a set of
# `<column>_<value>` indicator columns. The numeric Attrition column is
# passed through unchanged.
hr_data_cat = pd.get_dummies(data=hr_data_cat)
hr_data_cat.head()

Unnamed: 0,Attrition,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,0,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [11]:
# Put the numeric columns and the one-hot encoded columns side by side,
# aligned on the row index. The result carries both `Attr` and `Attrition`,
# two encodings of the same target.
hr_data_final = pd.concat(objs=[hr_data_uc, hr_data_cat], axis="columns")
hr_data_final.head()

Unnamed: 0,Attr,Age,DistanceFromHome,Education,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,0,51,6,2,1,131160,1,11,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,31,10,1,1,41890,0,23,1,6,...,0,0,0,0,1,0,0,0,0,1
2,0,32,17,4,4,193280,1,15,3,5,...,0,0,0,0,0,1,0,0,1,0
3,0,38,2,5,3,83210,3,11,3,13,...,0,0,0,0,0,0,0,0,1,0
4,0,32,10,1,1,23420,4,12,2,9,...,0,0,0,0,0,1,0,0,0,1


### Logistic Regression Model 1

In [12]:
# First test: pick the feature subset for model 1.

# Text-valued (object) columns, which cannot be fed to the model as-is.
object_columns = [
    'BusinessTravel',
    'Department',
    'Attrition',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
]

# Numeric columns left out of this experiment. The trailing notes are the
# observations recorded while toggling each column.
excluded_numeric_columns = [
    'Age',                      # lowers accuracy
    'Education',                # test value
    'MonthlyIncome',            # test value .001
    'NumCompaniesWorked',       # test value
    'PercentSalaryHike',        # test value
    'YearsSinceLastPromotion',  # removing improves accuracy
    'YearsAtCompany',           # minimal change when used with YearsSinceLastPromotion
]

# Columns kept as features (with the notes recorded for them):
#   DistanceFromHome (lowers accuracy), TotalWorkingYears,
#   YearsWithCurrManager (does not help), TrainingTimesLastYear (does not help),
#   JobLevel, StockOptionLevel, EnvironmentSatisfaction, JobSatisfaction,
#   WorkLifeBalance, JobInvolvement, PerformanceRating.
# `Attr` is also kept; it is the target.
filtered = all_data.drop(columns=object_columns + excluded_numeric_columns)
filtered.head()

Unnamed: 0,DistanceFromHome,JobLevel,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating,Attr
0,6,1,0,1,6,0,3,4,2,3,3,0
1,10,1,1,6,3,4,3,2,4,2,4,1
2,17,4,3,5,2,3,2,2,1,3,3,0
3,2,3,3,13,5,5,4,4,3,2,3,0
4,10,1,2,9,2,4,4,1,3,3,3,0


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Separate the target (`Attr`) from the feature columns.
y = filtered["Attr"]
X = filtered.drop(columns="Attr")

print(X.shape, y.shape)

# Stratified split with a fixed seed, so both parts keep the same share of
# leavers and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Logistic regression with scikit-learn's default settings.
classifier = LogisticRegression().fit(X_train, y_train)

# score() reports mean accuracy on the given data.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

(1470, 11) (1470,)
Training Data Score: 0.8393829401088929
Testing Data Score: 0.8369565217391305




## Logistic Regression based on correlation (83.9% accuracy)

In [15]:
# New test: feature subset chosen from the correlation analysis.

# Text-valued (object) columns, which cannot be fed to the model as-is.
object_columns_1 = [
    'BusinessTravel',
    'Department',
    'Attrition',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
]

# Numeric columns left out of this experiment. The trailing notes are the
# observations recorded while toggling each column.
excluded_numeric_columns_1 = [
    'Age',                # lowers accuracy
    'DistanceFromHome',   # lowers accuracy
    'Education',          # test value
    'MonthlyIncome',      # test value .001
    'PercentSalaryHike',  # test value
    'JobLevel',           # test value
    'StockOptionLevel',   # test value
    'WorkLifeBalance',    # test value
    'JobInvolvement',     # test value
    'PerformanceRating',  # test value
]

# Columns kept as features (with the notes recorded for them):
#   NumCompaniesWorked, TotalWorkingYears,
#   YearsWithCurrManager (does not help),
#   YearsSinceLastPromotion (leaving it in improves accuracy),
#   YearsAtCompany (minimal improvement together with YearsSinceLastPromotion),
#   TrainingTimesLastYear (does not help),
#   EnvironmentSatisfaction, JobSatisfaction.
# `Attr` is also kept; it is the target.
filtered1 = all_data.drop(columns=object_columns_1 + excluded_numeric_columns_1)

filtered1.head()

Unnamed: 0,NumCompaniesWorked,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,Attr
0,1,1,6,1,0,0,3,4,0
1,0,6,3,5,1,4,3,2,1
2,1,5,2,5,0,3,2,2,0
3,3,13,5,8,7,5,4,4,0
4,4,9,2,6,0,4,4,1,0


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features and target for the correlation-based column subset.
X1 = filtered1.drop("Attr", axis=1)
y1 = filtered1["Attr"]

# Stratify on this cell's own target (y1) rather than on a target variable
# left over from another cell, so the cell runs on a fresh kernel and the
# stratification always matches the rows being split.
# NOTE: X_train / X_test / y_train / y_test are reassigned here, replacing
# any split of the same names made earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=1, stratify=y1)

# Logistic regression with scikit-learn's default settings.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# score() reports mean accuracy on the given data.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8393829401088929
Testing Data Score: 0.8396739130434783




### Logistic Regression Model with all columns using get_dummies info

In [17]:
# Feature subset for the model built on the one-hot encoded table.

# `Attrition` duplicates the target `Attr`; the rest are numeric columns
# left out of this experiment (trailing notes are the recorded observations).
excluded_numeric_columns_2 = [
    'Attrition',
    'Age',                 # lowers accuracy
    'Education',           # test value
    'NumCompaniesWorked',  # test value
    'JobLevel',            # test value
    'StockOptionLevel',    # test value
    'JobInvolvement',      # test value
    'PerformanceRating',   # test value
]

# One-hot groups removed entirely: BusinessTravel, EducationField, Gender
# and JobRole.
excluded_dummy_columns_2 = [
    'BusinessTravel_Non-Travel',
    'BusinessTravel_Travel_Frequently',
    'BusinessTravel_Travel_Rarely',
    'EducationField_Human Resources',
    'EducationField_Life Sciences',
    'EducationField_Marketing',
    'EducationField_Medical',
    'EducationField_Other',
    'EducationField_Technical Degree',
    'Gender_Female',
    'Gender_Male',
    'JobRole_Healthcare Representative',
    'JobRole_Human Resources',
    'JobRole_Laboratory Technician',
    'JobRole_Manager',
    'JobRole_Manufacturing Director',
    'JobRole_Research Director',
    'JobRole_Research Scientist',
    'JobRole_Sales Executive',
    'JobRole_Sales Representative',
]

# Columns kept: Attr (target), DistanceFromHome, MonthlyIncome,
# PercentSalaryHike, TotalWorkingYears, YearsWithCurrManager,
# YearsSinceLastPromotion, YearsAtCompany, TrainingTimesLastYear,
# EnvironmentSatisfaction, JobSatisfaction, WorkLifeBalance, and the
# Department_* and MaritalStatus_* indicator columns.
# drop() returns a new frame, so hr_data_final itself is left unchanged.
filtered2 = hr_data_final.drop(
    columns=excluded_numeric_columns_2 + excluded_dummy_columns_2
)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features and target for the one-hot encoded column subset.
X2 = filtered2.drop("Attr", axis=1)
y2 = filtered2["Attr"]

# Stratify on this cell's own target (y2) rather than on a target variable
# left over from another cell, so the cell runs on a fresh kernel and the
# stratification always matches the rows being split.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=1, stratify=y2)

# Logistic regression with scikit-learn's default settings.
classifier = LogisticRegression()
classifier.fit(X2_train, y2_train)

# score() reports mean accuracy on the given data.
print(f"Training Data Score: {classifier.score(X2_train, y2_train)}")
print(f"Testing Data Score: {classifier.score(X2_test, y2_test)}")

Training Data Score: 0.838475499092559
Testing Data Score: 0.8396739130434783


