In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate

import warnings
warnings.filterwarnings('ignore') # Silence all warnings (deprecation warnings, etc.) for the rest of the notebook.

pd.set_option('display.max_columns', None) # Show every column of the dataset, without '...' in between.

In [2]:
# Load the cleaned employee dataset and preview its first three rows.
people = pd.read_csv(filepath_or_buffer='../raw_data/people_clean.csv')
people.head(n=3)

Unnamed: 0,employee_id,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager
0,1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,4,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0


PROJECT STEPS:

1. Check variable type.
2. Check duplicate values in `employee_id`.

In [3]:
# Summarize every column's dtype and non-null count.
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
employee_id                   1470 non-null int64
age                           1470 non-null int64
attrition                     1470 non-null object
business_travel               1470 non-null object
daily_rate                    1470 non-null int64
department                    1470 non-null object
distance_from_home            1470 non-null int64
education                     1470 non-null int64
education_field               1470 non-null object
environment_satisfaction      1470 non-null int64
gender                        1470 non-null object
hourly_rate                   1470 non-null int64
job_involvement               1470 non-null int64
job_level                     1470 non-null int64
job_role                      1470 non-null object
job_satisfaction              1470 non-null int64
marital_status                1470 non-null object
monthly_income                1470 n

In [4]:
people['over'].unique() # Lists the distinct values of `over`; a column with a single value cannot help predict the target.

array(['Y'], dtype=object)

Dropping `over` and `standard_hours` from the dataset.

In [5]:
# Remove `over` together with `standard_hours`.
# NOTE(review): no cell visible here checks that `standard_hours` is constant — TODO confirm.
people = people.drop(columns=['over', 'standard_hours'])

Check duplicate values in `employee_id`.

In [6]:
people['employee_id'].duplicated().sum() # Number of repeated employee_id values; 0 means every id appears exactly once.

0

Investigating numeric features

In [7]:
# Names of all columns stored with a numeric dtype ("semi" because the list is
# filtered further before being treated as true numeric features).
semi_numeric = list(
    people.select_dtypes(include=np.number).columns
)
semi_numeric

['employee_id',
 'age',
 'daily_rate',
 'distance_from_home',
 'education',
 'environment_satisfaction',
 'hourly_rate',
 'job_involvement',
 'job_level',
 'job_satisfaction',
 'monthly_income',
 'monthly_rate',
 'num_companies_worked',
 'percent_salary_hike',
 'performance_rating',
 'relationship_satisfaction',
 'stock_option_level',
 'total_working_years',
 'training_times_last_year',
 'work_life_balance',
 'years_at_company',
 'years_in_current_role',
 'years_since_last_promotion',
 'years_with_curr_manager']

In [8]:
# Numeric-dtype columns that should NOT be treated as continuous features
# (an identifier plus rating/level-style codes, judging by their names).
non_numeric = [
     'employee_id',
     'education',
     'environment_satisfaction',
     'job_involvement',
     'job_level',
     'job_satisfaction',
     'performance_rating',
     'relationship_satisfaction',
     'stock_option_level',
     'work_life_balance'
]

# Keep every numeric-dtype column that is not excluded above. A plain filter
# states the intent directly and preserves the original column order, unlike
# the previous Counter (multiset) subtraction.
numeric_features = [column for column in semi_numeric if column not in non_numeric]
numeric_features

['age',
 'daily_rate',
 'distance_from_home',
 'hourly_rate',
 'monthly_income',
 'monthly_rate',
 'num_companies_worked',
 'percent_salary_hike',
 'total_working_years',
 'training_times_last_year',
 'years_at_company',
 'years_in_current_role',
 'years_since_last_promotion',
 'years_with_curr_manager']

In [9]:
# Disabled exploratory plot: one boxplot + distribution figure per entry of numeric_features.
# plt.figure(figsize = (10, 10))
# for i, feature in enumerate(numeric_features):
# #     for i in plt.subplot(2, 2, i + 1):
#     f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, 
#                                         gridspec_kw={"height_ratios": (.15, .85)})

#     sns.boxplot(people[feature], ax=ax_box)
#     sns.distplot(people[feature], ax=ax_hist)

#     ax_box.set(yticks=[])
#     sns.despine(ax=ax_hist)
#     sns.despine(ax=ax_box, left=True);

In [10]:
# Sanity check: number of rows whose age does not exceed total working years.
people['age'].le(people['total_working_years']).sum()

0

In [11]:
# Sanity check: number of rows with fewer total working years than years at the company.
people['total_working_years'].lt(people['years_at_company']).sum()

0

In [12]:
# Sanity check: number of rows with more years in the current role than total working years.
people['years_in_current_role'].gt(people['total_working_years']).sum()

0

In [13]:
# Sanity check: number of rows with more years in the current role than years at the company.
people['years_in_current_role'].gt(people['years_at_company']).sum()

0

In [14]:
# Show the remaining column names as a plain list.
list(people.columns)

['employee_id',
 'age',
 'attrition',
 'business_travel',
 'daily_rate',
 'department',
 'distance_from_home',
 'education',
 'education_field',
 'environment_satisfaction',
 'gender',
 'hourly_rate',
 'job_involvement',
 'job_level',
 'job_role',
 'job_satisfaction',
 'marital_status',
 'monthly_income',
 'monthly_rate',
 'num_companies_worked',
 'over_time',
 'percent_salary_hike',
 'performance_rating',
 'relationship_satisfaction',
 'stock_option_level',
 'total_working_years',
 'training_times_last_year',
 'work_life_balance',
 'years_at_company',
 'years_in_current_role',
 'years_since_last_promotion',
 'years_with_curr_manager']

In [15]:
# Columns earmarked for robust scaling (per the variable name).
# NOTE(review): no cell visible here reads this list; the "Robust Scaling" cell
# works on numeric_features instead — confirm which list is intended.
to_robust_scale = [
    'monthly_income', 'num_companies_worked',
    'total_working_years', 'training_times_last_year',
    'years_at_company', 'years_in_current_role',
    'years_since_last_promotion', 'years_with_curr_manager',
]

In [16]:
# Encode attrition as an integer flag: 'Yes' -> 1, 'No' -> 0.
# Series.map yields NaN for values missing from the dict, so run this once per load.
attrition_codes = {'Yes': 1, 'No': 0}
people['attrition'] = people['attrition'].map(attrition_codes)

In [17]:
# Encode over_time as an integer flag: 'Yes' -> 1, 'No' -> 0.
# Series.map yields NaN for values missing from the dict, so run this once per load.
over_time_codes = {'Yes': 1, 'No': 0}
people['over_time'] = people['over_time'].map(over_time_codes)

In [18]:
# Encode gender as an integer flag: 'Female' -> 1, 'Male' -> 0.
# Series.map yields NaN for values missing from the dict, so run this once per load.
gender_codes = {'Female': 1, 'Male': 0}
people['gender'] = people['gender'].map(gender_codes)

In [19]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
people_train, people_test = train_test_split(
    people,
    test_size=.3,
    random_state=0,
)

In [21]:
# Robust Scaling
# Fit the scaler on the training split only (so no test-set statistics leak
# into preprocessing), then apply the SAME fitted scaler to the test split.
# Previously only people_train was scaled, leaving people_test on the raw scale.
rob_scaler = RobustScaler()

# Work on explicit copies so the column assignments below write to independent
# frames rather than to slices produced by train_test_split.
people_train = people_train.copy()
people_test = people_test.copy()

people_train[numeric_features] = rob_scaler.fit_transform(people_train[numeric_features])
people_test[numeric_features] = rob_scaler.transform(people_test[numeric_features])

In [22]:
# One hot encoding categorical features
to_one_hot = [
    'business_travel',
    'department', 
    'education_field',
    'job_role',
    'marital_status'
]

In [23]:
# One-hot encode the training split, then encode the test split the same way.
# Previously only people_train was encoded, so people_test kept its raw string
# columns and ended up with a different schema.
people_train = pd.get_dummies(people_train, columns=to_one_hot)

# Align the test columns to the training columns: categories seen only in the
# training split become all-zero columns, categories seen only in the test
# split are dropped, and the column order matches people_train.
people_test = (
    pd.get_dummies(people_test, columns=to_one_hot)
    .reindex(columns=people_train.columns, fill_value=0)
)

In [24]:
# Write both splits next to the source CSV, keeping the header row and omitting the index.
for split_frame, csv_path in (
    (people_train, '../raw_data/people_train.csv'),
    (people_test, '../raw_data/people_test.csv'),
):
    split_frame.to_csv(csv_path, header=True, index=False)

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the feature matrix and label vector from the preprocessed training frame.
# `attrition` is the label being fitted; `employee_id` is excluded because it is
# listed among the non-numeric identifier/code columns earlier in the notebook.
# NOTE(review): this cell previously used X_train / y_train without any visible
# cell defining them (NameError on a fresh kernel). Carving a validation split
# out of people_train is an assumption about the intended workflow — confirm.
features = people_train.drop(columns=['attrition', 'employee_id'])
target = people_train['attrition']
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=.3, random_state=0
)

log_model = LogisticRegression()

log_model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted model on the validation data; X_val and y_val must already be defined by an earlier cell.
log_model.score(X_val, y_val)