In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, \
                                          classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from scipy.stats import loguniform
import re 

# Preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Algorithms
from sklearn.linear_model import LogisticRegression

# Dealing with warnings
import warnings
warnings.filterwarnings('ignore')

# Setting DataFrame's to show 100 max columns, instead of compressing then
pd.set_option('display.max_columns', 100)

In [2]:
people = pd.read_csv('../raw_data/people_analytics.csv')

In [3]:
people.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [4]:
def camel_case_split(str): 
    return '_'.join(re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', str))

new_columns = []
for str in people.columns.tolist():
    new_columns.append(camel_case_split(str).lower())
    
people.columns = new_columns

In [5]:
people.head()

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_count,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [6]:
people.drop(columns = ['employee_count', 'over', 'standard_hours', 'employee_number'], inplace = True)

In [7]:
print('Dataset shape: ', people.shape)
people.head()

Dataset shape:  (1470, 31)


Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Yes,11,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,No,23,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Yes,15,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Yes,11,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,No,12,3,4,1,6,3,3,2,2,2,2


In [8]:
class IncomeBlwDptJLAvg(BaseEstimator, TransformerMixin):
    '''
    Searches for employee department and job level and classify its monthly income
    as below his/hers department average for his/hers specific job level.
    '''
    
    def __init__(self):
        pass
        
    def fit(self, X, y = None):
        return self
        
    def transform(self, X, y = None):
        assert isinstance(X, pd.DataFrame)
        
        _minc_gb = X.groupby(['department', 'job_level'])['monthly_income'].median()
        
        departments = X['department'].unique().tolist()
        job_levels = X['job_level'].unique().tolist()
        
        for department in departments:
            for job_level in job_levels:
                X.loc[((X['department'] == department) & \
                       (X['job_level'] == job_level) & \
                       (X['monthly_income'] < _minc_gb[department, job_level])), \
                          'below_median_dpt_joblevel_monthly_income'] = 1

                X.loc[((X['department'] == department) & \
                       (X['job_level'] == job_level) & \
                       (X['monthly_income'] >= _minc_gb[department, job_level])), \
                          'below_median_dpt_joblevel_monthly_income'] = 0
                
                X['below_median_dpt_joblevel_monthly_income'].astype(int)
                
        return X['below_median_dpt_joblevel_monthly_income'].sample(5)

In [12]:
income_blw_avg = IncomeBlwDptJLAvg()
income_blw_avg.fit_transform(people)

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [13]:
class EduFieldJobRole(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        assert isinstance(X, pd.DataFrame)
        
        X['edu_field_job_role'] = X['education_field'] + '_' + X['job_role']
        
        edu_field_job_role_map = { 
         'Human Resources_Human Resources': 0,
         'Human Resources_Manager': 0,
         'Life Sciences_Healthcare Representative': 0,
         'Life Sciences_Human Resources': 1,
         'Life Sciences_Laboratory Technician': 0,
         'Life Sciences_Manager': 1,
         'Life Sciences_Manufacturing Director': 0,
         'Life Sciences_Research Director': 0,
         'Life Sciences_Research Scientist': 0,
         'Life Sciences_Sales Executive': 1,
         'Life Sciences_Sales Representative': 1,
         'Marketing_Manager': 0,
         'Marketing_Sales Executive': 0,
         'Marketing_Sales Representative': 0,
         'Medical_Healthcare Representative': 0,
         'Medical_Human Resources': 1,
         'Medical_Laboratory Technician': 0,
         'Medical_Manager': 1,
         'Medical_Manufacturing Director': 0,
         'Medical_Research Director': 0,
         'Medical_Research Scientist': 0,
         'Medical_Sales Executive': 1,
         'Medical_Sales Representative': 1,
         'Other_Healthcare Representative': 0,
         'Other_Human Resources': 0,
         'Other_Laboratory Technician': 0,
         'Other_Manager': 0,
         'Other_Manufacturing Director': 0,
         'Other_Research Director': 0,
         'Other_Research Scientist': 0,
         'Other_Sales Executive': 0,
         'Other_Sales Representative': 0,
         'Technical Degree_Healthcare Representative': 0,
         'Technical Degree_Human Resources': 0,
         'Technical Degree_Laboratory Technician': 0,
         'Technical Degree_Manager': 0,
         'Technical Degree_Manufacturing Director': 0,
         'Technical Degree_Research Director': 0,
         'Technical Degree_Research Scientist': 0,
         'Technical Degree_Sales Executive': 0,
         'Technical Degree_Sales Representative': 0
        }
        
        X['job_role_diff_edu_field'] = X['edu_field_job_role'].map(edu_field_job_role_map)
        
        X.drop('edu_field_job_role', axis = 1, inplace = True)
        
        return X['job_role_diff_edu_field'].sample(5)

In [15]:
EduFieldJobRole().fit_transform(people)
people

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,below_median_dpt_joblevel_monthly_income,job_role_diff_edu_field
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Yes,11,3,1,0,8,0,1,6,4,0,5,0.0,1
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,No,23,4,4,1,10,3,3,10,7,1,7,,0
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Yes,15,3,2,0,7,3,3,0,0,0,0,,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Yes,11,3,3,0,8,3,3,8,7,3,0,,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,No,12,3,4,1,6,3,3,2,2,2,2,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,41,4,2,Laboratory Technician,4,Married,2571,12290,4,No,17,3,3,1,17,3,3,5,2,0,3,,0
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,42,2,3,Healthcare Representative,1,Married,9991,21457,4,No,15,3,1,1,9,5,3,7,7,1,7,,0
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,87,4,2,Manufacturing Director,2,Married,6142,5174,1,Yes,20,4,2,1,6,0,3,6,2,0,3,,0
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,63,2,2,Sales Executive,2,Married,5390,13243,2,No,14,3,4,0,17,3,2,9,6,0,8,1.0,1


In [16]:
class PromotedLastTwoYears(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        
        X['promoted_last_2_years'] = \
        X['years_since_last_promotion'].apply(lambda x: 1 if x <= 2 else 0)
        
        return X['promoted_last_2_years'].sample(5)

In [18]:
PromotedLastTwoYears().fit_transform(people)
people

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,below_median_dpt_joblevel_monthly_income,job_role_diff_edu_field,promoted_last_2_years
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Yes,11,3,1,0,8,0,1,6,4,0,5,0.0,1,1
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,No,23,4,4,1,10,3,3,10,7,1,7,,0,1
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Yes,15,3,2,0,7,3,3,0,0,0,0,,0,1
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Yes,11,3,3,0,8,3,3,8,7,3,0,,0,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,No,12,3,4,1,6,3,3,2,2,2,2,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,41,4,2,Laboratory Technician,4,Married,2571,12290,4,No,17,3,3,1,17,3,3,5,2,0,3,,0,1
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,42,2,3,Healthcare Representative,1,Married,9991,21457,4,No,15,3,1,1,9,5,3,7,7,1,7,,0,1
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,87,4,2,Manufacturing Director,2,Married,6142,5174,1,Yes,20,4,2,1,6,0,3,6,2,0,3,,0,1
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,63,2,2,Sales Executive,2,Married,5390,13243,2,No,14,3,4,0,17,3,2,9,6,0,8,1.0,1,1


In [19]:
class IncomePerYearsWorked(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        assert isinstance(X, pd.DataFrame)
        
        total_workin_years_min = X[X['total_working_years'] == 0]['monthly_income'].min()
        
        def get_income_per_years_worked(x):
            if x == 0:
                X['m_income_per_total_years_worked'] = total_workin_years_min
            else:
                X['m_income_per_total_years_worked'] = \
                        X['monthly_income'] / X['total_working_years']
                
        X['total_working_years'].apply(get_income_per_years_worked)
        
        X[X['m_income_per_total_years_worked'] == np.inf]['m_income_per_total_years_worked'] = 1
        X['m_income_per_total_years_worked'].replace(np.inf, total_workin_years_min, inplace = True)
        
        return X['m_income_per_total_years_worked'].sample(5)

In [22]:
IncomePerYearsWorked().fit_transform(people)
people

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,below_median_dpt_joblevel_monthly_income,job_role_diff_edu_field,promoted_last_2_years,m_income_per_total_years_worked
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Yes,11,3,1,0,8,0,1,6,4,0,5,0.0,1,1,749.125000
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,No,23,4,4,1,10,3,3,10,7,1,7,,0,1,513.000000
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Yes,15,3,2,0,7,3,3,0,0,0,0,,0,1,298.571429
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Yes,11,3,3,0,8,3,3,8,7,3,0,,0,0,363.625000
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,No,12,3,4,1,6,3,3,2,2,2,2,,0,1,578.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,41,4,2,Laboratory Technician,4,Married,2571,12290,4,No,17,3,3,1,17,3,3,5,2,0,3,,0,1,151.235294
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,42,2,3,Healthcare Representative,1,Married,9991,21457,4,No,15,3,1,1,9,5,3,7,7,1,7,,0,1,1110.111111
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,87,4,2,Manufacturing Director,2,Married,6142,5174,1,Yes,20,4,2,1,6,0,3,6,2,0,3,,0,1,1023.666667
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,63,2,2,Sales Executive,2,Married,5390,13243,2,No,14,3,4,0,17,3,2,9,6,0,8,1.0,1,1,317.058824


In [None]:
class IncomePerAge(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        assert isinstance(X, pd.DataFrame)
        X['m_income_per_age'] = X['monthly_income'] / X['age']
        
        return X['m_income_per_age'].sample(5)

In [None]:
class SalaryHikeBelowMedian(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        assert isinstance(X, pd.DataFrame)
        
        salary_hike_median = X['percent_salary_hike'].median()

        X['below_median_pct_salary_hike'] = \
                X['percent_salary_hike'].apply(lambda x: 1 if x < salary_hike_median else 0)
        
        return X['below_median_pct_salary_hike'].sample(5)

In [None]:
salary_hike = SalaryHikeBelowMedian()
salary_hike.fit_transform(people)

In [None]:
# class FormatDatset(BaseEstimator, TransformerMixin):
    
#     def __init__(self):
#         pass
    
#     def fit(self, X, y = None):
#         return self
    
#     def transform(self, X, y = None):
#         assert isinstance(X, pd.DataFrame)
        
        

In [None]:
class MapBooleans(BaseEstimator, TransformerMixin):
        
    def __init__(self):
        pass
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        assert isinstance(X, pd.DataFrame)
        
#         y = y.map({'No': 0, 'Yes': 1}) # COME BACK TO THIS IN THE FUTURE!!!

        X['gender_male'] = X['gender'].map({'Female': 0, 'Male': 1})

        X['over_time'] = X['over_time'].map({'No': 0, 'Yes': 1})
        
        return X[['attrition', 'gender_male', 'over_time']].sample(5)

In [None]:
map_bools = MapBooleans()
map_bools.fit_transform(people.drop(columns='attrition'), people['attrition'])

In [None]:
people