In [51]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from statistics import mean
from apriori_python import apriori

# Load the training CSV into the working DataFrame used by every later cell.
TRAIN_CSV_PATH = "./Dataset/Train_HR_Employee_Attrition.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [52]:
#SETTING TRAINING SET (as executed in DataUnderstanding-attrition)


#MISSING VALUES

# Fill these columns with their own mode (most frequent value).
for mode_filled_column in ['Gender', 'BusinessTravel', 'PerformanceRating',
                           'TrainingTimesLastYear']:
    df[mode_filled_column] = df[mode_filled_column].fillna(
        df[mode_filled_column].mode()[0])

# Fill YearsAtCompany with its median.
df['YearsAtCompany'] = df['YearsAtCompany'].fillna(df['YearsAtCompany'].median())

# Fill MonthlyIncome with the mean of the row's YearsAtCompany quartile.
# transform('mean') returns a Series aligned to the original row index, so it
# can be assigned straight back. groupby.apply is not safe here: recent pandas
# versions prepend the group keys to the index of transform-like results,
# which breaks the assignment.
# observed=True only skips empty categories; it does not change any fill value.
bins = pd.qcut(df['YearsAtCompany'], [0, 0.25, 0.50, 0.75, 1.0])
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(
    df['MonthlyIncome'].groupby(bins, observed=True).transform('mean'))

# Fill Age with the mean of the row's MonthlyIncome quartile (same technique).
bins = pd.qcut(df['MonthlyIncome'], [0, 0.25, 0.50, 0.75, 1.0])
df['Age'] = df['Age'].fillna(
    df['Age'].groupby(bins, observed=True).transform('mean'))


#OUTLIERS
# Store the natural log of MonthlyIncome in a new column; the raw column is
# dropped at the end of this section.
df["MonthlyIncomeTrans"] = np.log(df['MonthlyIncome'])


#DIMENSIONALITY REDUCTION
# Replace the three tenure-related columns with their row-wise arithmetic mean.
# The vectorised mean yields a float64 column directly, instead of an object
# column filled one row at a time. skipna=False makes a missing input produce
# NaN rather than a mean over the remaining values.
years_columns = ['YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
df['YearsMean'] = df[years_columns].mean(axis=1, skipna=False)

# `columns=` already selects the axis, so no separate `axis` argument is needed.
df = df.drop(columns=['MonthlyIncome', *years_columns, 'Over18', 'StandardHours'])

In [53]:
# Print each column label of df on its own line.
for column_name in list(df.columns):
    print(column_name)

Age
Attrition
BusinessTravel
DailyRate
Department
DistanceFromHome
Education
EducationField
EnvironmentSatisfaction
Gender
HourlyRate
JobInvolvement
JobLevel
JobRole
JobSatisfaction
MaritalStatus
MonthlyRate
NumCompaniesWorked
OverTime
PercentSalaryHike
PerformanceRating
RelationshipSatisfaction
StockOptionLevel
TotalWorkingYears
TrainingTimesLastYear
WorkLifeBalance
YearsAtCompany
MonthlyIncomeTrans
YearsMean


In [60]:
# Display the TotalWorkingYears column.
# NOTE(review): the stored output shows interval categories, but no active line
# in the visible cells bins this column — confirm which cell produced that state.
df.loc[:, 'TotalWorkingYears']

0        (17.0, 40.0]
1        (17.0, 40.0]
2          (5.0, 8.0]
3        (10.0, 17.0]
4       (-0.001, 5.0]
            ...      
1171    (-0.001, 5.0]
1172     (10.0, 17.0]
1173    (-0.001, 5.0]
1174     (10.0, 17.0]
1175    (-0.001, 5.0]
Name: TotalWorkingYears, Length: 1176, dtype: category
Categories (5, interval[float64]): [(-0.001, 5.0] < (5.0, 8.0] < (8.0, 10.0] < (10.0, 17.0] < (17.0, 40.0]]

In [55]:
# Numeric attributes
# NOTE(review): this list still names the four columns dropped at the end of
# this cell (see to_remove) — confirm whether later cells index df with it.
num_attr= ['Age','DailyRate','DistanceFromHome','HourlyRate','MonthlyIncomeTrans','MonthlyRate','NumCompaniesWorked',
           'PercentSalaryHike','TotalWorkingYears','TrainingTimesLastYear','YearsAtCompany','YearsMean',]

# Categorical attributes
cat_attr = ['Attrition','BusinessTravel','Department','Education','EducationField','EnvironmentSatisfaction',
            'Gender','JobInvolvement','JobLevel','JobRole','JobSatisfaction','MaritalStatus','OverTime',
            'PerformanceRating','RelationshipSatisfaction','StockOptionLevel','WorkLifeBalance']

# Columns to discard from the working frame.
to_remove = ['DailyRate', 'MonthlyRate', 'HourlyRate', 'PercentSalaryHike']
# Rebind df instead of using inplace=True, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=to_remove, errors='ignore')

In [62]:
# Quartile-bin the remaining numeric columns into interval categories.
#
# Disabled binnings kept for reference.
# NOTE(review): stored outputs elsewhere in this notebook show Age and
# TotalWorkingYears as intervals, so some of these ran in an earlier
# execution — confirm which should be active.
#df['Age'] = pd.qcut(df['Age'], q=4)
#df['DistanceFromHome'] = pd.qcut(df['DistanceFromHome'], q=4)
#df['MonthlyIncomeTrans'] = pd.qcut(df['MonthlyIncomeTrans'], q=4)
#df['NumCompaniesWorked'] = pd.qcut(df['NumCompaniesWorked'], q=4)
#df['TotalWorkingYears'] = pd.qcut(df['TotalWorkingYears'], q=4, duplicates = 'drop')

# Each entry is (column, extra qcut keyword arguments). Only
# TrainingTimesLastYear passes duplicates='drop', which merges repeated
# quantile edges instead of raising.
for binned_column, qcut_options in (
    ('TrainingTimesLastYear', {'duplicates': 'drop'}),
    ('YearsAtCompany', {}),
    ('YearsMean', {}),
):
    df[binned_column] = pd.qcut(df[binned_column], q=4, **qcut_options)

In [23]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,MonthlyIncomeTrans,YearsMean
0,"(43.0, 60.0]",No,Travel_Rarely,593,Research & Development,9,4,Medical,2,Male,...,17,3.0,3,0,20,2.0,2,8.0,9.041448,7.33333
1,"(31.0, 36.0]",No,Travel_Rarely,1218,Research & Development,1,1,Life Sciences,2,Male,...,14,3.0,3,1,21,3.0,3,1.0,8.48343,5.0
2,"(31.0, 36.0]",No,Travel_Frequently,530,Sales,16,3,Life Sciences,3,Male,...,25,3.0,3,1,7,4.0,3,1.0,8.469682,2.66667
3,"(31.0, 36.0]",No,Travel_Rarely,953,Research & Development,5,4,Technical Degree,2,Male,...,14,3.0,2,0,12,1.0,3,3.0,7.937017,4.66667
4,"(36.0, 43.0]",No,Travel_Rarely,1380,Research & Development,9,2,Life Sciences,3,Female,...,12,3.0,3,0,2,2.0,3,10.0,8.984443,1.66667
