In [149]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from statistics import mean
from apyori import apriori

df = pd.read_csv("./Dataset/Train_HR_Employee_Attrition.csv")

In [150]:
#SETTING TRAINING SET (as executed in DataUnderstanding-attrition)


#MISSING VALUES

#infer Gender by Gender mode
df['Gender']=df['Gender'].fillna(df['Gender'].mode()[0])

#infer BusinessTravel by BusinessTravel mode
df['BusinessTravel']=df['BusinessTravel'].fillna(df['BusinessTravel'].mode()[0])

#infer PerformanceRating by PerformanceRating mode
df['PerformanceRating'] = df['PerformanceRating'].fillna(df['PerformanceRating'].mode()[0])

#infer YearsAtCompany by YearsAtCompany mean
df['YearsAtCompany'] = df['YearsAtCompany'].fillna(df['YearsAtCompany'].median())

#infer MonthlyIncome by mean grouped by YearsAtCompany-Quartiles
bins = pd.qcut(df['YearsAtCompany'],[0,0.25,0.50,0.75,1.0])
df['MonthlyIncome'] = df['MonthlyIncome'].groupby(bins).apply(
                      lambda x: x.fillna(x.mean()))

#infer Age by mean grouped by MonthlyIncome-quartiles
bins = pd.qcut(df['MonthlyIncome'],[0,0.25,0.50,0.75,1.0])
df['Age'] = df['Age'].groupby(bins).apply(lambda x: x.fillna(x.mean()))

#TrainingTimesLastYear by fillna with TrainingTimesLastYear values probability
df['TrainingTimesLastYear'] = df['TrainingTimesLastYear'].fillna(df['TrainingTimesLastYear'].mode()[0])


#OUTLIERS
df["MonthlyIncomeTrans"] = np.log(df['MonthlyIncome'])


#DIENSIONALITY REDUCTION
df['YearsMean'] = None
for index, row in df.iterrows(): 
    df.loc[index, 'YearsMean'] = mean((row['YearsInCurrentRole'], row['YearsSinceLastPromotion'], row['YearsWithCurrManager']))

df = df.drop(columns = ['MonthlyIncome','YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager','Over18','StandardHours'], axis = 'columns')

In [151]:
for row in df.columns: 
    print(row)

Age
Attrition
BusinessTravel
DailyRate
Department
DistanceFromHome
Education
EducationField
EnvironmentSatisfaction
Gender
HourlyRate
JobInvolvement
JobLevel
JobRole
JobSatisfaction
MaritalStatus
MonthlyRate
NumCompaniesWorked
OverTime
PercentSalaryHike
PerformanceRating
RelationshipSatisfaction
StockOptionLevel
TotalWorkingYears
TrainingTimesLastYear
WorkLifeBalance
YearsAtCompany
MonthlyIncomeTrans
YearsMean


In [152]:
#attributi numerici
num_attr= ['Age','DailyRate','DistanceFromHome','HourlyRate','MonthlyIncomeTrans','MonthlyRate','NumCompaniesWorked',
           'PercentSalaryHike','TotalWorkingYears','TrainingTimesLastYear','YearsAtCompany','YearsMean',]

#attributi categorici
cat_attr = ['Attrition','BusinessTravel','Department','Education','EducationField','EnvironmentSatisfaction',
            'Gender','JobInvolvement','JobLevel','JobRole','JobSatisfaction','MaritalStatus','OverTime',
            'PerformanceRating','RelationshipSatisfaction','StockOptionLevel','WorkLifeBalance']

to_remove = ['DailyRate', 'MonthlyRate', 'HourlyRate', 'PercentSalaryHike']
df.drop(columns=to_remove, axis=1, inplace=True)

In [153]:
#Binning non-categorical attributes 
df['Age'] = pd.qcut(df['Age'], q=4)
df['DistanceFromHome'] = pd.qcut(df['DistanceFromHome'], q=4)
df['MonthlyIncomeTrans'] = pd.qcut(df['MonthlyIncomeTrans'], q=4)
df['NumCompaniesWorked'] = pd.qcut(df['NumCompaniesWorked'], q=4)
df['TotalWorkingYears'] = pd.qcut(df['TotalWorkingYears'], q=4, duplicates = 'drop')
df['TrainingTimesLastYear'] = pd.qcut(df['TrainingTimesLastYear'], q=4, duplicates='drop')
df['YearsAtCompany'] = pd.qcut(df['YearsAtCompany'], q=4)
df['YearsMean'] = pd.qcut(df['YearsMean'], q=4)

In [154]:
#Converting int values in string 
df['DistanceFromHome'] = 'DistanceFromHome:' + df['DistanceFromHome'].astype(str)
df['Education'] = 'Education:' + df['Education'].astype(str)
df['EnvironmentSatisfaction'] = 'EnvironmentSatisfaction:' + df['EnvironmentSatisfaction'].astype(str)
df['JobInvolvement'] = 'JobInvolvement:' + df['JobInvolvement'].astype(str)
df['JobLevel'] = 'JobLevel:' + df['JobLevel'].astype(str)
df['JobSatisfaction'] = 'JobSatisfaction:' + df['JobSatisfaction'].astype(str)
df['MonthlyIncomeTrans'] = 'MonthlyIncomeTrans:' + df['MonthlyIncomeTrans'].astype(str)
df['NumCompaniesWorked'] = 'NumCompaniesWorked:' + df['NumCompaniesWorked'].astype(str)
df['PerformanceRating'] = 'PerformanceRating:' + df['PerformanceRating'].astype(str)
df['RelationshipSatisfaction'] = 'RelationshipSatisfaction:' + df['RelationshipSatisfaction'].astype(str)
df['StockOptionLevel'] = 'StockOptionLevel:' + df['StockOptionLevel'].astype(str)
df['TotalWorkingYears'] = 'TotalWorkingYears:' + df['TotalWorkingYears'].astype(str)
df['TrainingTimesLastYear'] = 'TrainingTimesLastYear:' + df['TrainingTimesLastYear'].astype(str)
df['WorkLifeBalance'] = 'WorkLifeBalance:' + df['WorkLifeBalance'].astype(str)
df['YearsAtCompany'] = 'YearsAtCompany:' + df['YearsAtCompany'].astype(str)
df['YearsMean'] = 'YearsMean:' + df['YearsMean'].astype(str)


df.to_csv("./Dataset/AR.csv", header=False, index=False)
len(df)

1176

In [155]:
#Association Rules 
ar = pd.read_csv("./Dataset/AR.csv", header=None)
itemsets = apriori(ar.values.tolist(), min_support=0.3, min_confidence=0.7, min_lift=1.01, min_length=2)

ar = list(itemsets)
ar

[RelationRecord(items=frozenset({'Education:3', 'PerformanceRating:3.0'}), support=0.342687074829932, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Education:3'}), items_add=frozenset({'PerformanceRating:3.0'}), confidence=0.8779956427015251, lift=1.0142660862642372)]),
 RelationRecord(items=frozenset({'Education:3', 'Travel_Rarely'}), support=0.30272108843537415, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Education:3'}), items_add=frozenset({'Travel_Rarely'}), confidence=0.775599128540305, lift=1.0471923939878285)]),
 RelationRecord(items=frozenset({'PerformanceRating:3.0', 'JobLevel:1'}), support=0.33418367346938777, ordered_statistics=[OrderedStatistic(items_base=frozenset({'JobLevel:1'}), items_add=frozenset({'PerformanceRating:3.0'}), confidence=0.8811659192825113, lift=1.0179284097016044)]),
 RelationRecord(items=frozenset({'Research & Development', 'JobLevel:1'}), support=0.304421768707483, ordered_statistics=[OrderedStatistic(items_base=frozenset(

In [168]:
results = []
for item in ar: 
    pair =[]
    items = [x for x in pair]
    
    
    for i in item[0]: 
        pair.append(i)
    val2 = str(item[1])[:7]
    val3 = str(item[2][0][2])[:7]
    val4 = str(item[2][0][3])[:7]
    
    results.append((pair, val2, val3, val4))
    
    
label = ['title1', 'title2', 'title3', 'title4', 'title5', 'supp', 'conf', 'lift']
    
sugg = pd.DataFrame.from_records(results)
#sugg
sugg

Unnamed: 0,0,1,2,3
0,"[Education:3, PerformanceRating:3.0]",0.34268,0.87799,1.01426
1,"[Education:3, Travel_Rarely]",0.30272,0.77559,1.04719
2,"[PerformanceRating:3.0, JobLevel:1]",0.33418,0.88116,1.01792
3,"[Research & Development, JobLevel:1]",0.30442,0.80269,1.22752
4,"[JobLevel:2, No]",0.34948,0.95804,1.05
5,"[Married, No]",0.42857,0.92988,1.01915
6,"[Married, Travel_Rarely]",0.34948,0.7583,1.02383
7,"[StockOptionLevel:1, No]",0.3784,0.9468,1.03769
8,"[No, TotalWorkingYears:(6.0, 10.0]]",0.30357,0.92727,1.01628
9,"[NumCompaniesWorked:(-0.001, 1.0], Travel_Rarely]",0.37414,0.75085,1.01378
