In [1]:
import pandas as pd   
import numpy as np    
import matplotlib.pyplot as plt 
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, log_loss
SEED = 42

In [2]:
df=pd.read_csv("train.csv")

In [None]:
#emptest=pd.read_csv("test.csv")

In [None]:
df.head()

In [None]:
df.columns

In [None]:
emp.corr()

In [None]:
emp.describe()

In [None]:
def draw_histograms(dataframe, features, rows, cols):
    fig=plt.figure(figsize=(20,20))
    for i, feature in enumerate(features):
        ax=fig.add_subplot(rows,cols,i+1)
        dataframe[feature].hist(bins=20,ax=ax,facecolor='midnightblue')
        ax.set_title(feature+" Distribution",color='DarkRed')
        
    fig.tight_layout()  
    plt.show()
draw_histograms(emp,emp.columns,8,4)

In [None]:
sns.countplot(x='Attrition',data=emp)

In [3]:

# to create new feature whether JobRole is related to EducationField, 0 = not related, 1 = related, 2 = somewhat related
# HR seems to have done fabulously in job matching as this feature did not emerge useful
df['EduField_Dept'] = 0
for i, row in df.iterrows():
    if row['EducationField']=='Human Resources' and row['Department']=='Human Resources':
        df['EduField_Dept'][i] = 1
    elif row['EducationField']=='Life Sciences' and row['Department']=='Sales':
        df['EduField_Dept'][i] = 2
    elif row['EducationField']=='Life Sciences' and row['Department']=='Research & Development':
        df['EduField_Dept'][i] = 1
    elif row['EducationField']=='Medical' and row['Department']=='Sales':
        df['EduField_Dept'][i] = 2
    elif row['EducationField']=='Medical' and row['Department']=='Research & Development':
        df['EduField_Dept'][i] = 1
    elif row['EducationField']=='Technical Degree' and row['Department']=='Sales':
        df['EduField_Dept'][i] = 2
    elif row['EducationField']=='Technical Degree' and row['Department']=='Research & Development':
        df['EduField_Dept'][i] = 1
    elif row['EducationField']=='Marketing' and row['Department']=='Sales':
        df['EduField_Dept'][i] = 1
        

In [4]:
# to drop useless features

# label encoding to clean up categorical data
categorical_data = { 
    'BusinessTravel': {'Non-Travel':0, 'Travel_Frequently':1, 'Travel_Rarely':2}, 
    'Department': {'Human Resources':0, 'Research & Development':1, 'Sales':2}, 
    'EducationField': {'Human Resources':0, 'Life Sciences':1, 'Marketing':2, 'Medical':3, 'Technical Degree':4, 'Other':5}, 
    'Gender': {'Female':0, 'Male':0}, 
    'JobRole': {'Healthcare Representative':0, 'Human Resources':1, 'Laboratory Technician':2, 'Manager':3, 'Manufacturing Director':4, 'Research Director':5, 'Research Scientist':6, 'Sales Executive':7, 'Sales Representative':8}, 
    'MaritalStatus': {'Divorced':0, 'Single':1, 'Married':2}, 
    #'Over18': {'Y':1}, 
    'OverTime': {'No':0, 'Yes':1}    }
df = df.replace(categorical_data)
df

Unnamed: 0,Id,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill,Behaviour,EduField_Dept
0,1,30,0,0,1,2,3,3,571,3,...,0,12,2,11,7,6,7,4,1,1
1,2,36,0,2,1,12,4,1,1614,3,...,2,7,2,3,2,1,1,2,1,1
2,3,55,1,2,2,2,1,3,842,3,...,0,12,3,9,7,7,3,5,1,2
3,4,39,0,2,1,24,1,1,2014,1,...,0,18,2,7,7,1,7,4,1,1
4,5,37,0,2,1,3,3,5,689,3,...,1,10,2,10,7,7,8,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1623,1624,42,1,1,1,19,3,3,752,3,...,0,7,2,2,2,2,2,3,1,1
1624,1625,55,1,2,2,2,1,3,842,3,...,0,12,3,9,7,7,3,5,1,2
1625,1626,25,1,2,2,9,2,1,1439,1,...,0,6,2,3,2,2,2,5,1,2
1626,1627,29,1,2,0,13,3,0,1844,1,...,3,4,3,2,2,2,0,5,1,1


In [5]:
X, y = df.drop('Attrition', axis=1), df['Attrition']

In [6]:
df = df.drop(columns=['TotalWorkingYears', 'YearsWithCurrManager'])

In [7]:
df = df.drop(columns=['YearsInCurrentRole'])

In [8]:

# re-run correlation matrix heatmap: YearsAtCompany & MonthlyIncome = 0.51
df = df.drop(columns=['YearsAtCompany'])

In [9]:
df = df.drop(columns=['Age'])

In [10]:
df = df.drop(columns=['EmployeeNumber'])

In [11]:
# define feature X and target y dataset (X and y are commonly used in sklearn)
X, y = df.drop('Attrition', axis=1), df['Attrition']
print('X', X.shape, 'y', y.shape)

# split data to 80:20 ratio for train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.29, random_state=SEED, stratify=y)
print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

# kf = KFold(n_splits=5, shuffle=True, random_state=SEED)   # this may result in imbalance classes in each fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)   # use this

X (1628, 23) y (1628,)
X_train (1155, 23)
y_train (1155,)
X_test (473, 23)
y_test (473,)


In [67]:
model=RandomForestClassifier(random_state = 1, n_estimators = 70,max_depth=1)

In [68]:
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=1, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [14]:
df1=pd.read_csv('test.csv')
df1.head()

Unnamed: 0,Id,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill,Behaviour
0,1,28,Travel_Rarely,Research & Development,9,3,Medical,377,4,Male,...,4,1,5,3,5,2,0,4,5,1
1,2,31,Travel_Rarely,Sales,6,4,Medical,653,1,Male,...,4,2,13,4,7,7,5,7,3,1
2,3,37,Travel_Rarely,Research & Development,6,3,Medical,474,3,Male,...,3,2,13,2,7,7,6,7,4,1
3,4,42,Travel_Rarely,Research & Development,1,2,Life Sciences,827,4,Female,...,3,1,8,4,4,3,0,2,5,1
4,5,45,Non-Travel,Research & Development,4,2,Life Sciences,972,3,Male,...,3,0,9,5,9,7,0,8,2,1


In [15]:

# to create new feature whether JobRole is related to EducationField, 0 = not related, 1 = related, 2 = somewhat related
# HR seems to have done fabulously in job matching as this feature did not emerge useful
df1['EduField_Dept'] = 0
for i, row in df.iterrows():
    if row['EducationField']=='Human Resources' and row['Department']=='Human Resources':
        df1['EduField_Dept'][i] = 1
    elif row['EducationField']=='Life Sciences' and row['Department']=='Sales':
        df1['EduField_Dept'][i] = 2
    elif row['EducationField']=='Life Sciences' and row['Department']=='Research & Development':
        df1['EduField_Dept'][i] = 1
    elif row['EducationField']=='Medical' and row['Department']=='Sales':
        df1['EduField_Dept'][i] = 2
    elif row['EducationField']=='Medical' and row['Department']=='Research & Development':
        df1['EduField_Dept'][i] = 1
    elif row['EducationField']=='Technical Degree' and row['Department']=='Sales':
        df1['EduField_Dept'][i] = 2
    elif row['EducationField']=='Technical Degree' and row['Department']=='Research & Development':
        df1['EduField_Dept'][i] = 1
    elif row['EducationField']=='Marketing' and row['Department']=='Sales':
        df1['EduField_Dept'][i] = 1
        

In [16]:
# to drop useless features

# label encoding to clean up categorical data
categorical_data = { 
    'BusinessTravel': {'Non-Travel':0, 'Travel_Frequently':1, 'Travel_Rarely':2}, 
    'Department': {'Human Resources':0, 'Research & Development':1, 'Sales':2}, 
    'EducationField': {'Human Resources':0, 'Life Sciences':1, 'Marketing':2, 'Medical':3, 'Technical Degree':4, 'Other':5}, 
    'Gender': {'Female':0, 'Male':0}, 
    'JobRole': {'Healthcare Representative':0, 'Human Resources':1, 'Laboratory Technician':2, 'Manager':3, 'Manufacturing Director':4, 'Research Director':5, 'Research Scientist':6, 'Sales Executive':7, 'Sales Representative':8}, 
    'MaritalStatus': {'Divorced':0, 'Single':1, 'Married':2}, 
    #'Over18': {'Y':1}, 
    'OverTime': {'No':0, 'Yes':1}    }
df1 = df1.replace(categorical_data)
df1

Unnamed: 0,Id,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill,Behaviour,EduField_Dept
0,1,28,2,1,9,3,3,377,4,0,...,1,5,3,5,2,0,4,5,1,0
1,2,31,2,2,6,4,3,653,1,0,...,2,13,4,7,7,5,7,3,1,0
2,3,37,2,1,6,3,3,474,3,0,...,2,13,2,7,7,6,7,4,1,0
3,4,42,2,1,1,2,1,827,4,0,...,1,8,4,4,3,0,2,5,1,0
4,5,45,0,1,4,2,1,972,3,0,...,0,9,5,9,7,0,8,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,466,32,2,1,2,4,1,1692,4,0,...,0,1,2,1,0,0,0,5,1,0
466,467,18,1,2,3,2,3,1624,2,0,...,0,0,2,0,0,0,0,2,1,0
467,468,24,2,1,23,3,3,639,2,0,...,2,6,3,6,5,1,4,1,1,0
468,469,31,2,1,23,3,3,367,2,0,...,1,10,2,9,0,7,8,3,1,0


In [17]:
df1 = df1.drop(columns=['TotalWorkingYears', 'YearsWithCurrManager'])

In [18]:
df1 = df1.drop(columns=['YearsInCurrentRole'])

In [19]:
df1 = df1.drop(columns=['YearsAtCompany'])

In [20]:
df1 = df1.drop(columns=['Age'])

In [22]:
df1 = df1.drop(columns=['EmployeeNumber'])

In [23]:
df1.shape

(470, 23)

In [24]:
df1.columns

Index(['Id', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TrainingTimesLastYear',
       'YearsSinceLastPromotion', 'CommunicationSkill', 'Behaviour',
       'EduField_Dept'],
      dtype='object')

In [25]:
df.shape

(1628, 24)

In [26]:
df.columns

Index(['Id', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
       'JobInvolvement', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TrainingTimesLastYear',
       'YearsSinceLastPromotion', 'CommunicationSkill', 'Behaviour',
       'EduField_Dept'],
      dtype='object')

In [69]:
y_pred = model.predict_proba(df1)[:,1]

In [70]:
accuracy1 = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy'))
accuracy1

0.9064935064935064

In [46]:
y_pred.shape

(470,)

In [59]:
d1 = {'Id': df1.loc[:,'Id'], 'Attrition':y_pred[:,]}

In [60]:
df2 = pd.DataFrame(d1)

In [37]:
df2.shape

(470, 2)

In [61]:
df2.head(10)

Unnamed: 0,Id,Attrition
0,1,0.400528
1,2,0.397526
2,3,0.420013
3,4,0.405402
4,5,0.370869
5,6,0.385331
6,7,0.419734
7,8,0.412381
8,9,0.418642
9,10,0.396529


In [62]:
attrition5_csv = df2.to_csv('C:/Users/Shubh/Desktop/DS projects/Summer analytics project/attrition5_csv.csv',index=False)