In [1]:
import pandas as pd
import numpy as np
from numpy import array
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [2]:
# Load the HR attrition CSV (path is relative to the working directory) into
# `df`; encode_col later adds its "e_*" columns to this same frame.
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
# One LabelEncoder shared by every encode_col call. It is re-fitted per call,
# so afterwards it only remembers the classes of the last column encoded.
le = preprocessing.LabelEncoder()

In [4]:
def encode_col(col_name):
    """Add an integer-coded copy of a column to the global ``df``.

    The shared module-level encoder ``le`` is re-fitted on ``df[col_name]``
    and the resulting codes are stored in a new column named
    ``"e_" + col_name``. The source column is left untouched.

    Parameters
    ----------
    col_name : str
        Name of an existing column in ``df``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # Fit and transform the whole column in one vectorised call instead of
    # calling ``transform`` once per row through ``Series.map`` and then
    # unwrapping each one-element array with a second ``map``.
    df["e_" + col_name] = le.fit_transform(df[col_name])

In [5]:
# Label-encode every column that needs it in a single cell instead of nine
# copy-pasted ones. Each call adds an "e_<name>" column to `df`; the order
# matches the original cells.
columns_to_encode = [
    'Attrition',
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
]
for col_name in columns_to_encode:
    encode_col(col_name)

In [14]:
# Feature matrix: every numeric column plus the "e_*" label-encoded columns,
# in the order the later ranking output is aligned to.
# NOTE(review): EmployeeNumber looks like an identifier, and EmployeeCount /
# StandardHours / e_Over18 may be constant — confirm they belong here.
X = df[[
    "Age",
    "e_BusinessTravel",
    "DailyRate",
    "e_Department",
    "DistanceFromHome",
    "Education",
    "e_EducationField",
    "EmployeeCount",
    "EmployeeNumber",
    "EnvironmentSatisfaction",
    "e_Gender",
    "HourlyRate",
    "JobInvolvement",
    "JobLevel",
    "e_JobRole",
    "JobSatisfaction",
    "e_MaritalStatus",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "e_Over18",
    "e_OverTime",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StandardHours",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]]

In [15]:
# Target vector: the label-encoded Attrition column created by encode_col.
y = df["e_Attrition"]

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Base estimator whose fitted coefficients RFE uses to rank features.
# NOTE(review): the features are not scaled before fitting, so coefficient
# magnitudes may not be comparable across columns — confirm this is intended.
log_reg = linear_model.LogisticRegression(random_state=42)

In [18]:
# Eliminate down to a single feature so that ranking_ orders every column,
# from 1 (the one kept) up to the number of columns (dropped first).
rfe = RFE(log_reg, n_features_to_select=1)

In [19]:
# fit() returns the fitted RFE object itself, so `trained_model` and `rfe`
# are two names for the same selector.
trained_model=rfe.fit(x_train,y_train)

In [20]:
# Echo the fitted selector to show the hyper-parameters it was built with.
trained_model

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=1, step=1, verbose=0)

In [21]:
# Boolean mask over X's columns; True marks the feature(s) RFE kept.
print(rfe.support_)

[False False False False False False False False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False False]


In [22]:
# Per-column rank, aligned with X's column order; 1 is the selected feature.
rank = rfe.ranking_

In [23]:
# Sanity check: ranking_ is a NumPy array.
type(rank)

numpy.ndarray

In [24]:
# .view() is used here only to echo the array; it does not modify `rank`.
rank.view()

array([26, 27, 30,  5, 24, 20, 18, 29, 32,  8,  7, 28,  4,  3, 21,  6,  2,
       31, 33, 13, 34,  1, 23, 22, 14, 25, 10, 19, 15,  9, 17, 11, 12, 16])

In [25]:
# Column labels of X as a plain list, in the same order as `rank`.
feature_names = list(X.columns.values)

In [26]:
# Convert to an ndarray so the names can be stacked next to the ranks.
features = array(feature_names)

In [27]:
# Sanity check: the names are now held in a NumPy array.
type(features)

numpy.ndarray

In [28]:
# Echo the feature-name array (again via .view(), which leaves it unchanged).
features.view()

array(['Age', 'e_BusinessTravel', 'DailyRate', 'e_Department',
       'DistanceFromHome', 'Education', 'e_EducationField',
       'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction',
       'e_Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'e_JobRole',
       'JobSatisfaction', 'e_MaritalStatus', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'e_Over18', 'e_OverTime',
       'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='<U24')

In [29]:
# Pair each rank with its feature name, one row per column of X.
# The stacked array has a single string dtype, so the integer ranks end up as
# strings here (visible in the echoed array) — sort numerically on `rank`,
# not on `con`.
con = np.column_stack((rank,features))

In [30]:
# Echo the (rank, feature name) table.
con.view()

array([['26', 'Age'],
       ['27', 'e_BusinessTravel'],
       ['30', 'DailyRate'],
       ['5', 'e_Department'],
       ['24', 'DistanceFromHome'],
       ['20', 'Education'],
       ['18', 'e_EducationField'],
       ['29', 'EmployeeCount'],
       ['32', 'EmployeeNumber'],
       ['8', 'EnvironmentSatisfaction'],
       ['7', 'e_Gender'],
       ['28', 'HourlyRate'],
       ['4', 'JobInvolvement'],
       ['3', 'JobLevel'],
       ['21', 'e_JobRole'],
       ['6', 'JobSatisfaction'],
       ['2', 'e_MaritalStatus'],
       ['31', 'MonthlyIncome'],
       ['33', 'MonthlyRate'],
       ['13', 'NumCompaniesWorked'],
       ['34', 'e_Over18'],
       ['1', 'e_OverTime'],
       ['23', 'PercentSalaryHike'],
       ['22', 'PerformanceRating'],
       ['14', 'RelationshipSatisfaction'],
       ['25', 'StandardHours'],
       ['10', 'StockOptionLevel'],
       ['19', 'TotalWorkingYears'],
       ['15', 'TrainingTimesLastYear'],
       ['9', 'WorkLifeBalance'],
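       ['17', 'YearsAtCompany'],
       ['11', 'YearsInCurrentRole'],
       ['12', 'YearsSinceLastPromotion'],
       ['16', 'YearsWithCurrManager']],
      dtype='<U24')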

In [31]:
# Test-set accuracy. predict() goes through the fitted RFE, so it uses only
# the feature(s) the selector kept.
predictions=trained_model.predict(x_test)
accuracy_score(y_test,predictions)


0.86734693877551017

In [32]:
# Training-set accuracy, for comparison with the test-set score above.
# The split cell names the training features `x_train` (lower-case x); the
# previous `X_train` did not exist and raised NameError.
accuracy_score(y_train, trained_model.predict(x_train))

In [None]:
# Confusion matrix for the test predictions; accuracy alone does not show how
# the errors are split between the classes.
confusion_matrix(y_test,predictions)