In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
import math
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.feature_selection import f_classif

In [2]:
# Load the HR employee-attrition CSV (path is relative to the working directory).
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(1470, 35)

In [4]:
# Single shared encoder; encode_col() refits it on every column it encodes,
# so after the encoding cell it only remembers the last column's classes.
le = preprocessing.LabelEncoder()

In [5]:
def encode_col(col_name):
    """Add an integer-encoded copy of a column to the module-level ``df``.

    The shared ``le`` encoder is refitted on ``df[col_name]`` and the
    resulting integer codes are written to a new column named
    ``"e." + col_name``. The source column is left unchanged.

    Parameters
    ----------
    col_name : str
        Name of an existing column of ``df``.

    Returns
    -------
    None
        Works purely by side effect: ``df`` gains one column and ``le``
        stays fitted on the column encoded by this call.
    """
    new_col_name = "e." + col_name
    # Fit and transform the whole column in one vectorised call; invoking
    # transform() separately for every row is needlessly slow.
    df[new_col_name] = le.fit_transform(df[col_name])
    return

In [6]:
# Encode each listed column; every call adds an "e.<name>" column to df.
# The order is kept because the shared encoder ends up fitted on the last one.
columns_to_encode = [
    'Attrition',
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
]
for column in columns_to_encode:
    encode_col(column)

In [7]:
# Feature matrix: raw columns plus the "e."-prefixed encoded columns.
# NOTE(review): EmployeeNumber looks like a row identifier, and EmployeeCount,
# StandardHours and e.Over18 may be constant -- confirm and consider dropping.
X = df[[
    "Age",
    "e.BusinessTravel",
    "DailyRate",
    "e.Department",
    "DistanceFromHome",
    "Education",
    "e.EducationField",
    "EmployeeCount",
    "EmployeeNumber",
    "EnvironmentSatisfaction",
    "e.Gender",
    "HourlyRate",
    "JobInvolvement",
    "JobLevel",
    "e.JobRole",
    "JobSatisfaction",
    "e.MaritalStatus",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "e.Over18",
    "e.OverTime",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StandardHours",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]]

In [8]:
# Target vector: the integer-encoded Attrition column produced by encode_col().
Y = df["e.Attrition"]

In [9]:
# Sanity check: one row per employee record, one column per selected feature.
X.shape

(1470, 34)

In [10]:
# Sanity check: the target must have the same number of rows as X.
Y.shape

(1470,)

In [11]:
# Column labels of X as a plain Python list, in column order.
feature_names = X.columns.tolist()

In [12]:
# Confirm feature_names is a built-in list rather than a pandas/NumPy container.
type(feature_names)

list

In [13]:
#selector = SelectKBest(chi2, k=10)

In [14]:
#selector = SelectKBest()

In [15]:
#selector.fit_transform(X, Y)

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=4,
)

# SelectKBest + Logistic Regression

In [17]:
# Pipeline: keep the 20 features with the highest chi-squared score against
# the target, then fit a logistic-regression classifier on those features.
pipe = make_pipeline(
    SelectKBest(chi2, k=20),
    linear_model.LogisticRegression()
)
# fit() returns the pipeline itself, so train once and keep that reference
# instead of fitting the same model a second time on identical data.
trained_model = pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [18]:
# Accuracy of the logistic-regression pipeline on the held-out test split.
print(metrics.accuracy_score(y_test,y_pred))

0.884353741497


In [19]:
# Accuracy on the training split, for comparison with the test score above.
print(metrics.accuracy_score(y_train,trained_model.predict(X_train)))

0.859693877551


In [20]:
# Confusion matrix for the logistic-regression predictions.
# scikit-learn convention: rows are true classes, columns are predicted classes.
matrix = confusion_matrix(y_test,y_pred)
print(matrix)

[[247   4]
 [ 30  13]]


In [22]:
# NOTE(review): MSE is a regression metric. With two integer-encoded classes
# it reduces to the misclassification rate (1 - accuracy), so it adds no
# information beyond the accuracy printed earlier.
mean_squared_error(y_test,y_pred)

0.11564625850340136

In [23]:
# NOTE(review): R^2 is a regression metric and is hard to interpret for class
# labels; precision/recall (e.g. classification_report) would be more telling.
r2_score(y_test,y_pred)

0.073844158250717884

In [24]:

#report = classification_report(y_test,y_pred)
#print(report)
#type(selector)

In [25]:
#mask = selector.get_support()

In [26]:
#params = selector.get_params(True)

In [27]:
#type(params)

In [28]:
#params.values()

In [29]:
#type(mask)

In [30]:
#mask.view()

In [31]:
#new_features = []

In [32]:
#for index, feature in zip(mask, feature_names):
    #if index:
        #new_features.append(feature)

In [33]:
#for p in new_features: 
    #print(p)

# SelectKBest + K-Nearest Neighbors (KNN)

In [34]:
# Number of rows in the full data frame (train and test together); used by the
# KNN cell to derive the neighbour count.
l = len(df.index)

In [35]:
# Pipeline: keep the 20 features with the highest chi-squared score, then
# classify with k-nearest neighbours, where k = floor(sqrt(l)).
# NOTE(review): l counts every row of df, not only the training rows --
# confirm this is the intended basis for the square-root rule of thumb.
pipe1 = make_pipeline(
    SelectKBest(chi2, k=20),
    KNeighborsClassifier(n_neighbors=int(math.sqrt(l)))
)
# fit() returns the pipeline itself, so train once and keep that reference
# instead of fitting the same model a second time on identical data.
trained_model1 = pipe1.fit(X_train, y_train)
y_pred1 = pipe1.predict(X_test)

In [36]:
# Accuracy of the KNN pipeline on the held-out test split.
print(metrics.accuracy_score(y_test,y_pred1))

0.853741496599


In [37]:
# Accuracy of the KNN pipeline on the training split, for comparison.
print(metrics.accuracy_score(y_train,trained_model1.predict(X_train)))

0.835034013605


In [38]:
# Confusion matrix for the KNN predictions (rows: true class, columns:
# predicted class). This rebinds `matrix`, replacing the logistic-regression one.
matrix = confusion_matrix(y_test,y_pred1)
print(matrix)

[[251   0]
 [ 43   0]]


In [40]:
# NOTE(review): MSE is a regression metric. With two integer-encoded classes
# it reduces to the misclassification rate (1 - accuracy) of the KNN pipeline.
mean_squared_error(y_test,y_pred1)

0.14625850340136054

In [41]:
# NOTE(review): R^2 is a regression metric and is hard to interpret for class
# labels; precision/recall (e.g. classification_report) would be more telling.
r2_score(y_test,y_pred1)

-0.17131474103585664

In [None]:
#report = classification_report(y_test,y_pred1)
#print(report)

In [42]:
#clf=RandomForestClassifier(random_state=0)

In [43]:
#pipe1 = make_pipeline(
#    SelectKBest(chi2,k=20), 
#    RandomForestClassifier(random_state=0)
#)
#pipe1.fit(X_train, y_train)
#y_pred1 = pipe1.predict(X_test)
#trained_model1 = pipe1.fit(X_train,y_train)

In [44]:
#print(metrics.accuracy_score(y_test,y_pred1))

In [45]:
#print(metrics.accuracy_score(y_train,trained_model1.predict(X_train)))

In [46]:
#matrix = confusion_matrix(y_test,y_pred1)
#print(matrix)

In [47]:
#for i in range(1,25) :
#pipe1 = make_pipeline(
#        SelectKBest(chi2,k=20), 
#        RandomForestClassifier(random_state=0)
#    )
#pipe1.fit(X_train, y_train)
#y_pred1 = pipe1.predict(X_test)
#print(metrics.accuracy_score(y_test,y_pred))
#matrix = confusion_matrix(y_test,y_pred1)
#print(matrix)