In [1]:
import numpy as np
import pandas as pd
from patsy import dmatrices #generate data for model training
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt
# from Jiuzhang ML Project

In [2]:
# Load the HR CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/HR_comma_sep.csv')
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [3]:
# List every column together with its dtype; `object` means string here.
data.dtypes

In [4]:
# Salary band vs. `left`, as an example of a discrete variable.
salary_left_counts = pd.crosstab(data.salary, data.left)
salary_left_counts.plot.bar()
plt.show()
# left = 0 is far more frequent than left = 1; the next cell looks at proportions instead of raw counts.

In [5]:
# Within each salary band, what share of employees stayed (left = 0) vs. left (left = 1)?
q = pd.crosstab(data.salary, data.left)
# Row totals = head-count per salary band; these are the denominators used below.
band_totals = q.sum(axis=1)
print(band_totals)
# Divide each row by its own total so every salary band sums to 1.
# Dividing by the column totals (q.sum(0) with axis=1) would instead show how the
# stayers / leavers are spread across salary bands, which is not the per-band rate.
q.div(band_totals, axis=0).plot(kind = 'bar')
plt.show()
# Original observation (re-check against the row-normalised plot): in high and medium
# salary bands employees tend to stay, while low-salary employees leave more often.
# This dataset only has salary as low / medium / high; actual salary figures would be more precise.

In [6]:
# Show the raw salary x left count table.
q

In [7]:
# now look at the distribution of satisfaction level, as an example of a continuous variable

In [8]:
# Satisfaction histogram for employees who stayed (left == 0).
stayed_mask = data.left == 0
data.loc[stayed_mask, 'satisfaction_level'].hist()
plt.show()
# For people who stay, satisfaction is mostly high. Cutoff is clearly at 0.5.

In [9]:
# Satisfaction histogram for employees who left (left == 1).
left_mask = data.left == 1
data.loc[left_mask, 'satisfaction_level'].hist()
plt.show()
# For people who left, many are not satisfied, but there is no clear cutoff.

In [23]:
# Work accidents vs. `left`: grouped bar chart of the count table.
accident_left_counts = pd.crosstab(data.Work_accident, data.left)
accident_left_counts.plot.bar()
plt.show()

In [36]:
# Distribution of average monthly hours, split by `left`.
# Histogram the raw hours of each group. A histogram of the crosstab would bin the
# per-hour-value employee COUNTS instead, so its x-axis would show counts (including 0
# for hour values nobody in a group has), not hours.
hour_bins = np.linspace(data.average_montly_hours.min(),
                        data.average_montly_hours.max(), 31)  # shared edges so groups are comparable
for left_flag, group in data.groupby('left'):
    group.average_montly_hours.hist(bins=hour_bins, alpha=0.5,
                                    label='left = {}'.format(left_flag))
plt.xlabel('average_montly_hours')
plt.ylabel('number of employees')
plt.legend()
plt.show()

In [29]:
# Time spent at the company vs. `left`.
# Plot the count table as stacked bars, one bar per distinct time_spend_company value.
# kind='hist' on a crosstab would histogram the counts themselves, putting counts
# rather than time spent on the x-axis.
# NOTE(review): assumes time_spend_company has only a few distinct values - confirm.
pd.crosstab(data.time_spend_company, data.left).plot(kind = 'bar', stacked = True)
plt.show()

In [34]:
# Promotion in the last 5 years vs. `left`: side-by-side (unstacked) bars.
promotion_left_counts = pd.crosstab(data.promotion_last_5years, data.left)
promotion_left_counts.plot.bar(stacked=False)
plt.show()

In [37]:
# `sales` column vs. `left`: side-by-side (unstacked) bars.
sales_left_counts = pd.crosstab(data.sales, data.left)
sales_left_counts.plot.bar(stacked=False)
plt.show()
# My guess is that `sales` actually means department here.

In [10]:
# Start modelling: a logistic-regression classifier built with its default arguments.
model = LogisticRegression()

In [11]:
# Re-display the column dtypes before writing the model formula.
data.dtypes

In [12]:
# patsy formula layout:  y ~ x1 + x2 + x3 ...
# The formula is split over adjacent string literals purely for readability;
# Python concatenates them into one string.
formula = (
    'left~satisfaction_level+last_evaluation+number_project'
    '+average_montly_hours+time_spend_company+Work_accident'
    '+promotion_last_5years+C(sales)+C(salary)'
)
y, X = dmatrices(formula, data, return_type='dataframe')

In [13]:
# The target must be a flat numeric vector, so flatten y to 1-D.
y = np.asarray(y).ravel()
y

In [24]:
# Total of the target vector, i.e. the number of rows with left == 1.
y.sum()

In [14]:
# Fit the classifier on the full design matrix (no hold-out at this point).
model.fit(X, y)

In [15]:
# Score on the same data the model was fitted on (in-sample).
train_score = model.score(X, y)
print(train_score)

In [16]:
# Baseline: accuracy obtained by predicting 0 (stayed) for every row.
# y.mean() is the fraction of 1s, so 1 - y.mean() is the fraction of 0s.
1 - y.mean()

In [17]:
# is this unbalanced data?
# yes it is; we can compensate by weighting the classes (not the features), e.g. via class weights

In [18]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [19]:
# Second classifier, fitted on the training split only; fit() returns the estimator itself.
model2 = LogisticRegression().fit(Xtrain, ytrain)
model2

In [20]:
# Predict the held-out rows and measure accuracy against the true labels.
pred = model2.predict(Xtest)
metrics.accuracy_score(y_true=ytest, y_pred=pred)

In [21]:
# Text report for the held-out predictions.
report = metrics.classification_report(ytest, pred)
print(report)

In [22]:
# Mean accuracy over 10 cross-validation folds.
# NOTE(review): this estimator uses C=1e5 while `model` / `model2` use the default C - confirm this is intended.
cv_scores = cross_val_score(LogisticRegression(C=1e5), X, y, scoring='accuracy', cv=10)
print(cv_scores.mean())
# This accuracy is expected to be lower than the in-sample score of `model`: each fold is
# scored on rows it was not fitted on, so the estimate is not inflated by overfitting.