In [1]:
import pandas as pd
import numpy as np

In [2]:
emp_data = pd.read_csv('data/HR_comma_sep.csv.txt')

In [3]:
emp_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [5]:
emp_data.rename(columns={'sales':'department'}, inplace=True)

In [6]:
emp_data.corr()['left']

satisfaction_level      -0.388375
last_evaluation          0.006567
number_project           0.023787
average_montly_hours     0.071287
time_spend_company       0.144822
Work_accident           -0.154622
left                     1.000000
promotion_last_5years   -0.061788
Name: left, dtype: float64

#### Selecting categorical columns & integer columns

In [7]:
cat_emp_data = emp_data.select_dtypes('object')

In [8]:
int_emp_data = emp_data.select_dtypes('int64')

#### Preprocessing Categorical Columns

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [10]:
le = LabelEncoder()
ohe = OneHotEncoder()

In [11]:
le.fit(cat_emp_data.department)

LabelEncoder()

In [12]:
cat_emp_data['department_tf'] = le.transform(cat_emp_data.department)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
cat_emp_data.head()

Unnamed: 0,department,salary,department_tf
0,sales,low,7
1,sales,medium,7
2,sales,medium,7
3,sales,low,7
4,sales,low,7


In [15]:
le.classes_

array(['IT', 'RandD', 'accounting', 'hr', 'management', 'marketing',
       'product_mng', 'sales', 'support', 'technical'], dtype=object)

In [16]:
le.inverse_transform([7])

array(['sales'], dtype=object)

In [17]:
ohe.fit(cat_emp_data[['department_tf']])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [18]:
department_tf = ohe.transform(cat_emp_data[['department_tf']]).toarray()

In [19]:
from sklearn.preprocessing import FunctionTransformer

In [20]:
def func(x):
    def mapping(d):
        if d == 'low':
            return 1
        elif d == 'medium':
            return 2
        else:
            return 3
    return x.map( mapping )
        
ft = FunctionTransformer(func, validate=False)

In [21]:
cat_emp_data['salary_tf'] = ft.transform(cat_emp_data.salary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


#### Preprocessing Number Data

In [22]:
int_emp_data.head()

Unnamed: 0,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
0,2,157,3,0,1,0
1,5,262,6,0,1,0
2,7,272,4,0,1,0
3,5,223,5,0,1,0
4,2,159,3,0,1,0


In [23]:
int_emp_data.drop('left',axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [25]:
from sklearn.preprocessing import MinMaxScaler

In [26]:
mms = MinMaxScaler()

In [27]:
mms.fit(int_emp_data)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [28]:
int_tf = mms.transform(int_emp_data)

In [29]:
int_tf

array([[0.        , 0.28504673, 0.125     , 0.        , 0.        ],
       [0.6       , 0.77570093, 0.5       , 0.        , 0.        ],
       [1.        , 0.82242991, 0.25      , 0.        , 0.        ],
       ...,
       [0.        , 0.21962617, 0.125     , 0.        , 0.        ],
       [0.8       , 0.85981308, 0.25      , 0.        , 0.        ],
       [0.        , 0.28971963, 0.125     , 0.        , 0.        ]])

In [30]:
float_tf = emp_data[['satisfaction_level','last_evaluation']].values

In [31]:
cat_emp_data['salary_tf'].values.shape

(14999,)

In [32]:
int_tf.shape

(14999, 5)

In [33]:
department_tf[:2]

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

#### Joining the data

In [34]:
salary_tf = cat_emp_data['salary_tf'].values.reshape(-1,1)

In [35]:
feature_data = np.hstack([department_tf,int_tf,float_tf,salary_tf])

In [36]:
target_data = emp_data.left

In [55]:
feature_data

array([[0.  , 0.  , 0.  , ..., 0.38, 0.53, 1.  ],
       [0.  , 0.  , 0.  , ..., 0.8 , 0.86, 2.  ],
       [0.  , 0.  , 0.  , ..., 0.11, 0.88, 2.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.37, 0.53, 1.  ],
       [0.  , 0.  , 0.  , ..., 0.11, 0.96, 1.  ],
       [0.  , 0.  , 0.  , ..., 0.37, 0.52, 1.  ]])

In [39]:
target_data

0        1
1        1
2        1
3        1
4        1
        ..
14994    1
14995    1
14996    1
14997    1
14998    1
Name: left, Length: 14999, dtype: int64

#### Split data into two parts

In [40]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(feature_data,target_data)

#### Model Training

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [42]:
lr = LogisticRegression()

In [43]:
lr.fit(trainX,trainY)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
rf = RandomForestClassifier(max_depth=7)

In [57]:
rf.fit(trainX,trainY)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### Model Validation

In [58]:
lr.score(testX,testY)

0.7936

In [59]:
rf.score(testX,testY)

0.9706666666666667

In [60]:
from sklearn.metrics import recall_score,precision_score, f1_score, classification_report

In [61]:
pred = rf.predict(testX)

In [62]:
precision_score(y_pred=pred, y_true=testY)

0.9686323713927227

In [63]:
f1_score(y_pred=pred, y_true=testY)

0.9334945586457073

In [64]:
print (classification_report(y_pred=pred, y_true=testY))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2893
           1       0.97      0.90      0.93       857

    accuracy                           0.97      3750
   macro avg       0.97      0.95      0.96      3750
weighted avg       0.97      0.97      0.97      3750



In [54]:
rf.feature_importances_

array([0.00175641, 0.00154798, 0.00194101, 0.00191524, 0.00175489,
       0.00108971, 0.00090355, 0.00292225, 0.00292361, 0.00387467,
       0.24201622, 0.17907235, 0.20604084, 0.00976136, 0.00170345,
       0.19963087, 0.12867097, 0.01247463])