### 04-03-2020

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
hr_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/HR_comma_sep.csv.txt')
hr_data.rename(columns={'sales':'dept'},inplace=True)

In [3]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
hr_data.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

### Inferances
Target(left) is discrete in nature so a classification problem

In [5]:
target_data = hr_data.left

### Taking care of categorical columns
dept & salary is categorical
ML algorithms only understands numerical data
We will use preprocessor LabelEncoder to convert this into numerical data


In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le_salary = LabelEncoder()

In [8]:
le_salary.fit(hr_data.salary)

LabelEncoder()

In [9]:
le_salary.transform(['low','medium','low'])

array([1, 2, 1], dtype=int32)

In [10]:
hr_data['salary_en'] = le_salary.transform(hr_data.salary)

In [11]:
le = LabelEncoder()

In [12]:
le.fit_transform(hr_data.salary)

array([1, 2, 2, ..., 1, 1, 1])

In [13]:
le_dept = LabelEncoder()

In [14]:
le_dept.fit(hr_data.dept)

LabelEncoder()

In [15]:
le_dept.transform(['sales','support'])

array([7, 8], dtype=int32)

In [16]:
le_dept.classes_

array(['IT', 'RandD', 'accounting', 'hr', 'management', 'marketing',
       'product_mng', 'sales', 'support', 'technical'], dtype=object)

In [17]:
hr_data['dept_en'] = le_dept.transform(hr_data.dept)

In [18]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,salary_en,dept_en
0,0.38,0.53,2,157,3,0,1,0,sales,low,1,7
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2,7
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2,7
3,0.72,0.87,5,223,5,0,1,0,sales,low,1,7
4,0.37,0.52,2,159,3,0,1,0,sales,low,1,7


In [19]:
#getting feature data from the complete information
feature_data = hr_data.drop(columns=['dept','salary','left'])

In [20]:
#splitting feature & target data into train & test
from sklearn.model_selection import train_test_split

In [21]:
trainX, testX, trainY, testY = train_test_split(feature_data, target_data)

In [22]:
trainX[:2]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en
10389,0.5,0.76,3,174,3,0,0,2,7
2530,0.49,0.8,2,275,2,0,0,1,7


In [23]:

# Observation : You are not training the model with string information

### Classification Algorithms
LogisticRegression, RandomForestClassifier, DecisicionTreeClassifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
rf = RandomForestClassifier()

In [26]:
rf.fit(trainX,trainY)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [27]:
data = [0.38,0.53,2,157,3,1,0,'support','high']

In [28]:
le_salary.transform(['high'])

array([0], dtype=int32)

In [29]:
le_dept.transform(['support'])

array([8], dtype=int32)

In [30]:
rf.predict(testX)

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [31]:
rf.score(testX,testY)

0.9890666666666666

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
lr = LogisticRegression()

In [34]:
lr.fit(trainX,trainY)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [35]:
lr.score(testX,testY)

0.7677333333333334

In [36]:
lr.predict(testX)

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [37]:
testX['lr_predict'] = lr.predict(testX)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
testX['rf_predict'] = rf.predict(testX.drop(columns=['lr_predict']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
testX['actual'] = testY

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
testX.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en,lr_predict,rf_predict,actual
6827,0.53,0.82,3,133,3,0,0,2,3,0,0,0
8671,0.58,0.94,4,225,2,0,0,1,9,0,0,0
12768,0.1,0.84,6,293,5,0,0,2,5,1,1,1
1415,0.2,0.9,6,138,3,0,0,1,9,0,0,1
5443,0.62,0.74,4,173,2,0,0,2,7,0,0,0


In [41]:
testX[testX.actual != testX.rf_predict]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en,lr_predict,rf_predict,actual
1415,0.2,0.9,6,138,3,0,0,1,9,0,0,1
14780,0.25,0.46,4,214,4,0,0,2,9,1,0,1
279,0.82,0.81,4,233,4,1,0,1,8,0,0,1
5697,0.81,0.98,6,196,2,0,0,1,8,0,1,0
827,0.7,0.74,6,136,3,0,0,1,0,0,0,1
1576,0.6,0.85,3,250,2,0,0,1,8,0,0,1
1199,0.75,0.74,6,134,3,0,0,2,8,0,0,1
1831,0.14,0.47,4,175,2,0,0,2,1,0,0,1
931,0.9,0.92,5,154,4,0,0,1,7,0,0,1
7443,0.85,0.96,4,240,6,0,0,2,9,0,1,0
