In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC



**Data Pre-processing**

In [None]:
# Load the HR dataset straight from its hosted CSV into a DataFrame
DATA_URL = "https://raw.githubusercontent.com/mwitiderrick/kerasDO/master/HR_comma_sep.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# Inspect the dataset dimensions as a (rows, columns) tuple; the bare
# expression is shown as the cell output.
df.shape

(14999, 10)

In [None]:
# Preview the first 10 rows to check the column names and the kind of
# values each column holds before any transformation.
df.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
5,0.41,0.5,2,153,3,0,1,0,sales,low
6,0.1,0.77,6,247,4,0,1,0,sales,low
7,0.92,0.85,5,259,5,0,1,0,sales,low
8,0.89,1.0,5,224,5,0,1,0,sales,low
9,0.42,0.53,2,142,3,0,1,0,sales,low


In [None]:
# Count how many rows fall into each value of the target column `left`
df['left'].value_counts()

0    11428
1     3571
Name: left, dtype: int64

The **get_dummies()** function converts categorical variables into dummy/indicator variables.

In [None]:
# One-hot encode the two categorical columns. drop_first=True drops the first
# level of each column so the dummies are not perfectly collinear.
# dtype is pinned to uint8 (the pre-2.0 pandas default): newer pandas emits
# bool dummies, which would turn `df.values` into an object array once mixed
# with the numeric columns and would no longer display as 0/1.
df = pd.get_dummies(df, columns=['department', 'salary'], drop_first=True, dtype=np.uint8)

In [None]:
# Preview the first 10 rows again to confirm that `department` and `salary`
# have been replaced by their dummy/indicator columns.
df.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
5,0.41,0.5,2,153,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
6,0.1,0.77,6,247,4,0,1,0,0,0,0,0,0,0,1,0,0,1,0
7,0.92,0.85,5,259,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
8,0.89,1.0,5,224,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
9,0.42,0.53,2,142,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


**Separating Training and Testing Datasets**

In [None]:
# Features: every column except the target, as a plain NumPy array.
# Target: the `left` column, kept as a pandas Series.
X = df.drop(columns=['left']).values
y = df['left']
# One shape per line: features first, then target
print(X.shape, y.shape, sep='\n')

(14999, 18)
(14999,)


In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=1,
)
# One shape per line, in the order: X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(11999, 18)
(3000, 18)
(11999,)
(3000,)


**Transforming the Data**

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks into the fit.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

**Building the Model**

- Logistic Regression

In [None]:
# Configure and fit an L2-regularised logistic regression using the SAG solver
lr_params = dict(penalty='l2', solver='sag', C=1.0, random_state=20)
LogisticRegressionModel = LogisticRegression(**lr_params)
# Left as the last expression so the fitted estimator's repr is displayed
LogisticRegressionModel.fit(X_train, y_train)


LogisticRegression(random_state=20, solver='sag')

In [None]:
# Report mean accuracy on both splits plus basic fit diagnostics.
# NOTE: read the accuracy against the class balance shown earlier
# (11428 of 14999 rows are class 0), since accuracy alone can look
# better than the model really is on imbalanced targets.
print('LogisticRegressionModel Train Score is : ' , LogisticRegressionModel.score(X_train, y_train))
print('LogisticRegressionModel Test Score is : ' , LogisticRegressionModel.score(X_test, y_test))
print('LogisticRegressionModel Classes are : ' , LogisticRegressionModel.classes_)
# Fixed typo in the label: "iteratios" -> "iterations"
print('LogisticRegressionModel No. of iterations is : ' , LogisticRegressionModel.n_iter_)
print('----------------------------------------------------')


LogisticRegressionModel Train Score is :  0.7899824985415451
LogisticRegressionModel Test Score is :  0.7916666666666666
LogisticRegressionModel Classes are :  [0 1]
LogisticRegressionModel No. of iteratios is :  [27]
----------------------------------------------------


In [None]:
# Predict hard class labels and per-class probabilities for the test split
y_pred, y_pred_prob = (
    LogisticRegressionModel.predict(X_test),
    LogisticRegressionModel.predict_proba(X_test),
)
# Only the first 10 rows of each result are printed as a preview
print('Predicted Value for LogisticRegressionModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for LogisticRegressionModel is : ' , y_pred_prob[:10])


Predicted Value for LogisticRegressionModel is :  [0 1 0 0 0 0 0 0 0 1]
Prediction Probabilities Value for LogisticRegressionModel is :  [[0.85303609 0.14696391]
 [0.31975633 0.68024367]
 [0.7600967  0.2399033 ]
 [0.88699855 0.11300145]
 [0.76245577 0.23754423]
 [0.87434881 0.12565119]
 [0.91821849 0.08178151]
 [0.8748963  0.1251037 ]
 [0.93701419 0.06298581]
 [0.39242779 0.60757221]]


- SVC

In [None]:
# Configure and fit a support vector classifier with an RBF kernel
# (kernel may also be linear, poly, sigmoid or precomputed).
# NOTE(review): max_iter=1000 is a hard cap on solver iterations; if the cap
# is reached the fit stops early — confirm that this limit is intended.
svc_params = dict(kernel='rbf', C=1.0, gamma='auto', max_iter=1000)
SVCModel = SVC(**svc_params)
# Left as the last expression so the fitted estimator's repr is displayed
SVCModel.fit(X_train, y_train)




SVC(gamma='auto', max_iter=1000)

In [None]:
# Report mean accuracy of the SVC on the training and test splits
print('SVCModel Train Score is : ' , SVCModel.score(X_train, y_train))
print('SVCModel Test Score is : ' , SVCModel.score(X_test, y_test))
# Separator line: present in this cell's recorded output and in the matching
# LogisticRegression details cell, but missing from the code.
print('----------------------------------------------------')




SVCModel Train Score is :  0.9483290274189515
SVCModel Test Score is :  0.952
----------------------------------------------------


In [None]:
# Predict class labels for the test split with the SVC.
# NOTE: this rebinds y_pred, which previously held the LogisticRegression predictions.
y_pred = SVCModel.predict(X_test)
svc_preview = y_pred[:10]  # first 10 predictions only
print('Predicted Value for SVCModel is : ' , svc_preview)

Predicted Value for SVCModel is :  [0 0 0 0 0 0 0 0 0 1]


**Making a Single Prediction**

In [None]:
# A single new observation, laid out in the same column order as X
# (the encoded frame's columns with `left` removed).
# satisfaction_level, last_evaluation, number_project, average_montly_hours,
# time_spend_company, Work_accident, promotion_last_5years
numeric_part = [0.26, 0.7, 3., 238., 6., 0., 0.]
# department dummies: RandD, accounting, hr, management, marketing,
# product_mng, sales, support, technical  -> support is set
department_part = [0., 0., 0., 0., 0., 0., 0., 1., 0.]
# salary dummies: low, medium  -> medium is set
salary_part = [0., 1.]
# Shape (1, 18): one row, so it can be passed to transform/predict directly
col = np.array([numeric_part + department_part + salary_part])

In [None]:
# Scale the new row with the scaler fitted on the training data, then classify it
col_scaled = sc.transform(col)
new_pred = SVCModel.predict(col_scaled)

In [None]:
# Display the predicted value of the target `left` for the new observation.
new_pred

array([0])

The output `array([0])` shows that the model predicts this employee won't leave the company.