In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [3]:
hr_url = 'https://raw.githubusercontent.com/skathirmani/datasets/master/HR%20Analytics.csv'
hr = pd.read_csv(hr_url)

hr_dummies = pd.get_dummies(hr)

train, test = train_test_split(hr_dummies, test_size = 0.3, random_state = 100)

train_y = train['Attrition']
test_y = test['Attrition']
train_x = train.drop('Attrition', axis = 1)
test_x = test.drop('Attrition', axis = 1)

from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(random_state=100)
model_rf.fit(train_x, train_y)

test_pred = model_rf.predict(test_x)
df_pred = pd.DataFrame({'actual' : test_y,
                       'predicted' : test_pred})
df_pred['pred_status'] = df_pred['actual'] == df_pred['predicted']
df_pred['pred_status'].sum() / df_pred.shape[0] * 100



85.71428571428571

In [12]:
df_pred['pred_status'].value_counts()

True     378
False     63
Name: pred_status, dtype: int64

In [16]:
## True Positive
tp = df_pred[(df_pred['predicted'] == 1) & (df_pred['actual']==1)].shape[0]
## True Negative
tn = df_pred[(df_pred['predicted'] == 0) & (df_pred['actual']==0)].shape[0]
## False Positive
fp = df_pred[(df_pred['predicted'] == 1) & (df_pred['actual']==0)].shape[0]
## False Negative
fn = df_pred[(df_pred['predicted'] == 0) & (df_pred['actual']==1)].shape[0]

print('True Positive: %d' % tp)
print('True Negative: %d' % tn)
print('False Positive %d' % fp)
print('False Negative %d' % fn)

True Positive: 11
True Negative: 367
False Positive 4
False Negative 59


In [18]:
from sklearn.metrics import confusion_matrix

confusion_matrix(df_pred['actual'], df_pred['predicted'])

array([[367,   4],
       [ 59,  11]], dtype=int64)

In [20]:
## ravel is used for flatening. the below order is followed in case of binary 

tn, fp, fn, tp = confusion_matrix(df_pred['actual'], df_pred['predicted']).ravel()
print(tn, fp, fn, tp)

367 4 59 11


In [21]:
from sklearn.metrics import classification_report
print(classification_report(df_pred['actual'], df_pred['predicted']))

             precision    recall  f1-score   support

          0       0.86      0.99      0.92       371
          1       0.73      0.16      0.26        70

avg / total       0.84      0.86      0.82       441



# ADAPTIVE BOOST (AdaBoost)

In [33]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=100)
model.fit(train_x, train_y)

test_pred = model.predict(test_x)
tn, fp, fn, tp = confusion_matrix(test_y, test_pred).ravel()
tn, fp, fn, tp

(352, 19, 43, 27)

In [34]:
accuracy = (tp+tn) / (tp+tn+fp+fn) * 100
accuracy

85.94104308390023

In [35]:
sensitivity = tp / (tp+fn) * 100
sensitivity

38.57142857142858

In [36]:
specificity = tn / (fp+tn) * 100
specificity

94.87870619946092

# Weighted Sampling

In [41]:
pd.Series(np.random.randint(1,100, 100)).value_counts().shape

(64,)

In [42]:
np.random.choice([1,2,3], p=[0.1, 0.3, 0.6])

3

In [50]:
random_samples = []
for i in range(10000):
    sample = np.random.choice([1,2,3], p=[0.1, 0.3, 0.6])
    random_samples.append(sample)
pd.Series(random_samples).value_counts()

3    6041
2    3043
1     916
dtype: int64

# Classification Error Rate

In [61]:
sample_dt1 = train.iloc[np.random.randint(1, train.shape[0], train.shape[0])]

all_cols = np.array(train.drop('Attrition', axis = 1).columns)
cols_position = np.random.randint(1, len(all_cols), 3)
random_cols = all_cols[cols_position]

sample_dt1_x = sample_dt1[random_cols]
sample_dt1_y = sample_dt1['Attrition']
random_cols


array(['StandardHours', 'JobRole_Research Director', 'StandardHours'],
      dtype=object)

In [60]:
pd.Series(sample_dt1.index).value_counts().head()

1196    6
1223    6
1050    5
843     5
449     5
dtype: int64

In [63]:
dt1 = DecisionTreeClassifier(max_depth=1)
dt1.fit(sample_dt1_x, sample_dt1_y)
test_pred = dt1.predict(test_x[random_cols])

df_pred = pd.DataFrame({'actual': test_y, 'predicted': test_pred})
df_pred

Unnamed: 0,actual,predicted
880,0,0
152,0,0
1466,0,0
1084,0,0
1086,0,0
1392,0,0
57,0,0
956,0,0
1400,0,0
1175,0,0


In [None]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(test_y, test_pred).ravel()
