In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
hr = pd.read_csv('D:/Downloaded rcodes and datasets/HR Analytics.csv')
hr.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,Sales,1,2,Life Sciences,1,2,Female,...,8,11,3,1,8,1,6,4,0,5
1,49,0,Travel_Frequently,Research & Development,8,1,Life Sciences,2,3,Male,...,1,23,4,4,10,3,10,7,1,7


In [3]:
bins = [0,20,40,60,80]
np.digitize([-10], bins = bins)
np.digitize([100], bins = bins)
np.digitize([20], bins = bins)
np.digitize([40], bins = bins)

array([3], dtype=int64)

In [4]:
hr['Age_Bin'] = np.digitize(hr['Age'], bins = bins)
hr[['Age', 'Age_Bin']].head()

#hr['Age_Bin'] = hr['Age_Bin'].replace({0:'B0',1:'B1',2:'B2',3:'B3',4:'B4'})
#hr[['Age', 'Age_Bin']].head()


Unnamed: 0,Age,Age_Bin
0,41,3
1,49,3
2,37,2
3,33,2
4,27,2


# Dictionary Comprehension

In [5]:
# Automating the replace process
# Dictionary comprehension

bin_labels = {x: 'B' + str(x) for x in range(6)}
hr['Age_Bin'] = hr['Age_Bin'].replace(bin_labels)

In [6]:
hr[['Age', 'Age_Bin']].head()

Unnamed: 0,Age,Age_Bin
0,41,B3
1,49,B3
2,37,B2
3,33,B2
4,27,B2


In [7]:
# Creating the bins
# Quantiles

bins = hr['Age'].quantile([0, 0.25, 0.5, 0.75, 1]).values
bin_labels = {x: 'B' + str(x) for x in range(len(bins))}
hr['Age_Bin'] = hr['Age_Bin'].replace(bin_labels)
hr[['Age','Age_Bin']].head()

Unnamed: 0,Age,Age_Bin
0,41,B3
1,49,B3
2,37,B2
3,33,B2
4,27,B2


In [8]:
hr._get_numeric_data().columns.drop('Attrition')

Index(['Age', 'DistanceFromHome', 'Education', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction',
       'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'TotalWorkingYears',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [9]:
def numeric2category(numeric_series):
    bins = numeric_series.quantile([0,0.25,0.5,0.75,1]).values
    bin_labels = {x: 'B' + str(x) for x in range(len(bins)+1)}
    
    categories_array = np.digitize(numeric_series, bins = bins)
    
    category_series = pd.Series(categories_array)
    
    return (category_series.replace(bin_labels))

# numeric2category(hr['MonthlyIncome'])

In [10]:
num_cols = hr._get_numeric_data().columns.drop('Attrition')
df_categories = pd.DataFrame(columns = num_cols)

for col in num_cols:
    df_categories[col] = numeric2category(hr[col])
df_categories.head()    

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,B3,B1,B2,B1,B2,B4,B5,B3,B4,B1,B4,B1,B2,B1,B3,B3,B2,B3
1,B4,B3,B1,B1,B3,B2,B2,B3,B2,B4,B5,B5,B3,B4,B4,B4,B3,B4
2,B3,B2,B2,B1,B5,B2,B3,B1,B4,B3,B4,B2,B2,B4,B1,B1,B2,B1
3,B2,B2,B4,B1,B5,B4,B3,B1,B2,B1,B4,B3,B2,B4,B3,B4,B4,B1
4,B1,B2,B1,B1,B1,B4,B2,B2,B5,B2,B4,B5,B2,B4,B1,B2,B3,B2


In [11]:
cat_cols = hr.columns.drop(num_cols)
cat_cols

Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'Age_Bin'],
      dtype='object')

In [12]:
df_final = pd.concat([df_categories, hr[cat_cols]], axis = 1)
df_final.head()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,...,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Age_Bin
0,B3,B1,B2,B1,B2,B4,B5,B3,B4,B1,...,B2,B3,1,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,B3
1,B4,B3,B1,B1,B3,B2,B2,B3,B2,B4,...,B3,B4,0,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,B3
2,B3,B2,B2,B1,B5,B2,B3,B1,B4,B3,...,B2,B1,1,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,B2
3,B2,B2,B4,B1,B5,B4,B3,B1,B2,B1,...,B4,B1,0,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,B2
4,B1,B2,B1,B1,B1,B4,B2,B2,B5,B2,...,B3,B2,0,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,B2


- Custom functions = user defined functions
- internal functions = built_in functions

# Naive-Baye's Classifier 

In [13]:
import numpy as np
import pandas as pd
hr = pd.read_csv('D:/Downloaded rcodes and datasets/HR Analytics.csv')

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train, test = train_test_split(hr, test_size = 0.3, random_state = 100) 

train_x = train.drop('Attrition', axis=1)
train_y = train['Attrition']
test_x = test.drop('Attrition', axis=1)
test_y = test['Attrition']
# Here we don't have to take dummies

In [16]:
pd.pivot_table(data=train, 
              index='Attrition',
              columns = 'Gender',
              values = 'MaritalStatus',
              aggfunc = np.count_nonzero)

Gender,Female,Male
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,357,505
1,62,105


In [17]:
pd.pivot_table(data=train, 
              index='Attrition',
              columns = 'Gender',
              values = 'MaritalStatus',
              aggfunc = len)

Gender,Female,Male
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,357,505
1,62,105


In [18]:
pd.crosstab(train['Attrition'],train['Gender'])

Gender,Female,Male
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1
0,357,505
1,62,105


In [19]:
pd.crosstab(train['Attrition'],train['MaritalStatus'])

MaritalStatus,Divorced,Married,Single
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,210,408,244
1,25,57,85


In [20]:
from sklearn.naive_bayes import GaussianNB


In [21]:
df_model = pd.get_dummies(df_final)

train, test = train_test_split(df_model, test_size = 0.3, random_state = 100) 

train_x = train.drop('Attrition', axis=1)
train_y = train['Attrition']
test_x = test.drop('Attrition', axis=1)
test_y = test['Attrition']

In [22]:
np.random.seed(100)
model = GaussianNB()
model.fit(train_x, train_y)

GaussianNB(priors=None)

In [23]:
test_pred = model.predict(test_x)

In [24]:
df_pred = pd.DataFrame({'actual':test_y,'predicted':test_pred})

In [25]:
df_pred['status'] = df_pred['actual'] == df_pred['predicted']

In [26]:
print(round(df_pred['status'].sum()/df_pred.shape[0] *100, 2) , '%')

41.04 %


# .....................................................................................................................

In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import secrets
mult_nb=MultinomialNB()

mult_nb.fit(train_x,train_y)

pred_nb = mult_nb.predict(test_x)

accur_nb = secrets.model_testing(pred_nb,test_y)
accur_nb

Unnamed: 0,Accuracy,Sensitivity,Specificity
0,77.324263,32.954545,88.385269


In [39]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

binom_nb = BernoulliNB()
binom_nb.fit(train_x, train_y)
pred_bin = binom_nb.predict(test_x)

accu = accuracy_score(pred_bin, test_y)
accu

0.7437641723356009

In [44]:
import secrets

In [45]:
secrets.model_report1(hr,"Attrition")

Unnamed: 0,Decision_tree,Random_forest,Adaboost
tp,18.0,13.0,16.0
tn,322.0,357.0,357.0
fp,49.0,14.0,14.0
fn,52.0,57.0,54.0
accuracy,77.0975,83.9002,84.5805
sensitivity,25.7143,18.5714,22.8571
specificity,86.7925,96.2264,96.2264
