<a href="https://colab.research.google.com/github/prateekkosta/Machine-Learning-Models/blob/main/Ada_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Load the HR dataset from the CSV file next to the notebook and preview
# the first five rows.
hr_data = pd.read_csv(filepath_or_buffer='HR_comma_sep.csv')
hr_data.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary,left
0,0.38,0.53,2,157,3,0,0,sales,low,1
1,0.8,0.86,5,262,6,0,0,sales,medium,1
2,0.11,0.88,7,272,4,0,0,sales,medium,1
3,0.72,0.87,5,223,5,0,0,sales,low,1
4,0.37,0.52,2,159,3,0,0,sales,low,1


**satisfaction_level**: Value between 0 and 1

**last_evaluation**: Value between 0 and 1

**number_project**: No. of projects the employee has worked on

**average_montly_hours**: Average hours an employee works per month (the column name is spelled this way in the dataset)

**time_spend_company**: No. of years spent in a company

**Work_accident**: Boolean value 0 or 1 indicating if an employee had a work accident

**promotion_last_5years**: Boolean value 0 or 1 indicating if an employee was promoted in the last 5 years

**left**: Boolean value 0 or 1 indicating if an employee left the company







In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the dataset.
display(hr_data.describe())

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.021268,0.238083
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.144281,0.425924
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [None]:
# List the column names of the loaded dataset.
hr_data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'department', 'salary', 'left'],
      dtype='object')

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   promotion_last_5years  14999 non-null  int64  
 7   department             14999 non-null  object 
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


Analysis of employees who left the company

In [None]:
# Headline counts: how many employees left, were promoted in the last five
# years, or had a work accident (each flag column is compared against 1).
count_left = np.count_nonzero(hr_data['left'] == 1)
count_promoted = np.count_nonzero(hr_data['promotion_last_5years'] == 1)
count_accident = np.count_nonzero(hr_data['Work_accident'] == 1)

# Share of leavers as a percentage of all rows in the dataset.
left_percent = count_left / len(hr_data) * 100

print('we observed number of %s people that left the company' % (count_left,))
print('we observed that percent of %s people left the company' % (left_percent,))

we observed number of 3571 people that left the company
we observed that percent of 23.80825388359224 people left the company


In [None]:
# Separate the target variable ('left') from the remaining feature columns.
features = hr_data.drop(columns=['left'])
target = hr_data['left']

Creating Dummy Variables

In [None]:
# One-hot encode the dataset with pandas' default get_dummies behaviour and
# print the resulting column names.
hr_data2 = pd.get_dummies(data=hr_data)
encoded = hr_data2.columns.tolist()

print(encoded)

['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'left', 'department_IT', 'department_RandD', 'department_accounting', 'department_hr', 'department_management', 'department_marketing', 'department_product_mng', 'department_sales', 'department_support', 'department_technical', 'salary_high', 'salary_low', 'salary_medium']


In [None]:
# Drop one dummy level per encoded categorical (salary and department);
# presumably these act as the reference levels so the remaining dummies
# are not redundant.
hr_data2 = hr_data2.drop(columns=['salary_medium', 'department_technical'])

In [None]:
# Preview the first rows of the encoded frame.
hr_data2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,salary_high,salary_low
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,0,0,1,0,0,1
1,0.8,0.86,5,262,6,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,0,1,0,0,0,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,0,0,1,0,0,1
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,0,0,1,0,0,1


In [None]:
# Build the model inputs without mutating hr_data2.
# The previous `hr_data2.pop('left')` removed the column in place (and X was
# just an alias of hr_data2), so re-running this cell raised a KeyError and
# any earlier display of hr_data2 went stale. Deriving new objects keeps the
# cell idempotent under re-runs.
# Note: hr_data2 itself still contains 'left' after this cell; use X (not
# hr_data2) as the feature matrix from here on.
X = hr_data2.drop(columns=['left'])  # feature matrix: every column except the label
Y = hr_data2['left']                 # target: 1 if the employee left, else 0

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

print('Training set has {} samples.'.format(len(X_train)))
print('testing set has {} samples.'.format(len(X_test)))

Training set has 10499 samples.
testing set has 4500 samples.


In [None]:
# Fit an AdaBoost ensemble of decision trees on the training split, then
# report accuracy on both the training and the test data.
# NOTE(review): the base tree has no depth limit and the recorded training
# accuracy is 1.0000 -- if the first tree already fits the training data
# perfectly, boosting may stop early and add little. Consider a shallow
# max_depth and confirm via len(ada_boost_model.estimators_).
ada_boost_model = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=20,
    random_state=100,
).fit(X_train, Y_train)

train_predict = ada_boost_model.predict(X_train)
test_predict = ada_boost_model.predict(X_test)

print('Accuracy on training data :{0:.4f}'.format(accuracy_score(Y_train, train_predict)))
print('accuracy on test data:{0:.4f}'.format(accuracy_score(Y_test, test_predict)))

Accuracy on training data :1.0000
accuracy on test data:0.9758
