In [None]:
import pandas as pd # package for high-performance, easy-to-use data 
#structures and data analysis
import numpy as np # fundamental package for scientific computing with Python
import matplotlib
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
import missingno as msno #checking missing values
color = sns.color_palette()
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.offline as offline
offline.init_notebook_mode()
from pylab import rcParams


from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, learning_curve, train_test_split
from sklearn.metrics import precision_score, roc_auc_score, recall_score, confusion_matrix, roc_curve, precision_recall_curve, accuracy_score

# import cufflinks and offline mode
import cufflinks as cf
cf.go_offline()

# from sklearn import preprocessing
# # Supress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the HR attrition CSV from the Kaggle input directory into a DataFrame.
data=pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')
# Preview the first five rows.
data.head(5)

In [None]:
data.info()
data.describe().T

# Observations:
1. The average age of employees at IBM is 39, which means while hiring, they prefer candidates with decent work experience and expect higher level of expertise.
2. The average salary hike for employees is 15%, with the maximum being 25%. With a decent salary hike in the organisation, employees tend to stay longer at the company and enjoy long-term benefits with job security. This suggests that IBM rewards its employees for their performance, which is proportional to employee satisfaction.
3. However, the average employee satisfaction stands at 2.7 out of 5.
4. Most of the employees who get into IBM have worked with 2 or 3 companies in the past.
5. On an average, an employee has worked at IBM for around 11 years and there seems to be an outlier - wherein an employee has worked for 38 years.
6. It takes around 2 years for an IBM employee to bag his/her next promotion at the workplace.

In [None]:
msno.bar(data, color = 'r', figsize = (10,8))   

In [None]:
# Encode the target as integers (Yes -> 1, No -> 0).
# The result is assigned back to the column instead of mutating through
# attribute access with inplace=True, which is unreliable under pandas
# copy-on-write. Values that are already 0/1 (cell re-run) are left as they are.
data['Attrition'] = data['Attrition'].replace({'Yes': 1, 'No': 0})

# Drop the columns the analysis treats as uninformative.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
data = data.drop(columns=['StandardHours',
                          'EmployeeCount',
                          'Over18',
                        ], errors='ignore')
data.head(5)

In [None]:
# Split the employees by the encoded target (non-zero = attrition, 0 = no attrition).
attrition = data[(data['Attrition'] != 0)]
no_attrition = data[(data['Attrition'] == 0)]
attrition_count = len(attrition)
no_attrition_count = len(no_attrition)

#COUNT
trace = go.Bar(x = (attrition_count, no_attrition_count), y = ['Yes_attrition', 'No_attrition'], orientation = 'h', opacity = 0.8, marker=dict(
        color=['gold', 'lightskyblue'],
        line=dict(color='#000000',width=1.5)))

layout = dict(title =  'Attrition Count')

fig = dict(data = [trace], layout=layout)
py.iplot(fig)

#PERCENTAGE
# Values are listed explicitly in the same order as the labels. The previous
# version relied on value_counts() happening to return the "No" class first.
trace = go.Pie(labels = ['No_attrition', 'Yes_attrition'], values = [no_attrition_count, attrition_count],
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['lightskyblue','gold'],
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Attrition Distribution')

fig = dict(data = [trace], layout=layout)
py.iplot(fig)

# Observations:
1. In the attrition distribution diagram, it can be seen that around 83.9% or 1233 (out of 1470 employees) don't think of leaving the organisation or are not at risk of losing their employment.
2. Around 16.1% or 237 (out of 1470 employees) are either thinking of leaving the organisation or are at the risk of losing their employment.

In [None]:
# Correlate numeric columns only: at this point the frame still holds
# object (string) columns, which DataFrame.corr() rejects in pandas >= 2.0.
corr = data.select_dtypes(include='number').corr()

# One figure only; the earlier extra plt.figure() call produced an empty plot.
fig, ax = plt.subplots(figsize=(20,20))

# Annotated heat map. seaborn labels each cell with the column names at the
# cell centres, so no manual xticks/yticks are set (ticks placed at integer
# positions land on the cell edges and shift the labels).
sns.heatmap(corr, cmap="RdYlGn", annot=True, fmt=".2f", ax=ax)
plt.show()

# Observations from co-relation matrix:
1. Age and Total working years seem to have a good correlation of around 68%. 
2. Job level and Monthly income have around 95% co-relation ,which is evident. It also has around 51% correlation with Age, 78% with Total Working years and 53% with years spent in the company.
3. Percent Salary hike and Performance Rating have around 77% correlation.
4. years at the company seem to have a strong 77% co-relation with Years in current role and Years with current manager.


# let's have a look at numerical and categorical types

In [None]:
data_num=data.select_dtypes(include='number')
data_num.head()

In [None]:
data_obj=data.select_dtypes(include='object')
data_obj.head()

# Visualisations 

# 1. What is the age range of employees at IBM ?

In [None]:
# Number of employees per age. The column is passed by keyword because a
# positional Series is interpreted as `data` by seaborn >= 0.12.
plt.subplots(figsize=(10,10))
sns.countplot(x='Age', data=data)

# 2. What is the relation between Age and monthly income ?

In [None]:
sns.scatterplot(x='Age',y='MonthlyIncome',data=data)

# 3. What is the relation between performance rating and attrition ?

In [None]:
sns.countplot(x='Attrition',hue='PerformanceRating',data=data)

# 4. What level of education do IBM employees generally have ?

In [None]:
# Number of employees per education level. The column is passed by keyword
# because a positional Series is interpreted as `data` by seaborn >= 0.12.
plt.subplots(figsize=(10,8))
sns.countplot(x='Education', data=data)

# 5. With how many companies have the employees worked in the past?

In [None]:
# Number of employees per count of previous employers. The column is passed by
# keyword because a positional Series is interpreted as `data` by seaborn >= 0.12.
plt.subplots(figsize=(10,8))
sns.countplot(x='NumCompaniesWorked', data=data)

# 6. How many employees receive what percent salary hike at IBM ?

In [None]:
# Number of employees per percent salary hike. The column is passed by keyword
# because a positional Series is interpreted as `data` by seaborn >= 0.12.
plt.subplots(figsize=(10,8))
sns.countplot(x='PercentSalaryHike', data=data)

# 7. Is business travelling a part of work life at IBM ?

In [None]:
plt.subplots(figsize=(6,8))
sns.countplot(x='BusinessTravel', data=data)

# 8. What is the relationship status of IBM employees in general ?

In [None]:
plt.subplots(figsize=(10,8))
sns.countplot(x='MaritalStatus', data=data)

# 50% of employees from each gender are Divorced 

In [None]:
print(data.groupby(['Gender','MaritalStatus'])['MaritalStatus'].count())
print(data.groupby('Gender')['Gender'].count())

# 9. What are the most active job roles at IBM ?


In [None]:
plt.figure(figsize=(8,8))
plt.pie(data['JobRole'].value_counts(),labels=data['JobRole'].value_counts().index,autopct='%.2f%%');
plt.title('Job Role Distribution',fontdict={'fontsize':22});

# 10. Does a particular Gender dominate a Job role ?

In [None]:
# Employees per job role, split by gender, on a 20x14 inch canvas.
fig, _ = plt.subplots(figsize=(20,14))
sns.countplot(data=data, x='JobRole', hue='Gender')
plt.title('Job Role Between Male and Female')

# 11. Which education field is commonly noticed amongst IBM employees?

In [None]:
plt.figure(figsize=(8,8))
plt.pie(data['EducationField'].value_counts(),labels=data['EducationField'].value_counts().index,autopct='%.2f%%')

# 12. Which department has maximum employees employed with them ?

In [None]:
plt.subplots(figsize=(10,8))
sns.countplot(x='Department', data=data)

# 13. Which department witnesses maximum Attrition ?

In [None]:
plt.subplots(figsize=(10,8))
sns.countplot(x='Department', hue='Attrition',data=data)

# 14. Which age range demands what kind of Salary hike ?

In [None]:
plt.figure(figsize=(12, 9))
sns.boxplot(x='PercentSalaryHike',y='Age',data=data,palette='winter')

# 15. What is the monthly income as per the job role ?

In [None]:
sns.set(font_scale=1)
sns.boxplot(x='JobRole',y='MonthlyIncome',data=data)
plt.xticks(rotation=90)

# 16. What is the monthly income as per the Education field ?

In [None]:
sns.boxplot(x='EducationField',y='MonthlyIncome',data=data)
plt.xticks(rotation=90)

# Checking co-relation of Attrition with other attributes :

In [None]:
# Correlation of every numeric column with the encoded target, strongest first.
# Restricting to numeric dtypes avoids the error DataFrame.corr() raises on
# string columns in pandas >= 2.0.
data.select_dtypes(include='number').corr()['Attrition'].sort_values(ascending=False)

# Critical attributes w.r.t Job role

In [None]:
# Mean of selected attributes per job role. Several columns must be selected
# from a groupby with a list (double brackets); the tuple form raises in pandas >= 2.0.
data.groupby(by='JobRole')[["PercentSalaryHike","YearsAtCompany","TotalWorkingYears","YearsInCurrentRole","WorkLifeBalance"]].mean()

# Encoding the categorical columns  

In [None]:
data.head()

In [None]:
data_obj=data.select_dtypes(include='object')
data_obj.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le= LabelEncoder()

In [None]:
# Collect the object-dtype columns that have at most 50 distinct values.
categorical_col = [
    name
    for name in data.columns
    if data[name].dtype == object and data[name].nunique() <= 50
]
print(categorical_col)

In [None]:
# Replace every categorical column with integer codes.
# The single encoder `le` is re-fitted on each column, so after the loop it only
# holds the class mapping of the last column processed.
for col in categorical_col:
    data[col]=le.fit_transform(data[col])

In [None]:
data.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X= data.drop('Attrition',axis=1)
y=data['Attrition']

In [None]:
# 70/30 train/test split with seed 101.
# NOTE(review): the next cell splits again with random_state=33 and overwrites
# these four variables, so this split is not the one the models below are trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 33)
print("Train Set Size : ",X_train.shape)
print("Train Target Set Size : ",y_train.shape)
print("Test  Set Size : ",X_test.shape)
print("Test  Target Set Size : ",y_test.shape)

# Decision Tree Classifier :

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model= DecisionTreeClassifier()

In [None]:
model.fit(X_train,y_train)

In [None]:
dir(model) #to select which all parameters are important to us

In [None]:
pred= model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(classification_report(y_test,pred))

In [None]:
print(confusion_matrix(y_test,pred))

NOW, WE WILL BE TUNING THE HYPERPARAMETERS OF DECISION TREE USING RANDOMIZED SEARCH CROSS VALIDATION METHOD FOR IMPROVING THE ACCURACY OF THE MODEL.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
params={"criterion":("gini", "entropy"),
        "splitter":("best", "random"), 
        "max_depth":(list(range(1, 20))), 
        "min_samples_split":[2, 3, 4], 
        "min_samples_leaf":list(range(1, 20))}

In [None]:
# Randomized search over the decision-tree grid: 100 sampled candidates,
# 5-fold CV, all cores. random_state fixes which candidates are sampled so the
# search is reproducible across runs.
tree_randomized= RandomizedSearchCV(model,params,n_iter=100,n_jobs=-1,cv=5,verbose=2,random_state=42)

In [None]:
tree_randomized.fit(X_train,y_train)

In [None]:
tree_randomized.best_estimator_

In [None]:
# Decision tree with the hyperparameters copied from an earlier randomized
# search run. Only the non-default settings are passed: the removed arguments
# `min_impurity_split` and `presort` raise TypeError on current scikit-learn,
# and every other argument that was listed simply repeated the library default.
model=DecisionTreeClassifier(criterion='entropy',
                       max_depth=4,
                       min_samples_leaf=11, min_samples_split=4,
                       random_state=42, splitter='random')

In [None]:
model.fit(X_train,y_train)
pred=model.predict(X_test)

In [None]:
print(classification_report(y_test,pred))

In [None]:
print(confusion_matrix(y_test,pred))

# Random Forest Classifier :

In [None]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

In [None]:
print(RandomForestClassifier())
print(RandomForestRegressor()) #check HP we can tune

#n_estimators (how many individual trees can be built) and max_depth (how deep each tree can go) are the hyperparameters to consider

In [None]:
from sklearn.ensemble import RandomForestClassifier 
import pandas as pd
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=FutureWarning) #to let us that the default value for gridsearch is going to change in future release
warnings.filterwarnings('ignore', category=DeprecationWarning) #to let us know tyhe beahviour of gridsearchcv within test

In [None]:
def print_results(results):
    """Print the best parameters and a per-candidate score summary.

    `results` must expose `best_params_` and `cv_results_` (with the keys
    'mean_test_score', 'std_test_score' and 'params'), as a fitted
    GridSearchCV does. Each candidate is printed as
    "<mean> (+/-<2*std>) for <params>", with both numbers rounded to 3 decimals.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    cv_table = results.cv_results_
    rows = zip(cv_table['mean_test_score'], cv_table['std_test_score'], cv_table['params'])
    for avg_score, score_std, candidate in rows:
        spread = round(score_std * 2, 3)
        print(f'{round(avg_score, 3)} (+/-{spread}) for {candidate}')

In [None]:
# Base forest with a fixed seed so that the grid-search ranking (and therefore
# the "best" parameters reported below) is reproducible between runs.
rf = RandomForestClassifier(random_state=42)
parameters = {
    'n_estimators': [5,50,250], 'max_depth':[2,4,8,16,32,None] # None lets each tree grow as deep as it wants
}

cv = GridSearchCV(rf, parameters, cv=5) # (model object, parameter dictionary, number of CV folds)
cv.fit(X_train,y_train.values.ravel()) # pass the labels as a flat 1-D array

print_results(cv)

In [None]:
cv.best_estimator_

In [None]:
# Refit a forest with hard-coded hyperparameters and evaluate it on the test split.
# NOTE(review): n_estimators=50 / max_depth=32 are presumably copied from an earlier
# grid-search run; confirm against cv.best_params_ or use cv.best_estimator_ directly.
rf= RandomForestClassifier(n_estimators=50,max_depth=32)
rf.fit(X_train,y_train)
rf_pred= rf.predict(X_test)
print(classification_report(y_test,rf_pred))

In [None]:
print(confusion_matrix(y_test,rf_pred))

# Support Vector Machine 

In [None]:
from sklearn.svm import SVC

In [None]:
SVC()# we only select ones that are imp - C and kernel

In [None]:
dir(SVC)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 101)

In [None]:
# Fit an SVC with default settings and report train/test accuracy in percent.
clf = svm.SVC()
clf.fit(X_train, y_train)

train_pct = clf.score(X_train, y_train) * 100
print(f'Accuracy of SVC on training set: {train_pct:.2f}')

test_pct = clf.score(X_test, y_test) * 100
print(f'Accuracy of SVC on test set: {test_pct:.2f}')

# Applying Linear SVM

In [None]:
# Linear-kernel SVC (C=1): fit, then report train/test accuracy in percent.
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo')
linear.fit(X_train, y_train)

accuracy_lin_train = linear.score(X_train, y_train)
accuracy_lin_test = linear.score(X_test, y_test)

print('Accuracy Linear Kernel on training set:', accuracy_lin_train*100)
print('Accuracy Linear Kernel on testing set:', accuracy_lin_test*100)

# Applying RBF SVM

In [None]:
# RBF-kernel SVC (gamma=0.1, C=0.1): fit, then report train/test accuracy in percent.
rbf = svm.SVC(kernel='rbf', gamma=0.1, C=0.1, decision_function_shape='ovo')
rbf.fit(X_train, y_train)

accuracy_rbf_train = rbf.score(X_train, y_train)
accuracy_rbf_test = rbf.score(X_test, y_test)

print('Accuracy Radial Basis Kernel on training set:', accuracy_rbf_train*100)
print('Accuracy Radial Basis Kernel on testing set:', accuracy_rbf_test*100)

# Grid Search 

In [None]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [None]:
# May take awhile!
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
grid_predictions = grid.predict(X_test)

In [None]:
print(confusion_matrix(y_test,grid_predictions))

In [None]:
print(classification_report(y_test,grid_predictions))

# Henceforth, we will be choosing only a few important attributes and check them on other algorithms

In [None]:
data.columns

In [None]:
# Columns left out of the reduced feature set (the target is excluded as well).
unused_features = [
    'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
    'Education', 'EmployeeNumber', 'Gender', 'HourlyRate', 'JobInvolvement',
    'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyRate',
    'NumCompaniesWorked', 'OverTime', 'RelationshipSatisfaction',
    'StockOptionLevel', 'TrainingTimesLastYear',
]
X = data.drop(columns=unused_features)
y = data['Attrition']

In [None]:
data.head()

In [None]:
X.columns

In [None]:
# 70/30 train/test split of the reduced feature set with seed 101.
# NOTE(review): the next cell repeats this exact split (same test_size and
# random_state), so one of the two calls is redundant.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 101)
print("Train Set Size : ",X_train.shape)
print("Train Target Set Size : ",y_train.shape)
print("Test  Set Size : ",X_test.shape)
print("Test  Target Set Size : ",y_test.shape)

In [None]:
# Standardize all features (zero mean, unit variance) so they share a common scale.
# The scaler is fitted on the training split ONLY and then applied to the test
# split. Re-fitting on the test data scales the two splits with different
# statistics and leaks test-set information into the evaluation.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train))
X_test  = pd.DataFrame(sc.transform(X_test))

# Logistic Regression:

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.linear_model import LogisticRegression

# The 'saga' solver supports the l1, l2 and elasticnet penalties. With the
# default 'lbfgs' solver every l1/elasticnet candidate fails to fit and is
# silently scored as NaN. max_iter is raised so saga can converge.
lr = LogisticRegression(random_state = 42, solver = 'saga', max_iter = 5000)

# Parameter grid for Logistic Regression.
# C is the INVERSE of the regularization strength (smaller C = stronger penalty).
# elasticnet additionally needs l1_ratio, so it gets its own sub-grid.
params = [
    {
        'penalty' : ['l1','l2'],
        'C' : [0.01,0.1,1,10,100],
    },
    {
        'penalty' : ['elasticnet'],
        'C' : [0.01,0.1,1,10,100],
        'l1_ratio' : [0.25,0.5,0.75],
    },
]

log_reg = GridSearchCV(lr,param_grid = params,cv = 10)
log_reg.fit(X_train,y_train)
log_reg.best_params_

In [None]:
# Make Prediction of test data 
y_pred = log_reg.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
plt.rcParams['figure.figsize'] = (6,4)

# Class order used for both the matrix and the tick labels (0 first, then 1).
# The earlier version set ticks with the reversed list [1,0] before drawing the
# heat map, which then replaced them.
class_names = [0,1]
cm = pd.DataFrame(confusion_matrix(y_test,y_pred,labels = class_names),
                  index = class_names, columns = class_names)

#create a heat map
sns.heatmap(cm, annot = True, cmap = 'BuGn_r',
           fmt = 'g')
plt.tight_layout()
plt.title('Confusion matrix for Logistic Regression  Model', y = 1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (10,6)

# Predicted probability of the positive class (attrition = 1).
y_proba = log_reg.predict_proba(X_test)[:,1]

# AUC must be computed from scores/probabilities. Using the hard 0/1
# predictions collapses the ROC curve to a single operating point.
auc_log_reg = roc_auc_score(y_test,y_proba)
print("roc_auc_score value for log reg is : ",auc_log_reg)

# True and false positive rates over all thresholds.
fpr_log_reg,tpr_log_reg,thershold_log_reg_model = roc_curve(y_test,y_proba)
plt.plot(fpr_log_reg,tpr_log_reg)
plt.plot([0,1],ls='--')  # chance diagonal
plt.title('Reciever Operating Characterstic For Logistic Regregression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state = 42)


# Parameter grid for DecisionTreeClassifier.
# "auto" is omitted from max_features: for classification trees it was an alias
# of "sqrt" (a duplicate candidate) and it has been removed from scikit-learn.

params = {
             'criterion'    : ["gini", "entropy"],
             'max_features' : ["sqrt", "log2"],
              'min_samples_split' : list(range(4,16)),
              'min_samples_leaf' : list(range(4,16))
         }

dt_clf = GridSearchCV(dt,param_grid = params,cv = 10)
dt_clf.fit(X_train,y_train)
dt_clf.best_params_

In [None]:
# Make Prediction of test data 
y_pred = dt_clf.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
plt.rcParams['figure.figsize'] = (6,4)

# Class order used for both the matrix and the tick labels (0 first, then 1).
# The earlier version set ticks with the reversed list [1,0] before drawing the
# heat map, which then replaced them.
class_names = [0,1]
cm = pd.DataFrame(confusion_matrix(y_test,y_pred,labels = class_names),
                  index = class_names, columns = class_names)

#create a heat map
sns.heatmap(cm, annot = True, cmap = 'BuGn_r',
           fmt = 'g')
plt.tight_layout()
plt.title('Confusion matrix for DecisionTreeClassifier   Model', y = 1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (10,6)

# Predicted probability of the positive class (attrition = 1).
y_proba = dt_clf.predict_proba(X_test)[:,1]

# AUC must be computed from scores/probabilities, not from hard 0/1 predictions.
dt_clf_auc_score = roc_auc_score(y_test,y_proba)
# The message now names the model being evaluated (it previously said "log reg").
print("roc_auc_score value for DecisionTreeClassifier is : ",dt_clf_auc_score)

# True and false positive rates over all thresholds.
fpr_dt_clf,tpr_dt_clf,thershold_dt_clf_model = roc_curve(y_test,y_proba)
plt.plot(fpr_dt_clf,tpr_dt_clf)
plt.plot([0,1],ls='--')        # chance diagonal
plt.plot([0,0],[1,0],c='.5')   # left edge of the ROC box
plt.plot([1,1],c='.5')         # top edge of the ROC box
plt.title('Reciever Operating Characterstic For DecisionTreeClassifier ')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Random Forest Classifier:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded forest of 150 trees; min_samples_split / min_samples_leaf limit how far each tree can split.
rf_clf = RandomForestClassifier(n_estimators = 150,min_samples_split = 20,min_samples_leaf = 5,random_state = 42)
rf_clf.fit(X_train,y_train)
# Hard class predictions for the test split (the next cell computes them again).
y_pred = rf_clf.predict(X_test)


In [None]:
# Make Prediction of test data 
y_pred = rf_clf.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
plt.rcParams['figure.figsize'] = (6,4)

# Class order used for both the matrix and the tick labels (0 first, then 1).
# The earlier version set ticks with the reversed list [1,0] before drawing the
# heat map, which then replaced them.
class_names = [0,1]
cm = pd.DataFrame(confusion_matrix(y_test,y_pred,labels = class_names),
                  index = class_names, columns = class_names)

#create a heat map
sns.heatmap(cm, annot = True, cmap = 'BuGn_r',
           fmt = 'g')
plt.tight_layout()
plt.title('Confusion matrix for RandomForestClassifier   Model', y = 1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
plt.rcParams['figure.figsize'] = (10,6)

# Predicted probability of the positive class from the RANDOM FOREST.
# The earlier version called dt_clf here, so this section re-plotted the
# decision tree's ROC curve under the random forest's name.
y_proba = rf_clf.predict_proba(X_test)[:,1]

# AUC must be computed from scores/probabilities, not from hard 0/1 predictions.
rf_auc_score = roc_auc_score(y_test,y_proba)

# The message now names the model being evaluated (it previously said "log reg").
print("roc_auc_score value for RandomForestClassifier is : ",rf_auc_score)

# True and false positive rates over all thresholds.
fpr_rf_clf,tpr_rf_clf,thershold_rf_clf_model = roc_curve(y_test,y_proba)
plt.plot(fpr_rf_clf,tpr_rf_clf)
plt.plot([0,1],ls='--')        # chance diagonal
plt.plot([0,0],[1,0],c='.5')   # left edge of the ROC box
plt.plot([1,1],c='.5')         # top edge of the ROC box
plt.title('Reciever Operating Characterstic For RandomForestClassifier ')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# KNN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_estimator = KNeighborsClassifier(n_jobs = -1)

# Parameter grid. Both n_neighbors and leaf_size must be >= 1, so the ranges
# start at 1; range(15) included 0, whose candidates can never be fitted.

params = {
             "n_neighbors" : list(range(1,15)),
               'p' : [1,2] ,
              'leaf_size' : list(range(1,15)),

          }
# `knn` is the fitted grid search that the following cells use for prediction.
knn = GridSearchCV(knn_estimator,param_grid = params, cv = 5)
knn.fit(X_train,y_train)
knn.best_params_

In [None]:
# Make Prediction of test data 
y_pred = knn.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
plt.rcParams['figure.figsize'] = (6,4)

# Class order used for both the matrix and the tick labels (0 first, then 1).
# The earlier version set ticks with the reversed list [1,0] before drawing the
# heat map, which then replaced them.
class_names = [0,1]
cm = pd.DataFrame(confusion_matrix(y_test,y_pred,labels = class_names),
                  index = class_names, columns = class_names)

#create a heat map
sns.heatmap(cm, annot = True, cmap = 'BuGn_r',
           fmt = 'g')
plt.tight_layout()
plt.title('Confusion matrix for KNN Algorithm   Model', y = 1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
plt.rcParams['figure.figsize'] = (10,6)

# Predicted probability of the positive class (attrition = 1).
y_proba = knn.predict_proba(X_test)[:,1]

# AUC must be computed from scores/probabilities, not from hard 0/1 predictions.
knn_auc_score = roc_auc_score(y_test,y_proba)


# The message now names the model being evaluated (it previously said "log reg").
print("roc_auc_score value for KNN is : ",knn_auc_score)

# True and false positive rates over all thresholds.
fpr_KNN,tpr_KNN,thershold_KNN_model = roc_curve(y_test,y_proba)
plt.plot(fpr_KNN,tpr_KNN)
plt.plot([0,1],ls='--')        # chance diagonal
plt.plot([0,0],[1,0],c='.5')   # left edge of the ROC box
plt.plot([1,1],c='.5')         # top edge of the ROC box
plt.title('Reciever Operating Characterstic For KNN Algorithm ')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Overlay the ROC curves of the four classifiers on one figure.
plt.figure(figsize=(10,6))
plt.title('Reciever Operating Characterstic Curve')

roc_curves = [
    (fpr_log_reg, tpr_log_reg, 'LogisticRegression'),
    (fpr_dt_clf, tpr_dt_clf, 'DecisionTreeClassifier'),
    (fpr_rf_clf, tpr_rf_clf, 'RandomForestClassifier'),
    (fpr_KNN, tpr_KNN, 'KNearestNeighbors '),
]
for false_pos, true_pos, model_label in roc_curves:
    plt.plot(false_pos, true_pos, label=model_label)

# Reference lines: chance diagonal, then the left and top edges of the ROC box.
plt.plot([0,1], ls='--')
plt.plot([0,0], [1,0], c='.5')
plt.plot([1,1], c='.5')

plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.legend()
plt.show()

In [None]:
print("Area Under Curve Score values for Different algorithms : ")
print("LogisticRegression          : ",auc_log_reg)
print("DecisionTreeClassfier       : ",dt_clf_auc_score)
print("RandomForest Classifier     : ",rf_auc_score)
print("KnearestNeighborsClassifier : ",knn_auc_score)

# SVM:

In [None]:
from sklearn.svm import SVC

In [None]:
# Standardize every column except the target and use the result as the feature matrix.
# NOTE(review): the scaler is fitted on the full dataset before any train/test split,
# so test rows contribute to the scaling statistics; consider fitting on the training
# split only. X becomes a NumPy array here (fit_transform output), not a DataFrame.
scaler=StandardScaler()
scaled_data=scaler.fit_transform(data.drop('Attrition',axis=1))
X=scaled_data
y=data['Attrition']

In [None]:
# 70/30 split of the scaled data with seed 42.
# NOTE(review): the lower-case x_train / x_test are not referenced by the cells
# visible after this one, and y_train / y_test are overwritten by the next split,
# so this cell looks redundant; confirm before removing.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
SVC()# we only select ones that are imp - C and kernel

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:

# Default SVC on the scaled features; report train/test accuracy in percent.
clf = svm.SVC()
clf.fit(X_train, y_train)

train_pct = clf.score(X_train, y_train) * 100
print(f'Accuracy of SVC on training set: {train_pct:.2f}')

test_pct = clf.score(X_test, y_test) * 100
print(f'Accuracy of SVC on test set: {test_pct:.2f}')


# Grid Search

In [None]:
from sklearn.svm import SVC
# Baseline SVC with default settings. It is bound to its own name rather than
# `svm`, so the imported `sklearn.svm` module is not shadowed by an estimator.
svc_default = SVC()
svc_default.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.1,1,10,100], 'gamma':[1,0.1,0.01,0.001]}

In [None]:
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose=3)
grid.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np



X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=150)

# SVC with C=1 and gamma=0.01; report train/test accuracy in percent.
clf = svm.SVC(C=1, gamma=0.01)
clf.fit(X_train, y_train)

train_pct = clf.score(X_train, y_train) * 100
print(f'Accuracy of SVC on training set: {train_pct:.2f}')

test_pct = clf.score(X_test, y_test) * 100
print(f'Accuracy of SVC on test set: {test_pct:.2f}')

# Grid Search on Linear SVM

In [None]:
from sklearn.svm import SVC
# Baseline linear-kernel SVC. It is bound to its own name rather than `svm`,
# so the imported `sklearn.svm` module is not shadowed by an estimator.
svc_linear = SVC(kernel='linear')
svc_linear.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid for the linear kernel: only C is searched. gamma has no effect on a
# linear kernel, so including it only repeated every fit four times.
param_grid = {'C':[0.1,1,10,100]}

In [None]:
grid = GridSearchCV(SVC(kernel='linear'), param_grid, refit = True, verbose=3)
grid.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np



X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=150)

# Linear-kernel SVC with C=1; report train/test accuracy in percent.
clf = svm.SVC(kernel='linear', C=1, gamma=0.01)
clf.fit(X_train, y_train)

train_pct = clf.score(X_train, y_train) * 100
print(f'Accuracy of SVC on training set: {train_pct:.2f}')

test_pct = clf.score(X_test, y_test) * 100
print(f'Accuracy of SVC on test set: {test_pct:.2f}')

# Grid Search on rbf SVM

In [None]:
from sklearn.svm import SVC
# Baseline RBF-kernel SVC. It is bound to its own name rather than `svm`,
# so the imported `sklearn.svm` module is not shadowed by an estimator.
svc_rbf = SVC(kernel='rbf')
svc_rbf.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.1,1,10,100], 'gamma':[1,0.1,0.01,0.001]}

In [None]:
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, refit = True, verbose=3)
grid.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np



X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=150)

# RBF-kernel SVC with C=1 and gamma=0.01; report train/test accuracy in percent.
clf = svm.SVC(kernel='rbf', C=1, gamma=0.01)
clf.fit(X_train, y_train)

train_pct = clf.score(X_train, y_train) * 100
print(f'Accuracy of SVC on training set: {train_pct:.2f}')

test_pct = clf.score(X_test, y_test) * 100
print(f'Accuracy of SVC on test set: {test_pct:.2f}')

# Apply Gram Matrix

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np



X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=150)

In [None]:
clf = svm.SVC(kernel='precomputed')

In [None]:
# Gram matrix of the training samples: entry (i, j) is the dot product of training
# rows i and j, i.e. a linear kernel evaluated by hand.
gram_train = np.dot(X_train, X_train.T)
# With kernel='precomputed' the classifier is fitted on the Gram matrix instead of the raw features.
clf.fit(gram_train, y_train)

In [None]:
gram_test = np.dot(X_test, X_train.T)
clf.predict(gram_test)

In [None]:
# Accuracy of the precomputed-kernel SVC on both splits, in percent.
print('Accuracy of SVC on training set: {:.2f}'.format(clf.score(gram_train, y_train) * 100))
# The second line reports the TEST split; it was previously mislabelled as "training set".
print('Accuracy of SVC on test set: {:.2f}'.format(clf.score(gram_test, y_test) * 100))

In [None]:
plt.rcParams['figure.figsize'] = (6,4)

# Predictions of the precomputed-kernel SVC for the current test split.
# The earlier version plotted a stale `y_pred` left over from a previous model
# and a different split, under a "Logistic Regression" title.
gram_pred = clf.predict(gram_test)

# Class order used for both the matrix and the tick labels (0 first, then 1).
class_names = [0,1]
cm = pd.DataFrame(confusion_matrix(y_test,gram_pred,labels = class_names),
                  index = class_names, columns = class_names)

#create a heat map
sns.heatmap(cm, annot = True, cmap = 'BuGn_r',
           fmt = 'g')
plt.tight_layout()
plt.title('Confusion matrix for SVM (precomputed Gram matrix) Model', y = 1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# Gradient Boosting Classifier:

In [None]:
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor

In [None]:
print(GradientBoostingClassifier())
print(GradientBoostingRegressor())

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore', category=FutureWarning) #to let us that the default value for gridsearch is going to change in future release
warnings.filterwarnings('ignore', category=DeprecationWarning) #to let us know tyhe beahviour of gridsearchcv within test


In [None]:
# Grid search over gradient-boosting hyperparameters.
# The grid has 4 x 5 x 5 = 100 candidates, each fitted on 5 folds (500 fits plus the final refit).
# NOTE(review): learning rates of 10 and 100 are far outside the usual (0, 1] range and
# presumably produce unstable fits; consider dropping them to cut the run time.
# NOTE(review): the estimator has no random_state, so results may vary between runs.
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5,50,250,500], 'max_depth':[2,4,8,16,32],'learning_rate': [0.01,0.1,1,10,100]
}


cv = GridSearchCV(gb, parameters, cv=5) # (model object, parameter dictionary, number of CV folds)
cv.fit(X_train,y_train.values.ravel()) # pass the labels as a flat 1-D array

print_results(cv)

In [None]:
cv.best_estimator_