In [1]:
from sklearn.ensemble import GradientBoostingClassifier  
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn import metrics                         
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [2]:
#Read the train file:

file=pd.read_csv("dataset.csv")
print(file.head())

   Age  Attrition     BusinessTravel  DailyRate              Department  \
0   41          1      Travel_Rarely       1102                   Sales   
1   49          0  Travel_Frequently        279  Research & Development   
2   37          1      Travel_Rarely       1373  Research & Development   
3   33          0  Travel_Frequently       1392  Research & Development   
4   27          0      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

  ...  RelationshipSatisfaction StockOptionLevel  TotalWorkingYears 

In [3]:
#Remove the unneccessary features and target variable

a1=file.drop("Attrition",axis=1)
a2=a1.drop("EmployeeCount",axis=1)
a3=a2.drop("EmployeeNumber",axis=1)
a4=a3.drop("ID",axis=1)

#Convert the categorical features into numerical ones

df = pd.get_dummies(a4) 
print(df)

      Age  DailyRate  DistanceFromHome  Education  EnvironmentSatisfaction  \
0      41       1102                 1          2                        2   
1      49        279                 8          1                        3   
2      37       1373                 2          2                        4   
3      33       1392                 3          4                        4   
4      27        591                 2          1                        1   
5      32       1005                 2          2                        4   
6      59       1324                 3          3                        3   
7      30       1358                24          1                        4   
8      38        216                23          3                        4   
9      36       1299                27          3                        3   
10     35        809                16          3                        1   
11     29        153                15          2               

In [4]:
#Step-1:

#Tune the n_estimators:
#n_jobs: no of jobs running in parallal

p_tuning1 = {'n_estimators':range(20,1000,10)}

#Fit a Gradient Boosting Classifier:

gs1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,min_samples_leaf=50,max_depth= 5, min_samples_split=550,max_features='sqrt',subsample=0.8,random_state=42), 
param_grid = p_tuning1, scoring='roc_auc',n_jobs=5,cv=5)

gs1.fit(df[0:801],file["Attrition"][0:801])

print(f"The best metric is obtained at {gs1.best_params_} and the value of ROC-AUC is {gs1.best_score_}")

The best metric is obtained at {'n_estimators': 20} and the value of ROC-AUC is 0.5


In [5]:
#Step-2
#Tune max_depth and min_samples_split simultaneously:

p_tuning2 = {'max_depth':range(5,51,5), 'min_samples_split':range(200,1001,200)}

#Fit a Gradient Boosting Classifier:

gs2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=20,min_samples_leaf=50,max_features='sqrt',subsample=0.8,random_state=42), 
param_grid = p_tuning2, scoring='roc_auc',n_jobs=5, cv=5)

gs2.fit(df[0:801],file["Attrition"][0:801])

print(f"The best metric is obtained at {gs2.best_params_} and the value of the ROC-AUC is is {gs2.best_score_}")

The best metric is obtained at {'max_depth': 5, 'min_samples_split': 200} and the value of the ROC-AUC is is 0.7801015230544108


In [6]:
#Step-3
#Tune min_samples leaf

p_tuning3 = {'min_samples_leaf':range(1,1001,50),'subsample':[0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95]}

gs3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=330,max_depth=5,max_features=5, subsample=0.8, random_state=42,min_samples_split=200), 
param_grid = p_tuning3, scoring='roc_auc',n_jobs=5,cv=5)

#Fit the classifier

gs3.fit(df[0:801],file["Attrition"][0:801])

print(f"The best metric is obtained at {gs3.best_params_} and the value of the ROC-AUC is is {gs3.best_score_}")

The best metric is obtained at {'min_samples_leaf': 1, 'subsample': 0.6} and the value of the ROC-AUC is is 0.838703778450148


In [7]:
#Step-4:
#Tune max_features and learning_rate

p_tuning4 = {'max_features':range(5,21,2),'learning_rate':[.001,.01,.1,1,10,100]}
gs4 = GridSearchCV(estimator = GradientBoostingClassifier( n_estimators=330,max_depth=5, min_samples_split=200, min_samples_leaf=1, subsample=0.6, random_state=42),
param_grid = p_tuning4, scoring='roc_auc',n_jobs=5,cv=5)

#fit the classifier

gs4.fit(df[0:801],file["Attrition"][0:801])

print(f"The best metric is obtained at {gs4.best_params_} and the value of the ROC-AUC is is {gs4.best_score_}")





The best metric is obtained at {'learning_rate': 0.1, 'max_features': 5} and the value of the ROC-AUC is is 0.838703778450148


In [8]:
#Apply the tuned classifier on the test data

#preprocessing on the test data


s=df[801:]

#fit the optimal classifier in the test data
classifier_optimal=GradientBoostingClassifier(learning_rate=0.1, n_estimators=330,max_depth=5,min_samples_split=200, subsample=0.60, min_samples_leaf=1, random_state=42,max_features=5).fit(df,file["Attrition"])

k1=classifier_optimal.predict(s)
#df2=pd.DataFrame(k1)

#print the predictions
print(k1)
from sklearn.metrics import accuracy_score
print(accuracy_score(file["Attrition"][801:],k1))

#convert it to the csv file
#output = pd.DataFrame({'ID': s1.ID, 'Attrition': k1})
#output.to_csv('output.csv', index=False)

[1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0
 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0]
0.973568281938326
