In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load The Dataset

In [None]:
df_train = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
print('Shape is',df_train.shape)
df_train.head()

In [None]:
df_test = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
print('Shape is',df_test.shape)
df_test.head()

#### Let us combine both train and test set and then clean the data together

In [None]:
# Stack the train and test frames row-wise so the same cleaning steps are applied to both.
# NOTE(review): no ignore_index is passed, so the row labels of the two inputs are kept as-is
# and may repeat across the two parts; the later re-split relies on row position (iloc), not labels.
# NOTE(review): presumably the test file has no 'target' column, making 'target' NaN for its rows — confirm.
df_comb = pd.concat([df_train, df_test])
print('shape:',df_comb.shape)
df_comb.head()

In [None]:
df_comb.info()

### Null Values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15,8]
sns.heatmap(df_comb.isnull())
plt.show()

In [None]:
# Checking the percentage of null values in each column
((df_comb.isnull().sum()/df_comb.shape[0])*100).sort_values(ascending = False)

### Inference
* As we can see there are many missing values in the data.
* We will have to treat these using appropriate methods.

## Univariate Analysis

##### Company Type

In [None]:
df_comb['company_type'].value_counts(dropna=False)

In [None]:
# Count of rows per company type.
# The column is passed through the `x` keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series no longer means "the x variable".
sns.countplot(x = df_comb['company_type']);

### Inference
* Most of the data comes from private companies.
* Comparatively we have less data of all other types of companies like Funded start up, NGO, etc.

##### Company Size

In [None]:
df_comb['company_size'].value_counts()

### Feature Engineering

In [None]:
## Let's bin these values

# Collapse the eight raw head-count bands into four size buckets.
# The keys are kept exactly as they were written in the original mapping (including '10/49').
company_size_bins = {'<10': 'Very_small_size', '10/49':'Very_small_size', '50-99':'Very_small_size',
                     '100-500': 'Small_Org', '500-999':'Small_Org',
                     '1000-4999':'Medium_Org', '5000-9999': 'Medium_Org',
                     '10000+': 'Large_Org'}

# Assign the result back to the column. Calling replace(inplace=True) on the selected Series
# does not reliably write through to df_comb (it is a no-op under pandas Copy-on-Write).
df_comb['company_size'] = df_comb['company_size'].replace(company_size_bins)

df_comb['company_size'].value_counts()

#### Checking the relationship between company size and company type

In [None]:
pd.crosstab(df_comb['company_type'],df_comb['company_size']).plot(kind = 'bar');

### Inference
* There is a definite relationship between company type and company size.

##### City Column

In [None]:
df_comb['city'].value_counts().sort_values(ascending = False).head(8).plot(kind = 'barh', color = 'red')
plt.show()

### Inference

* Most of the data scientists are from city_103.


##### city_development_index

In [None]:
df_comb['city_development_index'].plot(kind = 'kde');

##### Gender

In [None]:
df_comb["gender"].value_counts(dropna = False)

In [None]:
df_unk=df_comb[['gender','target']].fillna(value= 'Unknown')

In [None]:
df_unk.groupby(by = 'gender')['target'].value_counts()

##### Column : Experience

In [None]:
df_comb["experience"].value_counts(dropna = False)

* Let's make the column numerical by replacing the strings in the values.

In [None]:
# Map the two open-ended labels to numbers ('>20' -> 21, '<1' -> 0); the remaining values are
# left untouched here and are cast to integers later, after the missing values are filled.
# The result is assigned back because replace(inplace=True) on a selected column does not
# reliably update df_comb (it is a no-op under pandas Copy-on-Write).
df_comb["experience"] = df_comb["experience"].replace({'>20': 21, '<1':0})

##### enrolled_university

In [None]:
df_comb['enrolled_university'].value_counts().plot(kind = 'bar');

### Inference
* We have more data of the people who have not enrolled in any university.
* And the least data of the people who have enrolled in part time course.

##### education_level

In [None]:
# Count of rows per education level.
# Use the `x` keyword: recent seaborn releases reserve the first positional argument for `data`.
sns.countplot(x = df_comb['education_level']);

### Inference
* We have more data of the graduate students.

##### major_discipline

In [None]:
df_comb['major_discipline'].value_counts().plot(kind = 'barh');

### Inference
* In the major discipline section, we have more data of STEM.

#### Last new JOB

In [None]:
df_comb['last_new_job'].value_counts()

* We will replace the strings into numbers to make the column numerical

In [None]:
def replace(last_new_job):
    """Translate one raw `last_new_job` label into a number where a rule exists.

    'never' becomes 0 and '>4' becomes 5. Every other value (including missing
    values) is handed back unchanged, so the caller still has to cast the
    remaining labels afterwards.
    """
    if last_new_job == 'never':
        return 0
    if last_new_job == '>4':
        return 5
    return last_new_job

df_comb.last_new_job = df_comb.last_new_job.map(replace)
df_comb['last_new_job'].unique()

##### Training hours

In [None]:
df_comb['training_hours'].plot(kind = 'kde');

### Filling missing values
* We will fill all the missing values except the target column by Forward fill method

In [None]:
# Forward-fill: every missing entry takes the value of the closest previous row.
# Series.ffill()/Series.bfill() are used instead of fillna(method=...), which newer pandas
# releases deprecate; the filled values are the same.
# NOTE(review): df_comb holds the train rows followed by the test rows, so a fill can carry
# a value across that boundary.
for col in ['enrolled_university', 'education_level', 'major_discipline',
            'last_new_job', 'experience', 'gender']:
    df_comb[col] = df_comb[col].ffill()

# These two columns also get a backward fill, which covers gaps at the very top of the
# frame where there is no previous row to copy from.
for col in ['company_type', 'company_size']:
    df_comb[col] = df_comb[col].ffill().bfill()

In [None]:
df_comb.isnull().sum()

* The missing values have been treated well now.

In [None]:
df_comb.dtypes

* Let us change the data type of the column experience as it is a numerical column

In [None]:
df_comb['experience'] = df_comb['experience'].astype('int64')
df_comb['last_new_job'] = df_comb['last_new_job'].astype('int64')

### Correlation plot

In [None]:
# Pairwise correlation of the numeric columns only.
# df_comb still contains string columns; DataFrame.corr() raises on those in pandas >= 2.0,
# so the numeric columns are selected explicitly (older pandas dropped the others silently).
sns.heatmap(df_comb.select_dtypes(include = 'number').corr(), annot = True);

### Inference
* last new job is moderately correlated with the experience column.
* experience is also positively correlated with the city_development index column.
* No other numerical columns are very highly correlated with each other.

#### Let us drop the unnecessary columns in the data
* As we do not need enrollee_id. We will delete this column
* City has many unique values and is not interpretable because it is stored as codes. None of the cities are significant when used in logistic regression.
* City development index is having high multicollinearity when checked with VIF

In [None]:
df_comb.drop(['enrollee_id', 'city', 'city_development_index'], axis = 1, inplace = True)

### Let's check the outliers 

In [None]:
df_comb.boxplot()

#### We will now separate the train set and the test set again

In [None]:
df_train.shape

In [None]:
# df_comb was built as [train rows, test rows], so the first len(df_train) rows are the
# training part. Using the actual length instead of a hard-coded 19158 keeps the split
# correct if the input files change.
n_train = len(df_train)

# .copy() gives two independent frames, so later edits to `train`/`test` do not act on
# views of df_comb (which triggers SettingWithCopyWarning).
train = df_comb.iloc[:n_train].copy()
test = df_comb.iloc[n_train:].copy()

print('Shape of train set:', train.shape)
print('Shape of test set:', test.shape)

In [None]:
train['target'].value_counts(dropna = False)

In [None]:
test['target'].value_counts(dropna = False)

In [None]:
# Remove the label column from the test part.
# The result is reassigned rather than dropped in place: `test` was sliced out of df_comb,
# and an in-place drop on such a slice raises SettingWithCopyWarning.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
test = test.drop(columns = 'target', errors = 'ignore')

### Let's summarize relationships among variables   

In [None]:
# Let's take all categorical columns in another dataframe

# Keep only the string (object-dtype) columns of the training frame.
# The builtin `object` is used because the `np.object` alias was removed in NumPy 1.24
# and raises AttributeError there; both select the same columns.
df_cat = train.select_dtypes(include = [object])
df_cat.columns

In [None]:
plt.rcParams['figure.figsize'] = (10,8)
def bar_(column):
    """Draw a count plot of one `train` column, split by the value of `target`.

    Parameters
    ----------
    column : str
        Name of the column in the module-level `train` frame to plot.

    The function now uses its `column` argument. It previously read the
    module-level loop variable `i`, so it only worked when called from a loop
    using exactly that name.
    """
    # keyword `x` is required by recent seaborn, where the first positional argument is `data`
    sns.countplot(x = train[column], hue = train['target'])
    plt.show()
    
for i in df_cat:
    print('Effect of', i , 'on target column')
    bar_(i)

### Inference
* We have more data of males and less of females.

In [None]:
# Let's check the relationship of numerical features with target

# Correlation of every numeric column with the target.
# `train` still contains string columns; DataFrame.corr() raises on those in pandas >= 2.0,
# so only the numeric columns are passed in.
corr = train.select_dtypes(include = 'number').corr()
corr['target'].plot.barh()
plt.show()

### Inference

* All the numeric components are negatively correlated with our target column

### Let us check if there is imbalance in the data

In [None]:
# Class balance of the target.
# Use the `x` keyword: recent seaborn releases reserve the first positional argument for `data`.
sns.countplot(x = train['target']);

In [None]:
train['target'].value_counts()

#### We can clearly see that there is imbalance in the target column
* We will treat the imbalance using SMOTE ANALYSIS

### SMOTE ANALYSIS

In [None]:
X_sm = train.drop(['target'], axis = 1)
X_sm = pd.get_dummies(X_sm, drop_first= True)
y_sm = train.target

In [None]:
# Class counts before oversampling
print('Before Resampling Target 1', sum(y_sm == 1))
print('Before Resampling Target 0', sum(y_sm == 0))
from imblearn.over_sampling import SMOTE
smo = SMOTE(random_state = 5)
# np.asarray() replaces Series.ravel(), which newer pandas releases deprecate. It also accepts
# an ndarray, so the cell still runs if it is executed again after y_sm has been overwritten.
# NOTE(review): this oversamples the whole labelled set; any hold-out that is later carved out
# of X_sm / y_sm will contain synthetic rows, which makes its scores optimistic.
X_sm,y_sm = smo.fit_resample(X_sm,np.asarray(y_sm))
print('X shape', X_sm.shape)
print('y shape', y_sm.shape)
# Class counts after oversampling
print('After Resampling Target 1', sum(y_sm == 1))
print('After Resampling Target 0', sum(y_sm == 0))

#### As we can see in the above output that our data has become balanced now. We can proceed further.

# Basemodel on the normal data

In [None]:
import statsmodels.api as sm
X = train.drop(['target'], axis = 1)
X = sm.add_constant(X)
X = pd.get_dummies(X, drop_first= True)
y = train.target



from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.2, random_state = 10)

print('X_train', X_train.shape)
print('y_train', y_train.shape)


print('X_test', X_test.shape)
print('y_test', y_test.shape)

In [None]:
import statsmodels.api as sm

logreg = sm.Logit(y_train, X_train).fit()

# print the summary of the model
print(logreg.summary())

### Inference
* The pseudo R squared for base model is 0.05 which is very less. 
* We can say that it is not a good model at all.

In [None]:
sig_feat = logreg.pvalues[1:][logreg.pvalues[1:]<0.05].index
sig_feat

In [None]:
logit_y_pred_prob = logreg.predict(X_test)
logit_y_pred = [1 if x > 0.5 else 0 for x in logit_y_pred_prob]

In [None]:
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score

# Threshold-based metrics use the 0/1 predictions obtained with the 0.5 cut-off.
print('accuracy score', accuracy_score(y_test, logit_y_pred))
print('precision_score', precision_score(y_test, logit_y_pred))
print('recall_score', recall_score(y_test, logit_y_pred))
print('f1_score', f1_score(y_test, logit_y_pred))
# ROC-AUC is a ranking metric and must be given the predicted probabilities; feeding it the
# thresholded labels reduces the curve to a single point and under-reports the score.
print('roc_auc score', roc_auc_score(y_test, logit_y_pred_prob))

In [None]:
print(classification_report(y_test, logit_y_pred))

In [None]:
cm = confusion_matrix(y_test, logit_y_pred)
conf_matrix = pd.DataFrame(data = cm, columns = ['Predicted:No', 'Predicted:Yes'], index = ['Actual:No', 'Actual:Yes'])
sns.heatmap(conf_matrix, annot = True, fmt = 'd', 
            cbar = False, cmap = 'OrRd', linewidth = 0.3, linecolor = 'black', annot_kws = {'size':25})
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.show()

In [None]:
np.abs(logreg.params[1:]).sort_values().plot(kind = 'barh');

### Inference
* From all the above scores we can say that even if accuracy score is 0.75, all other scores are less than expected. 
* It must be because of the imbalance in our target column.


#### Cohen's Kappa Score
* It is a measure of inter-rater reliability. For logistic regression the actual and predicted values of the target variables are the raters

In [None]:
from sklearn.metrics import cohen_kappa_score

cohen_kappa_score(y_test, logit_y_pred)

* The Cohen Kappa score of a good model is more than 0.5. 
* In our model there is no substantial agreement between the actual and predicted values. 


## We will try to improve overall efficiency of the model 

### Let us build one more Logistic model using the data with SMOTE

In [None]:
print('X Shape', X_sm.shape)
print('y Shape', y_sm.shape)

In [None]:
# Split first, then balance ONLY the training part with SMOTE.
#
# Splitting data that has already been oversampled puts synthetic rows in the test set, and
# those rows are interpolated from neighbours that may sit in the training set. The resulting
# test scores are then optimistic. Here the hold-out keeps the real class distribution and
# contains only real rows.
from imblearn.over_sampling import SMOTE

# same encoding as used for X_sm, so the feature columns are identical
X_encoded = pd.get_dummies(train.drop(['target'], axis = 1), drop_first = True)

X_train_raw, X_test_sm, y_train_raw, y_test_sm = train_test_split(X_encoded, train['target'],
                                                                  test_size = 0.3, random_state = 10)

# oversample the minority class in the training split only
X_train_sm, y_train_sm = SMOTE(random_state = 5).fit_resample(X_train_raw, y_train_raw)

print('Shape of X_train_sm ', X_train_sm.shape)
print('Shape of X_test_sm ', X_test_sm.shape)
print('Shape of y_train_sm ', y_train_sm.shape)
print('Shape of y_test_sm ', y_test_sm.shape)

In [None]:
# Fit a statsmodels Logit on the SMOTE training split and display the coefficient table.
# NOTE(review): the SMOTE feature matrix is built without sm.add_constant, whereas the base
# model's feature matrix does get a constant column; sm.Logit does not add one itself, so this
# model appears to be fitted without an intercept — confirm this is intended.
logit_model_smote = sm.Logit(y_train_sm, X_train_sm).fit()
logit_model_smote.summary()

In [None]:
logit_sm_y_pred_prob = logit_model_smote.predict(X_test_sm)
logit_sm_y_pred = [1 if x > 0.5 else 0 for x in logit_sm_y_pred_prob]

In [None]:
# Threshold-based metrics use the 0/1 predictions obtained with the 0.5 cut-off.
print('accuracy score', accuracy_score(y_test_sm, logit_sm_y_pred))
print('precision_score', precision_score(y_test_sm, logit_sm_y_pred))
print('recall_score', recall_score(y_test_sm, logit_sm_y_pred))
print('f1_score', f1_score(y_test_sm, logit_sm_y_pred))
# ROC-AUC needs the predicted probabilities, not the thresholded labels.
print('roc_auc score', roc_auc_score(y_test_sm, logit_sm_y_pred_prob))

In [None]:
cohen_kappa_score(y_test_sm, logit_sm_y_pred)

#### Inference
* We can see that there is high change in metrics on using SMOTE but this may have created an error in the data 
* When we didn't use SMOTE the accuracy was better but precision, recall and F1 scores were very low.
* There is gradual change in these scores here.
* We can say that using SMOTE has really affected the model.


***We will use different models and check the accuracy scores***

#### Let us define a function for the confusion matrix so that it can be used in every model

In [None]:
def plot_confusion_matrix(model,X_test,y_test):
    """Show the confusion matrix of `model` on the given data as an annotated heatmap.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features passed to ``model.predict``.
    y_test : true labels compared against the predictions.

    The row/column labels are fixed to the two classes 0 and 1. The figure is
    displayed with ``plt.show()`` and nothing is returned.
    """
    counts = confusion_matrix(y_test, model.predict(X_test))
    labelled_counts = pd.DataFrame(counts,
                                   index = ['Actual:0', 'Actual:1'],
                                   columns = ['Predicted:0', 'Predicted:1'])

    # integer cell annotations, no colour bar
    heatmap_options = dict(annot = True, fmt = 'd', cbar = False, cmap = 'plasma',
                           linewidth = 0.1, annot_kws = {'size':25})
    sns.heatmap(labelled_counts, **heatmap_options)

    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)
    plt.show()

#### Create a generalised function to check the metrics for the train and the test set

In [None]:
def _print_split_report(model, X, y, split_name):
    """Print accuracy, recall, precision and F1 of `model` on (X, y) and return the text report.

    `split_name` ('train' or 'test') is only used in the printed labels.
    """
    pred = model.predict(X)
    print('Accuracy Score for ' + split_name + ' is ', accuracy_score(y, pred))
    print('Recall Score for ' + split_name + ' is ', recall_score(y, pred))
    print('Precision Score for ' + split_name + ' is ', precision_score(y, pred))
    print('F1 Score for ' + split_name + ' is ', f1_score(y, pred))
    return classification_report(y, pred)

def get_train_report(model,X_train,y_train):
    """Print the headline metrics of `model` on the training data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : features and true labels to score on.

    Returns
    -------
    str
        The ``classification_report`` text for the same predictions.
    """
    return _print_split_report(model, X_train, y_train, 'train')

def get_test_report(model,X_test,y_test):
    """Print the headline metrics of `model` on the test data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test, y_test : features and true labels to score on.

    Returns
    -------
    str
        The ``classification_report`` text for the same predictions.
    """
    return _print_split_report(model, X_test, y_test, 'test')

## Decision Tree Model

### Normal Data

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt_model = dt.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

In [None]:
print(get_train_report(dt_model,X_train, y_train))

In [None]:
# Score the untuned decision tree on the held-out split. This call previously passed
# X_train / y_train, so the "test" report simply repeated the training scores.
print(get_test_report(dt_model,X_test, y_test))

### Inference
* The untuned decision tree fits the training data almost perfectly, which is a sign of overfitting.
* Let us tune our hyperparameters using GridSearchCV.

## GridSearchCV for Decision Tree model

In [None]:
from sklearn.model_selection import GridSearchCV

tuned_params = [{'criterion': ['entropy', 'gini'],
                'max_depth' : range(2, 6),
                'max_features': ['sqrt', 'log2']}]

decision_tree_classification = DecisionTreeClassifier(random_state = 10)
tree_grid = GridSearchCV(estimator = decision_tree_classification, param_grid = tuned_params, 
                        cv = 5)

tree_grid_model = tree_grid.fit(X_train, y_train)

tree_grid_model.best_params_

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_CV = DecisionTreeClassifier(criterion = 'entropy', max_depth = 2, max_features = 'sqrt')
dt_model_CV = dt_CV.fit(X_train, y_train)
dt_pred_CV = dt_model_CV.predict(X_test)

In [None]:
print(get_train_report(dt_model_CV,X_train, y_train))

In [None]:
print(get_test_report(dt_model_CV,X_test, y_test))

### Inference
* All the scores for class 1, i.e. the candidate is looking for a job change, are 0.00, which clearly means the tuned tree never predicts this class correctly.
* We can see that the given data is not working well in the Logistic Regression model or the Decision Tree model.
* So, henceforth we will use the data after applying SMOTE ANALYSIS to build the further models.


## DecisionTree on SMOTE DATA

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dt_model_sm = dtc.fit(X_train_sm, y_train_sm)
dt_pred_sm = dt_model_sm.predict(X_test_sm)

In [None]:
print(get_train_report(dt_model_sm,X_train_sm,y_train_sm))

In [None]:
print(get_test_report(dt_model_sm,X_test_sm,y_test_sm))

## Tune the Hyperparameters of the Decision tree

In [None]:
from sklearn.model_selection import GridSearchCV

tuned_params = [{'criterion': ['entropy', 'gini'],
                'max_depth' : range(2, 6),
                'max_features': ['sqrt', 'log2']}]

decision_tree_classification = DecisionTreeClassifier(random_state = 10)
tree_grid = GridSearchCV(estimator = decision_tree_classification, param_grid = tuned_params, 
                        cv = 5)

tree_grid_model = tree_grid.fit(X_train_sm, y_train_sm)

tree_grid_model.best_params_

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_CV_sm = DecisionTreeClassifier(criterion = 'gini', max_depth = 5, max_features = 'log2')
dt_model_CV_sm = dt_CV_sm.fit(X_train_sm, y_train_sm)
dt_pred_CV_sm = dt_model_CV_sm.predict(X_test_sm)

In [None]:
print(get_train_report(dt_model_CV_sm,X_train_sm, y_train_sm))

In [None]:
print(get_test_report(dt_model_CV_sm,X_test_sm, y_test_sm))

### Inference
* After tuning the hyperparameters, we can see that the data is consistent in both train and test set.
* The accuracy score is 0.66 which is a fine model

## KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_sm = KNeighborsClassifier()
knn_model_sm = knn_sm.fit(X_train_sm, y_train_sm)
knn_pred_sm = knn_model_sm.predict(X_test_sm)

In [None]:
print(get_train_report(knn_model_sm, X_train_sm,y_train_sm))

In [None]:
print(get_test_report(knn_model_sm, X_test_sm,y_test_sm))

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_sm = RandomForestClassifier()
rf_model_sm = rf_sm.fit(X_train_sm, y_train_sm)
rf_pred_sm = rf_model_sm.predict(X_test_sm)

In [None]:
print(get_train_report(rf_model_sm, X_train_sm, y_train_sm))

In [None]:
print(get_test_report(rf_model_sm, X_test_sm, y_test_sm))

### Tune the Hyperparameters using GridSearchCV (Random Forest)

In [None]:
# create a dictionary with hyperparameters and its values
# pass the criteria 'entropy' and 'gini' to the parameter, 'criterion' 
# pass a list of values to 'n_estimators' to build the different number of trees in the random forest
# pass a list of values to 'max_depth' that assigns maximum depth of the tree
# 'max_features' assigns maximum number of features to consider for the best split. We pass the string 'sqrt' and 'log2'
# 'sqrt' considers maximum number of features equal to the square root of total features
# 'log2' considers maximum number of features equal to the log of total features with base 2
# pass a list of values to 'min_samples_split' that assigns minimum number of samples to split an internal node
# pass a list of values to 'min_samples_leaf' that assigns minimum number of samples required at the terminal/leaf node
# pass a list of values to 'max_leaf_nodes' that assigns maximum number of leaf nodes in the tree
tuned_paramaters = [{'criterion': ['entropy', 'gini'],
                     'n_estimators': [10, 30, 50, 70],
                     'max_depth': [10, 15],
                     'max_features': ['sqrt', 'log2'],
                     }]
 
# instantiate the 'RandomForestClassifier' 
# pass the 'random_state' to obtain the same samples for each time you run the code
random_forest_classification = RandomForestClassifier(random_state = 10)

# use GridSearchCV() to find the optimal value of the hyperparameters
# estimator: pass the random forest classifier model
# param_grid: pass the list 'tuned_parameters'
# cv: number of folds in k-fold i.e. here cv = 5
rf_grid = GridSearchCV(estimator = random_forest_classification, 
                       param_grid = tuned_paramaters, 
                       cv = 5)

# use fit() to fit the model on the train set
rf_grid_model = rf_grid.fit(X_train_sm, y_train_sm)

# get the best parameters
print('Best parameters for random forest classifier: ', rf_grid_model.best_params_, '\n')

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_sm_CV = RandomForestClassifier(criterion ='gini',max_depth= 15, max_features = 'sqrt', n_estimators = 50 )
rf_model_sm_CV = rf_sm_CV.fit(X_train_sm, y_train_sm)
rf_pred_sm_CV = rf_model_sm_CV.predict(X_test_sm)

In [None]:
print(get_train_report(rf_model_sm_CV, X_train_sm, y_train_sm))

In [None]:
print(get_test_report(rf_model_sm_CV, X_test_sm, y_test_sm))

### Ada Boost 

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# instantiate the 'AdaBoostClassifier'
# n_estimators: number of estimators at which boosting is terminated
# pass the 'random_state' to obtain the same results for each code implementation
ada_model = AdaBoostClassifier(n_estimators = 40, random_state = 10)

# fit the model using fit() on train data
ada_model_sm = ada_model.fit(X_train_sm, y_train_sm)
ada_pred_sm = ada_model_sm.predict(X_test_sm)

In [None]:
print(get_train_report(ada_model_sm, X_train_sm, y_train_sm))

In [None]:
print(get_test_report(ada_model_sm, X_test_sm, y_test_sm))

### Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# instantiate the 'GradientBoostingClassifier' 
# n_estimators: number of estimators to consider
# 'max_depth': assigns maximum depth of the tree
# pass the 'random_state' to obtain the same results for each code implementation
gboost_model = GradientBoostingClassifier(n_estimators = 150, max_depth = 10, random_state = 10)

# fit the model using fit() on train data
gboost_model_sm = gboost_model.fit(X_train_sm, y_train_sm)
gboost_pred_sm = gboost_model_sm.predict(X_test_sm)

In [None]:
print(get_train_report(gboost_model_sm, X_train_sm, y_train_sm))

In [None]:
print(get_test_report(gboost_model_sm, X_test_sm, y_test_sm))

### Tune Hyperparameters (Gradient Boost)

In [None]:
# create a dictionary with hyperparameters and its values
# learning_rate: pass the list of boosting learning rates
# max_depth: pass the range of values as the maximum tree depth for base learners
tuning_parameters = {'n_estimators': [i for i in range(10,15,2)],
                     'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                     'max_depth': [i for i in range(3,6,2)],
                     'max_features': [20,25,30],
                     'min_samples_leaf' : [i for i in range(500,2500,1000)]}

# instantiate the 'GradientBoostingClassifier' 
gboost_model = GradientBoostingClassifier()

# use GridSearchCV() to find the optimal value of the hyperparameters
# estimator: pass the Gradient Boost classifier model
# param_grid: pass the list 'tuned_parameters'
# cv: number of folds in k-fold i.e. here cv = 3
# scoring: pass a measure to evaluate the model on test set
gboost_grid = GridSearchCV(estimator = gboost_model, param_grid = tuning_parameters, cv = 3, scoring = 'roc_auc')

# fit the model on X_train and y_train using fit()
gboost_grid.fit(X_train_sm, y_train_sm)

# get the best parameters
print('Best parameters for GradientBoosting classifier: ', gboost_grid.best_params_, '\n')

In [None]:
gboost_model_CV = GradientBoostingClassifier(n_estimators = 14, max_depth = 5, learning_rate = 0.5, max_features = 25, min_samples_leaf = 500, random_state = 10)

# fit the model using fit() on train data
gboost_model_sm_CV = gboost_model_CV.fit(X_train_sm, y_train_sm)
gboost_pred_sm_CV = gboost_model_sm_CV.predict(X_test_sm)

In [None]:
print(get_train_report(gboost_model_sm_CV, X_train_sm, y_train_sm))

In [None]:
print(get_test_report(gboost_model_sm_CV, X_test_sm, y_test_sm))

### XG Boost

In [None]:
from xgboost import XGBClassifier

# instantiate the 'XGBClassifier'
# set the maximum depth of the tree using the parameter, 'max_depth'
# pass the value of minimum loss reduction required for partition of the leaf node to the parameter, 'gamma'
xgb_model = XGBClassifier(max_depth = 10, gamma = 1)

# fit the model using fit() on train data
xgb_model_sm = xgb_model.fit(X_train_sm, y_train_sm)
xgb_pred_sm = xgb_model_sm.predict(X_test_sm)

In [None]:
print(get_train_report(xgb_model_sm, X_train_sm, y_train_sm))

In [None]:
print(get_test_report(xgb_model_sm, X_test_sm, y_test_sm))

### Tune the Hyperparameters (GridSearchCV) for XGBoost

In [None]:
# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# create a dictionary with hyperparameters and its values
# learning_rate: pass the list of boosting learning rates
# max_depth: pass the range of values as the maximum tree depth for base learners
# gamma: pass the list of minimum loss reduction values required to make a further partition on a leaf node of the tree
tuning_parameters = {'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                     'max_depth': range(3,10),
                     'gamma': [0, 1, 2, 3, 4]}

# instantiate the 'XGBClassifier' 
xgb_model = XGBClassifier()

# use GridSearchCV() to find the optimal value of the hyperparameters
# estimator: pass the XGBoost classifier model
# param_grid: pass the list 'tuned_parameters'
# cv: number of folds in k-fold i.e. here cv = 3
# scoring: pass a measure to evaluate the model on test set
xgb_grid = GridSearchCV(estimator = xgb_model, param_grid = tuning_parameters, cv = 3, scoring = 'roc_auc')

# fit the model on X_train and y_train using fit()
xgb_grid.fit(X_train_sm, y_train_sm)

# get the best parameters
print('Best parameters for XGBoost classifier: ', xgb_grid.best_params_, '\n')

In [None]:
xgb_model_CV = XGBClassifier(max_depth = 9, gamma = 0, learning_rate = 0.3)

# fit the model using fit() on train data
xgb_model_CV_sm = xgb_model_CV .fit(X_train_sm, y_train_sm)
xgb_pred_CV_sm = xgb_model_CV_sm.predict(X_test_sm)

In [None]:
print(get_train_report(xgb_model_CV_sm, X_train_sm, y_train_sm))

In [None]:
print(get_test_report(xgb_model_CV_sm, X_test_sm, y_test_sm))

## Comparative Study of the models

In [None]:
def _score_column(y_true, y_pred):
    """Return the five comparison metrics (in percent) for one set of 0/1 predictions."""
    # NOTE: roc_auc_score is fed hard 0/1 predictions here (as for every model in this table),
    # not predicted probabilities, so this row is not the usual area under the ROC curve.
    return {'Accuracy': accuracy_score(y_true, y_pred)*100,
            'Precision_score': precision_score(y_true, y_pred)*100,
            'recall_score': recall_score(y_true, y_pred)*100,
            'f1_score': f1_score(y_true, y_pred)*100,
            'roc_auc_score': roc_auc_score(y_true, y_pred)*100}

# One column per model, one row per metric. All five metrics of a column come from the same
# prediction vector. Previously the f1_score of 'DT_CV_Normal Data' was computed from
# dt_pred (the untuned tree) instead of dt_pred_CV.
score_card = pd.DataFrame({'Log_Reg_Normal_Data': _score_column(y_test, logit_y_pred),
                           'Log_Reg_SMOTE_Data': _score_column(y_test_sm, logit_sm_y_pred),
                           'DT_Normal_Data': _score_column(y_test, dt_pred),
                           'DT_CV_Normal Data': _score_column(y_test, dt_pred_CV),
                           'DT_SMOTE_Data': _score_column(y_test_sm, dt_pred_sm),
                           'DT_CV_SMOTE Data': _score_column(y_test_sm, dt_pred_CV_sm),
                           'KNN': _score_column(y_test_sm, knn_pred_sm),
                           'Random_Forest': _score_column(y_test_sm, rf_pred_sm),
                           'RF_CV': _score_column(y_test_sm, rf_pred_sm_CV),
                           'Ada_Boost': _score_column(y_test_sm, ada_pred_sm),
                           'Gradient_Boost': _score_column(y_test_sm, gboost_pred_sm),
                           'Gradient_Boost_CV': _score_column(y_test_sm, gboost_pred_sm_CV),
                           'XG_Boost': _score_column(y_test_sm, xgb_pred_sm),
                           'XG_Boost_CV': _score_column(y_test_sm, xgb_pred_CV_sm)})

score_card.head()

### Inference

* Based on above table, we can observe that Gradient Boosting has the highest ROC value as well as all other values and hence, we will select Gradient Boosting as our model for prediction.


### Inferences for Gradient Boosting model

#### Confusion_matrix

In [None]:
plot_confusion_matrix(gboost_model_sm,X_test_sm,y_test_sm)

#### ROC-AUC-Curve

In [None]:
# predicted probability of the positive class (second column of predict_proba)
y_pred_prob = gboost_model_sm.predict_proba(X_test_sm)[:,1]

# the roc_curve() returns the values for false positive rate, true positive rate and threshold
# pass the actual target values and predicted probabilities to the function
fpr, tpr, thresholds = roc_curve(y_test_sm, y_pred_prob)

# plot the ROC curve
plt.plot(fpr, tpr)

# set limits for x and y axes
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

# plot the diagonal reference line
plt.plot([0, 1], [0, 1],'r--')

# add plot and axes labels
# set text size using 'fontsize'
# (the title previously read "Admission Prediction", which does not describe this notebook's task)
plt.title('ROC curve for Job Change Prediction Classifier', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)

# add the AUC score to the plot
# 'x' and 'y' gives position of the text
# 's' must be a single string: a tuple is not a valid label and would show up as its repr
# use round() to round-off the AUC score upto 4 digits
auc_value = round(roc_auc_score(y_test_sm, y_pred_prob),4)
plt.text(x = 0.82, y = 0.3, s = 'AUC Score: ' + str(auc_value))

# plot the grid
plt.grid(True)

#### Important Features

In [None]:
# create a dataframe that stores the feature names and their importance
# 'feature_importances_' returns the features based on the average gain 
important_features = pd.DataFrame({'Features': X_train_sm.columns, 
                                   'Importance': gboost_model_sm.feature_importances_})

# sort the dataframe in the descending order according to the feature importance
important_features = important_features.sort_values('Importance', ascending = False)

# create a barplot to visualize the features based on their importance
sns.barplot(x = 'Importance', y = 'Features', data = important_features)

# add plot and axes labels
# set text size using 'fontsize'
plt.title('Feature Importance', fontsize = 15)
plt.xlabel('Importance', fontsize = 15)
plt.ylabel('Features', fontsize = 15)

# display the plot
plt.show()