In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

Problem Statement:
    
Our objective is to build a model that predicts whether a person is looking to change jobs.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV



In [None]:
# Read the train and test CSV files of the job-change dataset into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv',
        '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv',
    )
)

In [None]:
train.head()

Before digging into the main analysis, let's get familiar with the dataset.


# Meaning of each individual feature
enrollee_id : Unique ID for candidate

city: City code

city_development_index: Development index of the city (scaled)

gender: Gender of candidate

relevent_experience: Relevant experience of candidate

enrolled_university: Type of University course enrolled if any

education_level: Education level of candidate

major_discipline :Education major discipline of candidate

experience: Candidate total experience in years

company_size: No of employees in current employer's company

company_type : Type of current employer

last_new_job: Difference in years between previous job and current job

training_hours: training hours completed

target: 0 – Not looking for job change, 1 – Looking for a job change


In [None]:
train.shape

In [None]:
train.info()

In [None]:
train.describe()

One interesting thing to notice here is that none of the numerical features contain null values.

In [None]:
# Percentage of missing values per column of the training frame.
n_rows = len(train)
null_counts = train.isnull().sum()
percent_missing = null_counts * 100 / n_rows
# Same numbers as a one-column DataFrame.
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
percent_missing

In [None]:
msno.heatmap(train)

Most of the features that contain missing values are categorical.
We could fill those values with the mode, but this could lead to imbalanced data and make our ML model biased,
so it is better to delete those rows.

In [None]:
# Discard every row that has at least one missing value, in both splits.
train = train.dropna()
test = test.dropna()

In [None]:
train.isna().mean()

I don't think enrollee_id, city and company_size will be useful in our process, so let's drop these three columns.

In [None]:
# Remove the three columns that are not used in the rest of the analysis.
train = train.drop(columns=['enrollee_id', 'city', 'company_size'])

In [None]:
train['experience'].unique()

The values contain `>` and `<` signs ('>20' and '<1'). Before modeling, let's replace '>20' with 21 (adding 1 to 20) and '<1' with 0 (subtracting 1 from 1).

In [None]:
def replace(experience):
    """Map the open-ended experience labels to numbers.

    '>20' becomes 21 and '<1' becomes 0; any other value is returned unchanged.
    """
    if experience == '>20':
        return 21
    return 0 if experience == '<1' else experience

In [None]:
train.experience = train.experience.map(replace)

In [None]:
train['experience'].unique()

In [None]:
def replace(last_new_job):
    """Map the non-numeric last_new_job labels to numbers.

    '>4' becomes 5 and 'never' becomes 0; any other value is returned unchanged.
    NOTE: this redefines the `replace` helper defined earlier for `experience`.
    """
    if last_new_job == '>4':
        return 5
    return 0 if last_new_job == 'never' else last_new_job

# Apply the mapping to the last_new_job column, then list the distinct values that remain.
train.last_new_job = train.last_new_job.map(replace)
train['last_new_job'].unique()

In [None]:
#Now let's have look at my data and hope it's clean now
train.head()

In [None]:
# Pie chart of the target classes to check whether the dataset is balanced.
target_counts = train['target'].value_counts()
values = target_counts.values.tolist()
labels = target_counts.index
plt.figure(figsize=(10, 10))
plt.title('Comparing labels of target feature')
plt.pie(x=values, labels=labels, autopct='%1.1f%%', pctdistance=.5)
plt.show()


The dataset is imbalanced, so we have to fix this before modeling.

In [None]:

fig, ax = plt.subplots(3, 2, figsize=(12, 12))
((ax1, ax2), (ax3, ax4), (ax5, ax6)) = ax

# One entry per panel: (axes, column, title, explode offsets or None for no offsets).
pie_specs = [
    (ax1, 'gender', "Gender Distribution Pie Chart", None),
    (ax2, 'relevent_experience', "Experience Distribution Pie Chart", [0, 0.2]),
    (ax3, 'enrolled_university', "Enrooled University Distribution Pie Chart", [0, 0.2, .3]),
    (ax4, 'education_level', "Education label Distribution Pie Chart", [0, 0.05, .1]),
    (ax5, 'major_discipline', "Major_discipline Distribution Pie Chart", [0.1, 0.1, 0.1, 0.1, 0.2, 0.1]),
    (ax6, 'company_type', "Company Type Pie Chart", [0, 0.1, .1, .1, .15, .1]),
]

# Draw the share of each category of every listed column as a pie chart.
for axis, column, title, explode in pie_specs:
    counts = train[column].value_counts()
    labels = counts.index
    values = counts.tolist()
    axis.pie(x=values, labels=labels, autopct="%1.2f%%", shadow=True, explode=explode)
    axis.set_title(title, fontdict={'fontsize': 14})

plt.tight_layout()
plt.show()

Let's make some observations from the visualizations.

1. Almost 90% of the people taking the course are male.

2. Almost 87% of candidates have relevant experience in data science; the rest have no relevant
experience but are interested in this field.

3. Almost 85% of candidates were enrolled in a university.

4. Almost 70% of candidates are graduates and, interestingly, around 3% hold a PhD.

5. It is natural that most candidates come from a STEM background. Among the remaining groups,
candidates from the humanities are the most interested.

6. Most candidates come from the private sector.

Let's see how the categorical features affect the target variable.

In [None]:
# Grid of count plots: one categorical feature per panel, bars split by target class.
fig_dims = (20, 14)
fig, ax = plt.subplots(3, 2, figsize=fig_dims)
# The first panel keeps its explicit dark edge colours.
sns.countplot(x=train['gender'], hue=train['target'], ax=ax[0, 0], edgecolor=sns.color_palette("dark", 60))
# Every feature is passed as `x=` so all six calls use the same keyword form.
sns.countplot(x=train['education_level'], hue=train['target'], ax=ax[0, 1])
sns.countplot(x=train['relevent_experience'], hue=train['target'], ax=ax[1, 0])
sns.countplot(x=train['enrolled_university'], hue=train['target'], ax=ax[1, 1])
sns.countplot(x=train['major_discipline'], hue=train['target'], ax=ax[2, 0])
sns.countplot(x=train['company_type'], hue=train['target'], ax=ax[2, 1])

fig.suptitle('Features distribution based on target ', fontsize=40)
# plt.show() renders the figure without the non-GUI-backend warning that fig.show() emits in notebooks.
plt.show()


Let's make some observations.

In each chart a single class holds the majority of the data points, so we cannot compare
the raw counts confidently. The ratio within each category would be a better basis for comparison.

1. Still, it is worth noting that even people from the public sector are taking an interest in data science.

2. People from an Arts background are not at all interested in switching jobs.


Let's explore the numerical columns and look at their distributions with respect to the target column.


In [None]:
# Kernel density of city_development_index, drawn separately for each target class.
cdi = train['city_development_index']
cdi_present = cdi.notnull()
stay_mask = (train["target"] == 0) & cdi_present
leave_mask = (train["target"] == 1) & cdi_present
g = sns.kdeplot(cdi[stay_mask], color="Red", shade=True)
g = sns.kdeplot(cdi[leave_mask], ax=g, color="Blue", shade=True)
g.set_xlabel('city_development_index')
g.set_ylabel("Frequency")
g = g.legend(["Not looking for job change,", "looking for job change,"])

Observations:

1. People whose city_development_index is around 0.666 and around 0.9 are interested in changing jobs.

2. People whose city_development_index is close to 0.9 are not at all interested in a job change.

In [None]:
# Bin city_development_index into four ordered categories (0-3) based on the observations above.
train['City_devlopment_cat'] = pd.cut(
    train['city_development_index'],
    bins=[0, .45, .67, .84, 1],
    labels=[0, 1, 2, 3],
)
# Mean target per category, highest first.
target_by_cat = train[['City_devlopment_cat', 'target']].groupby(['City_devlopment_cat'], as_index=False).mean()
target_by_cat.sort_values(by='target', ascending=False)

In [None]:
train['experience'] = train['experience'].astype(int)

In [None]:
# Kernel density of experience, one curve per target class drawn on shared axes.
g = None
for target_value, colour in ((0, "Red"), (1, "Blue")):
    subset = train['experience'][(train["target"] == target_value) & (train['experience'].notnull())]
    g = sns.kdeplot(subset, ax=g, color=colour, shade=True)
g.set_xlabel('experience')
g.set_ylabel("Frequency")
g = g.legend(["Not looking for job change,", "looking for job change,"])

Observations:

1. People with 1 to 10 years of experience are the most likely to change jobs.
2. People with around 20 years of experience are not looking to change jobs.

In [None]:
train['last_new_job'] = train['last_new_job'].astype(int)

In [None]:
# Kernel density of last_new_job, drawn separately for each target class.
lnj = train['last_new_job']
lnj_present = lnj.notnull()
not_looking = lnj[(train["target"] == 0) & lnj_present]
looking = lnj[(train["target"] == 1) & lnj_present]
g = sns.kdeplot(not_looking, color="Red", shade=True)
g = sns.kdeplot(looking, ax=g, color="Blue", shade=True)
g.set_xlabel('last_new_job')
g.set_ylabel("Frequency")
g = g.legend(["Not looking for job change,", "looking for job change,"])

Observations:

1. People who left their last job 1 to 2 years ago are the most likely to change jobs,
 and the same holds for those who do not want to change jobs.

In [None]:
# Kernel density of training_hours, one curve per target class drawn on shared axes.
g = None
for target_value, colour in ((0, "Red"), (1, "Green")):
    subset = train['training_hours'][(train["target"] == target_value) & (train['training_hours'].notnull())]
    g = sns.kdeplot(subset, ax=g, color=colour, shade=True)
g.set_xlabel('training_hours')
g.set_ylabel("Frequency")
g = g.legend(["Not looking for job change,", "looking for job change,"])

Nothing to say about this graph.

In [None]:
# Correlation heatmap of the numeric features.
# The frame still holds object/categorical columns, and DataFrame.corr() raises on those
# in recent pandas releases, so the numeric columns are selected explicitly first.
numeric_features = train.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True, vmin=-1, vmax=1, center=0,
            cmap='Blues_r', linewidths=3, linecolor='black')


In [None]:
train['target'].value_counts()

In [None]:
# Mean of every numeric feature per gender, shown as a bar chart.
# `values` is restricted to numeric columns because the default mean aggregation
# raises on object/categorical columns in recent pandas releases.
index_cols = ['gender']
value_cols = [c for c in train.select_dtypes(include='number').columns if c not in index_cols]
table = pd.pivot_table(train, index=index_cols, values=value_cols)
table.plot(kind='bar')

One interesting point to notice here is that females have higher training hours than the other groups.

In [None]:
# Mean of every numeric feature per (gender, target) pair, shown as a line chart.
# `values` is restricted to numeric columns (excluding the index keys) because the default
# mean aggregation raises on object/categorical columns in recent pandas releases.
index_cols = ['gender', 'target']
value_cols = [c for c in train.select_dtypes(include='number').columns if c not in index_cols]
table = pd.pivot_table(train, index=index_cols, values=value_cols)
table.plot(kind='line')

Observations:

1. Those who want to change jobs have fewer training hours than
those who don't, in all three categories.

2. Those who want to change jobs have less experience than
those who don't, in all three categories.
 

In [None]:
# Mean of every numeric feature per (gender, target, education_level) group.
# `values` is restricted to numeric columns (excluding the index keys) because the default
# mean aggregation raises on object/categorical columns in recent pandas releases.
index_cols = ['gender', 'target', 'education_level']
value_cols = [c for c in train.select_dtypes(include='number').columns if c not in index_cols]
table = pd.pivot_table(train, index=index_cols, values=value_cols)
print(table)
table.plot(kind='bar',
           figsize=(15, 8),
           colormap='RdGy')

Observations:

1. Females who want to change jobs have much higher training
hours than the others, hold a PhD, and also have much more experience.

2. In the "Other" category, only graduates want to change jobs.

In [None]:
# Sum of target per (gender, education_level) group, shown as a horizontal bar chart.
group_keys = ['gender', 'education_level']
table = pd.pivot_table(train, index=group_keys, aggfunc={'target': np.sum})
print(table)
table.plot(kind='barh', figsize=(15, 8), colormap='tab10_r',
           title='Gender and Educatiuon lebel Relationship')

Observations:

1. Among females, graduates are the largest group looking to change jobs.

2. The same holds for males: graduates show the strongest intent to look for a job.

In [None]:
# Sum of target per (gender, enrolled_university, education_level) group, shown as a bar chart.
group_keys = ['gender', 'enrolled_university', 'education_level']
table = pd.pivot_table(train, index=group_keys, aggfunc={'target': np.sum})
print(table)
table.plot(kind='bar', figsize=(15, 8), colormap='tab10_r',
           title='Gender and Educatiuon lebel and Enrolled University Relationship')

Observations:

1. Male graduate candidates with no enrollment are much more
interested in a job change than part-time and full-time students. The same holds for females.

2. A tree-based algorithm could be handy for this particular case.

In [None]:
# Sum of target per (company_type, last_new_job) group, shown as a bar chart.
group_keys = ['company_type', 'last_new_job']
table = pd.pivot_table(train, index=group_keys, aggfunc={'target': np.sum})
print(table)
table.plot(kind='bar', figsize=(15, 8), colormap='tab10_r',
           title='Company_type and last_new_job Relationship')

Observations:

1. Across all company types, candidates with 1 year of experience are the most interested
in changing jobs.

2. Candidates working in NGOs and the public sector with 5 years of experience are also partly interested in changing jobs.

In [None]:
# Sum of target per last_new_job value, with one column per enrolled_university category.
table = pd.pivot_table(
    train,
    index=['last_new_job'],
    columns=['enrolled_university'],
    aggfunc={'target': np.sum},
)
print(table)
table.plot(kind='bar', figsize=(15, 8), colormap='Set1_r',
           title='Enrolled and last_new_job Relationship')

Observations:
1. Candidates who are enrolled in a part-time course are less interested in changing jobs.

2. Candidates enrolled in a university and having 1 year of experience are the most interested in changing jobs.

In [None]:
# Sum of target per (major_discipline, gender) group, with one column per enrolled_university category.
row_keys = ['major_discipline', 'gender']
table = pd.pivot_table(
    train,
    index=row_keys,
    columns=['enrolled_university'],
    values=['target'],
    aggfunc=np.sum,
)
print(table)
table.plot(kind='bar', figsize=(15, 8), colormap='Set2_r',
           title='Enrolled and major discipline Relationship')

# Encoding The columns

In [None]:
df = train.copy()

In [None]:
# One-hot encode the nominal columns, dropping the first level of each.
nominal_columns = ['gender', 'relevent_experience', 'enrolled_university', 'major_discipline', 'company_type']
df = pd.get_dummies(df, columns=nominal_columns, drop_first=True)

Since education_level is an ordinal categorical feature, we have to map each level to a number while keeping the order in mind.


In [None]:
# Ordinal encoding: Graduate < Masters < Phd.
education_rank = {'Graduate': 0, 'Masters': 1, 'Phd': 2}
df['education_level'] = df['education_level'].map(education_rank).astype(int)


In [None]:
# Target vector and feature matrix (every column except the target).
y = df['target']
X = df.drop(columns=['target'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. The target is imbalanced, so the split is
# stratified on y to keep the class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training part only, then apply the same transform to both parts.
ScalerX = StandardScaler().fit(X_train)
X_train = ScalerX.transform(X_train)
X_test = ScalerX.transform(X_test)

# Handling imbalance data using SMOTE based techniques

### A) SMOTE Technique

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

counter = Counter(y_train)
print('Before', counter)
# Oversample the training set with SMOTE. A fixed random_state makes the
# synthetic samples reproducible across runs.
smt = SMOTE(random_state=42)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

counter = Counter(y_train_sm)
print('After', counter)

### B) ADASYN Technique

In [None]:
from imblearn.over_sampling import ADASYN

# Class counts before and after oversampling the training set with ADASYN.
print('Before', Counter(y_train))
ada = ADASYN(random_state=130)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

counter = Counter(y_train_ada)
print('After', counter)


### C) Hybrid Techniques


#### C.1) SMOTE + Tomek Links

In [None]:
from imblearn.combine import SMOTETomek

# Class counts before and after resampling the training set with SMOTE + Tomek links.
print('Before', Counter(y_train))
smtom = SMOTETomek(random_state=139)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

counter = Counter(y_train_smtom)
print('After', counter)


### C.2) SMOTE + ENN


In [None]:
from imblearn.combine import SMOTEENN

counter = Counter(y_train)
print('Before', counter)
# Resample the training set with SMOTE + Edited Nearest Neighbours. A fixed
# random_state makes the result reproducible across runs.
smenn = SMOTEENN(random_state=42)
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)

counter = Counter(y_train_smenn)
print('After', counter)

### C.3) SMOTE + Under Sampling 

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
# SMOTE oversampling (target ratio 0.3) followed by random undersampling (target ratio 0.4).
# Both steps are seeded so the resampled training set is reproducible across runs.
sm = SMOTE(sampling_strategy=.3, random_state=42)
rus = RandomUnderSampler(sampling_strategy=.4, random_state=42)

pipeline = Pipeline(steps=[('smote', sm), ('under', rus)])

counter = Counter(y_train)
print('Before', counter)
# Over- and undersample the training set with the two-step pipeline.
X_train_smrus, y_train_smrus = pipeline.fit_resample(X_train, y_train)

counter = Counter(y_train_smrus)
print('After', counter)

# Model Building

In [None]:
# Six independent, initially empty lists that test_eval appends one entry to per evaluation.
model, resample, precision, recall, F1score, AUCROC = ([] for _ in range(6))

In [None]:
def test_eval(clf_model, X_test, y_test, algo=None, sampling=None):
    """Evaluate a fitted classifier on the test set and record its metrics.

    Plots the confusion matrix, prints the classification report and the
    AUC-ROC, then appends one entry to each of the module-level lists
    `model`, `resample`, `precision`, `recall`, `F1score` and `AUCROC`.

    Parameters
    ----------
    clf_model : fitted estimator (or fitted search object) exposing `predict`
        and either `predict_proba` or `decision_function`.
    X_test, y_test : test features and true labels.
    algo : label stored in `model` (name of the algorithm).
    sampling : label stored in `resample` (name of the resampling technique).
    """
    y_pred = clf_model.predict(X_test)
    # Ranking scores for AUC-ROC: the second predict_proba column when available;
    # otherwise the decision_function output, for estimators that expose no probabilities.
    if hasattr(clf_model, 'predict_proba'):
        y_score = clf_model.predict_proba(X_test)[:, 1]
    else:
        y_score = clf_model.decision_function(X_test)
    auc = roc_auc_score(y_test, y_score)

    print('Confusion Matrix')
    print('='*60)
    plot_confusion_matrix(clf_model, X_test, y_test)
    plt.show()
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test, y_pred), "\n")
    print('AUC-ROC')
    print('='*60)
    print(auc)

    # Record one row of results for the final comparison table.
    model.append(algo)
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    F1score.append(f1_score(y_test, y_pred))
    AUCROC.append(auc)
    resample.append(sampling)


# Model-1: Logistic Regression

### 1. Original Unsampled Data

In [None]:
# The grid contains both 'l1' and 'l2' penalties. The default lbfgs solver does not
# support 'l1', so every l1 candidate would fail to fit; liblinear supports both.
log_model = LogisticRegression(solver='liblinear')

params = {'C': np.logspace(-10, 1, 15), 'class_weight': [None, 'balanced'], 'penalty': ['l1', 'l2']}

# Shuffled, seeded 5-fold stratified CV; also reused by the later model searches.
cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True)

# Exhaustive grid search scored by ROC-AUC
clf_LR = GridSearchCV(log_model, params, cv=cv, scoring='roc_auc', n_jobs=-1)
clf_LR.fit(X_train, y_train)
clf_LR.best_estimator_

In [None]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'actual')

### 2.SMOTE Resampling

In [None]:
clf_LR.fit(X_train_sm, y_train_sm)
clf_LR.best_estimator_

In [None]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'smote')

### 3.ADASYN Resampling

In [None]:
clf_LR.fit(X_train_ada, y_train_ada)
clf_LR.best_estimator_


In [None]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'adasyn')

### 4.SMOTE + Tomek Resampling

In [None]:
clf_LR.fit(X_train_smtom, y_train_smtom)
clf_LR.best_estimator_

In [None]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'smote+tomek')

### 5.SMOTE + ENN Resampling

In [None]:
clf_LR.fit(X_train_smenn, y_train_smenn)
clf_LR.best_estimator_


In [None]:

test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'smote+enn')

### 6. SMOTE + Under Sampling

In [None]:
clf_LR.fit(X_train_smrus, y_train_smrus)
clf_LR.best_estimator_


In [None]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'smote+rus')

# Model-2: Decision Tree

In [None]:
# Number of trees (used by the random forest search):
estimators = [2, 10, 30, 50, 100]
# Maximum depth of each tree:
max_depth = list(range(5, 16, 2))
# Minimum number of samples required to split a node:
min_samples_split = [2, 5, 10, 15, 20, 50, 100]
# Minimum number of samples required at each leaf node:
min_samples_leaf = [1, 2, 5]
# Impurity criterion:
criterion = ['gini', 'entropy']
# Number of features to consider when looking for the best split.
# 'auto' is left out: for classifiers it is the same as 'sqrt', and newer
# scikit-learn releases no longer accept it.
max_features = ['log2', 'sqrt']


### 1. Original Unsampled Data

In [None]:
# Seeded so the fitted tree (and its reported metrics) are reproducible.
tree_model = DecisionTreeClassifier(random_state=42)
tree_param_grid = {
    'max_features': max_features,
    'criterion': criterion,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

# Randomised search scored by ROC-AUC; random_state fixes which candidates are sampled.
clf_DT = RandomizedSearchCV(tree_model, tree_param_grid, cv=cv, scoring='roc_auc',
                            n_jobs=-1, verbose=2, random_state=42)
clf_DT.fit(X_train, y_train)
clf_DT.best_estimator_

In [None]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree', 'actual')

### 2.SMOTE Resampling

In [None]:
clf_DT.fit(X_train_sm, y_train_sm)
clf_DT.best_estimator_

In [None]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree', 'smote')

### 3.ADASYN Resampling

In [None]:
clf_DT.fit(X_train_ada, y_train_ada)
clf_DT.best_estimator_


In [None]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree', 'adasyn')

### 4. SMOTE + Tomek Resampling

In [None]:
clf_DT.fit(X_train_smtom, y_train_smtom)
clf_DT.best_estimator_

In [None]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree', 'smote+tomek')

### 5.SMOTE + ENN Resampling

In [None]:
clf_DT.fit(X_train_smenn, y_train_smenn)
clf_DT.best_estimator_

In [None]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree', 'smote+enn')

### 6. SMOTE + Under Sampling

In [None]:
clf_DT.fit(X_train_smrus, y_train_smrus)
clf_DT.best_estimator_

In [None]:
# Evaluate the decision-tree search that was just refit on the SMOTE + undersampled data.
test_eval(clf_DT, X_test, y_test, 'Decision Tree', 'smote+rus')

# Model-3: Random Forest

### 1. Original Unsampled Data

In [None]:
# Seeded so the fitted forest (and its reported metrics) are reproducible.
rf_model = RandomForestClassifier(random_state=42)

rf_params = {'n_estimators': estimators,
             'max_features': max_features,
             'criterion': criterion,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf}

# Randomised search over 20 candidates scored by ROC-AUC; random_state fixes which candidates are sampled.
clf_RF = RandomizedSearchCV(rf_model, rf_params, cv=cv, scoring='roc_auc',
                            n_jobs=-1, n_iter=20, verbose=2, random_state=42)
clf_RF.fit(X_train, y_train)
clf_RF.best_estimator_

In [None]:
test_eval(clf_RF, X_test, y_test, 'Random Forest', 'actual')

### 2.SMOTE Resampling

In [None]:
clf_RF.fit(X_train_sm, y_train_sm)
clf_RF.best_estimator_

In [None]:
test_eval(clf_RF, X_test, y_test, 'Random Forest', 'smote')

### 3.ADASYN Resampling

In [None]:
clf_RF.fit(X_train_ada, y_train_ada)
clf_RF.best_estimator_

In [None]:
test_eval(clf_RF, X_test, y_test, 'Random Forest', 'adasyn')

### 4. SMOTE + Tomek Resampling

In [None]:
clf_RF.fit(X_train_smtom, y_train_smtom)
clf_RF.best_estimator_

In [None]:
test_eval(clf_RF, X_test, y_test, 'Random Forest', 'smote+tomek')

### 5. SMOTE + ENN Resampling

In [None]:
clf_RF.fit(X_train_smenn, y_train_smenn)
clf_RF.best_estimator_

In [None]:
test_eval(clf_RF, X_test, y_test, 'Random Forest', 'smote+enn')

### 6. SMOTE + Under Sampling

In [None]:
clf_RF.fit(X_train_smrus, y_train_smrus)
clf_RF.best_estimator_

In [None]:
# Evaluate the random-forest search that was just refit on the SMOTE + undersampled data.
test_eval(clf_RF, X_test, y_test, 'Random Forest', 'smote+rus')

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Model-4: AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default settings; seeded so repeated runs give the same model.
clf_ada = AdaBoostClassifier(random_state=42)
clf_ada.fit(X_train, y_train)
# Evaluate on the original (unresampled) training data
test_eval(clf_ada, X_test, y_test, 'AdaBoast', 'actual')

In [None]:
# Refit AdaBoost on every resampled training set and evaluate on the same test set.
# The sampling labels are lower-case to match the other models and the `order`
# list of the final comparison plot ('Smote' would be left out of that plot).
resampled_sets = [
    ('smote', X_train_sm, y_train_sm),
    ('adasyn', X_train_ada, y_train_ada),
    ('smote+tomek', X_train_smtom, y_train_smtom),
    ('smote+enn', X_train_smenn, y_train_smenn),
    ('smote+rus', X_train_smrus, y_train_smrus),
]
for sampling_name, X_res, y_res in resampled_sets:
    clf_ada.fit(X_res, y_res)
    test_eval(clf_ada, X_test, y_test, 'AdaBoast', sampling_name)



# Model-5: GradientBoost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seeded so the fitted ensemble (and its reported metrics) are reproducible.
gb_model = GradientBoostingClassifier(random_state=42)

gb_params = {
    "n_estimators": [1, 3, 5, 10, 15, 20, 30, 40, 50],
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

# Randomised search scored by ROC-AUC; random_state fixes which candidates are sampled.
clf_gb = RandomizedSearchCV(gb_model, gb_params, cv=cv, scoring='roc_auc', n_jobs=1, random_state=42)

clf_gb.fit(X_train, y_train)
clf_gb.best_estimator_


In [None]:
# Actual data: clf_gb was already fit on the original training set in the previous cell.
test_eval(clf_gb, X_test, y_test, 'GradientBoast', 'actual')

# Rerun the search on every resampled training set and evaluate on the same test set.
# The sampling labels are lower-case to match the other models and the `order`
# list of the final comparison plot ('Smote' would be left out of that plot).
resampled_sets = [
    ('smote', X_train_sm, y_train_sm),
    ('adasyn', X_train_ada, y_train_ada),
    ('smote+tomek', X_train_smtom, y_train_smtom),
    ('smote+enn', X_train_smenn, y_train_smenn),
    ('smote+rus', X_train_smrus, y_train_smrus),
]
for sampling_name, X_res, y_res in resampled_sets:
    clf_gb.fit(X_res, y_res)
    test_eval(clf_gb, X_test, y_test, 'GradientBoast', sampling_name)


# Model-6: SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
# NOTE(review): the "log" loss and "none" penalty spellings may be rejected by newer
# scikit-learn releases — confirm against the installed version.
sgd_params = {
    "loss": ["hinge", "log", "squared_hinge", "modified_huber"],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "penalty": ["l2", "l1", "none"],
}

# Seeded so the stochastic optimiser gives the same model on every run.
sgd_model = SGDClassifier(random_state=42)
# Randomised search scored by ROC-AUC; random_state fixes which candidates are sampled.
clf_sgd = RandomizedSearchCV(sgd_model, sgd_params, cv=cv, scoring='roc_auc', n_jobs=1, random_state=42)

clf_sgd.fit(X_train, y_train)
clf_sgd.best_estimator_


In [None]:
# Actual data: clf_sgd was already fit on the original training set in the previous cell.
test_eval(clf_sgd, X_test, y_test, 'SGDClassifier', 'actual')

# Rerun the search on every resampled training set and evaluate on the same test set.
# The sampling labels are lower-case to match the other models and the `order`
# list of the final comparison plot ('Smote' would be left out of that plot).
resampled_sets = [
    ('smote', X_train_sm, y_train_sm),
    ('adasyn', X_train_ada, y_train_ada),
    ('smote+tomek', X_train_smtom, y_train_smtom),
    ('smote+enn', X_train_smenn, y_train_smenn),
    ('smote+rus', X_train_smrus, y_train_smrus),
]
for sampling_name, X_res, y_res in resampled_sets:
    clf_sgd.fit(X_res, y_res)
    test_eval(clf_sgd, X_test, y_test, 'SGDClassifier', sampling_name)


# Model-7: LGBMClassifier

In [None]:
import lightgbm as lgbm

In [None]:

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

lgb_model = lgbm.LGBMClassifier()
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
# NOTE(review): the LightGBM parameter docs list bagging_fraction/subsample,
# feature_fraction/colsample_bytree and min_data_in_leaf/min_child_samples as aliases
# of each other — confirm which of each pair should actually be tuned.
lgb_params = {'num_leaves': sp_randint(6, 50),
              'min_child_samples': sp_randint(100, 200),
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'subsample': sp_uniform(loc=0.2, scale=0.8),
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
              # Fractions must stay within (0, 1]; scale=0.5 keeps the samples in
              # [0.5, 1.0] (the previous scale of 0.8 reached 1.3).
              'bagging_fraction': sp_uniform(loc=0.5, scale=0.5),
              'feature_fraction': sp_uniform(loc=0.5, scale=0.5),
              'max_depth': sp_randint(10, 13),
              'min_data_in_leaf': sp_randint(50, 80)}
# Randomised search scored by ROC-AUC; random_state fixes which candidates are sampled.
clf_lgb = RandomizedSearchCV(lgb_model, lgb_params, cv=cv, scoring='roc_auc', n_jobs=1, random_state=42)

clf_lgb.fit(X_train, y_train)
clf_lgb.best_estimator_


In [None]:
# Actual data: clf_lgb was already fit on the original training set in the previous cell.
test_eval(clf_lgb, X_test, y_test, 'LGBMClassifier', 'actual')

# Rerun the search on every resampled training set and evaluate on the same test set.
# The sampling labels are lower-case to match the other models and the `order`
# list of the final comparison plot ('Smote' would be left out of that plot).
resampled_sets = [
    ('smote', X_train_sm, y_train_sm),
    ('adasyn', X_train_ada, y_train_ada),
    ('smote+tomek', X_train_smtom, y_train_smtom),
    ('smote+enn', X_train_smenn, y_train_smenn),
    ('smote+rus', X_train_smrus, y_train_smrus),
]
for sampling_name, X_res, y_res in resampled_sets:
    clf_lgb.fit(X_res, y_res)
    test_eval(clf_lgb, X_test, y_test, 'LGBMClassifier', sampling_name)


# Model-8: MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier
# MLP with default settings; seeded so the weight initialisation
# (and therefore the reported metrics) is reproducible.
clf_mlp = MLPClassifier(random_state=42)
clf_mlp.fit(X_train, y_train)


In [None]:
# Actual data: clf_mlp was already fit on the original training set in the previous cell.
test_eval(clf_mlp, X_test, y_test, 'MLPClassifier', 'actual')

# Refit the MLP on every resampled training set and evaluate on the same test set.
# The sampling labels are lower-case to match the other models and the `order`
# list of the final comparison plot ('Smote' would be left out of that plot).
resampled_sets = [
    ('smote', X_train_sm, y_train_sm),
    ('adasyn', X_train_ada, y_train_ada),
    ('smote+tomek', X_train_smtom, y_train_smtom),
    ('smote+enn', X_train_smenn, y_train_smenn),
    ('smote+rus', X_train_smrus, y_train_smrus),
]
for sampling_name, X_res, y_res in resampled_sets:
    clf_mlp.fit(X_res, y_res)
    test_eval(clf_mlp, X_test, y_test, 'MLPClassifier', sampling_name)


# Model Comparison

In [None]:
# Assemble the metrics collected by test_eval into one comparison table (one row per evaluation).
metric_columns = {
    'model': model,
    'resample': resample,
    'precision': precision,
    'recall': recall,
    'f1-score': F1score,
    'AUC-ROC': AUCROC,
}
clf_eval_df = pd.DataFrame(metric_columns)
clf_eval_df

In [None]:
# One bar-plot panel per model: precision for each resampling technique.
sns.set(font_scale=1.2)
resample_order = ["actual", "smote", "adasyn", "smote+tomek", "smote+enn", "smote+rus"]
g = sns.FacetGrid(clf_eval_df, col="model", height=5)
g.map(sns.barplot, "resample", "precision", palette='twilight', order=resample_order)
g.set_xticklabels(rotation=30)
g.set_xlabels(' ', fontsize=14)
g.set_ylabels('Precision', fontsize=14)