In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
from plotly.figure_factory import create_table
import plotly.express as px
import plotly.graph_objs as go
import plotly
from plotly import tools
import warnings
# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Initialise plotly's notebook mode so figures can be shown with iplot().
init_notebook_mode(connected=True)
# Let pandas display up to 100 columns of a frame.
pd.set_option('display.max_columns', 100)
from plotly.figure_factory import create_table
# Copy of the training data used only for the exploratory plots; the id column
# is dropped straight after loading.
df_eda = (
    pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
    .drop(columns=['enrollee_id'])
)


## **HR Analytics Data Scientists Job Change Prediction Model**

![](https://www.datocms-assets.com/14946/1596797558-da-vs-ds-diagram.png)

In this notebook I will build and optimize, step by step, a LightGBM model that tries to predict whether a data scientist is looking to change jobs or not.  

**Available features:**

enrollee_id : Unique ID for candidate  
city: City code
city_development_index : Development index of the city (scaled)  
gender: Gender of candidate  
relevent_experience: Relevant experience of candidate  
enrolled_university: Type of University course enrolled if any  
education_level: Education level of candidate  
major_discipline: Education major discipline of candidate  
experience: Candidate total experience in years  
company_size: No of employees in current employer's company  
company_type: Type of current employer  
last_new_job: Difference in years between previous job and current job  
training_hours: training hours completed  
target: 0 – Not looking for job change, 1 – Looking for a job change


# Exploratory data analysis

Three variables are numeric, every other one is categorical. 

In [None]:
# Summary statistics of the numeric columns, one row per column, rounded to 3 decimals.
df_eda.describe().transpose().round(3)

There are also some ordinal variables that we will encode into integers later.

In [None]:
# Count / unique / top / freq of the object-typed columns, one row per column.
df_eda.describe(include=['O']).transpose()

Converting the target variable in a Yes/No variable, for a better visualization

In [None]:
# Relabel the target for plotting: rows equal to 1 become "Yes", everything else "No".
df_eda["target"] = np.where(df_eda["target"] == 1, "Yes", "No")

In [None]:
# One semi-transparent histogram of city_development_index per target answer,
# drawn on top of each other.
histograms = [
    go.Histogram(
        x=df_eda.loc[df_eda['target'] == answer, 'city_development_index'],
        name=legend_label,
        opacity=0.55,
    )
    for answer, legend_label in (
        ('No', 'Does not want to change'),
        ('Yes', 'Wants to change'),
    )
]

fig = go.Figure(
    data=histograms,
    layout=go.Layout(
        barmode='overlay',
        title='City development index distribution',
        template="plotly_white",
    ),
)

# Final title, axis labels and font (this title replaces the one set in the layout above).
fig.update_layout(
    title="Distribution of data scientists in cities with different development index",
    xaxis_title="City development index",
    yaxis_title="Count",
    font=dict(family="Segoe UI", size=13),
)

iplot(fig)

In the most developed cities, fewer data scientists want to change their job.

In [None]:
def plot_bars(x):
    """Show a stacked horizontal bar chart of `df_eda` row counts per value of column `x`.

    Each bar is split and coloured by the `target` column. The columns
    "experience", "company_size" and "last_new_job" use an explicit category
    order; every other column is ordered by total count.

    Parameters
    ----------
    x : str
        Name of a column of the notebook-level `df_eda` frame.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`.
    """
    # Explicit category orders for the ordinal columns.
    ordinal_orders = {
        "experience": ['<1','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','>20'],
        "company_size": ['<10','10/49','50-99','100-500','500-999','1000-4999','5000-9999','10000+'],
        "last_new_job": ['never','1','2','3','4','>4'],
    }

    # Count rows per (category, target) pair directly. The previous version
    # added a helper column to a slice of df_eda, which triggers pandas'
    # SettingWithCopyWarning; groupby().size() yields the same counts.
    counts = df_eda.groupby([x, "target"]).size().reset_index(name="count")

    fig = px.bar(
        counts,
        y=x,
        x="count",
        color="target",
        title="Data scientists by " + x,
        template="plotly_white",
        width=700,
        height=500,
        color_discrete_sequence=px.colors.qualitative.Vivid_r,
    )

    if x in ordinal_orders:
        yaxis = {'categoryorder': 'array', 'categoryarray': ordinal_orders[x]}
    else:
        yaxis = {'categoryorder': 'total ascending'}

    fig.update_layout(
        barmode='stack',
        yaxis=yaxis,
        xaxis={'categoryorder': 'total descending'},
        xaxis_title="",
        yaxis_title="",
        # "experience" has many categories, so it gets a smaller font.
        font=dict(family="Segoe UI", size=10 if x == "experience" else 13),
    )

    fig.show()

In [None]:
# Plot every object-typed column except the first and the last one.
for column in df_eda.select_dtypes(include=['O']).columns[1:-1]:
    plot_bars(column)

Most of the data scientists in the sample: 
- Are male
- Have relevant experience
- Are not currently enrolled in a university course
- Are graduate in a STEM discipline
- Have more than 20 years of work experience
- Work in a private company
- Work in a medium sized company
- Got their new job in the last year

In [None]:
# Rows per (city, target) pair. groupby().size() replaces the former
# "add a count column to a slice, then sum" approach, which raised pandas'
# SettingWithCopyWarning while producing the same numbers.
df1 = df_eda.groupby(["city", "target"]).size().reset_index(name="count")

fig = px.bar(
    df1,
    x="city",
    y="count",
    color="target",
    title="Data scientists by " + "city",
    template="plotly_white",
    color_discrete_sequence=px.colors.qualitative.Vivid_r,
)
fig.update_layout(
    barmode='stack',
    xaxis={'categoryorder': 'total descending'},
    xaxis_title="",
    yaxis_title="",
    font=dict(family="Segoe UI", size=13),
)
fig.show()


The observations are not homogeneous: most of the data scientists live in a few cities. This will be important to know later.

# Feature engineering and data preparation

I'm merging train and test datasets to process the data. They will be divided later in the same way that kaggle provided them. 

In [None]:
# Load both provided files and stack them (train rows first, then test rows)
# so that the feature engineering below is applied to both at once.
df = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
test = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_test.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. ignore_index=True gives the stacked frame a
# clean 0..n-1 index instead of repeating the row labels of both files.
df = pd.concat([df, test], ignore_index=True)

**City variable**  
There are 123 unique cities, but most of the data scientists live in the first 10. I chose to set to "Other" every city that has fewer than 300 observations. This will result in fewer features after the one hot encoding.

In [None]:
# Number of rows sharing each row's city, computed once with value_counts().
# The previous version re-filtered the whole frame for every single row,
# which is quadratic in the number of rows.
city_counts = df["city"].map(df["city"].value_counts())
# Keep cities with at least 300 rows; everything else (including rows whose
# count is missing) becomes "Other". The mask is passed as a plain array so
# the replacement is positional.
df["city"] = df["city"].where((city_counts >= 300).to_numpy(), "Other")
city = pd.DataFrame(df["city"].value_counts())

In [None]:
# Rows per city after the rare cities were grouped into "Other".
city = df["city"].value_counts().to_frame()
print(city)

# Encoding ordinal variables  
Ordinal variables are categorical variables that have an order. I'm encoding them to use them as numerical variables.

In [None]:
# Ordinal encodings, one mapping per column. Codes are kept as strings here;
# the columns are converted to a numeric dtype in a later step.
#
# Each mapping is applied only to its own column. The previous version ran
# most replacements over the whole frame (so any column holding e.g. 'Graduate'
# or 'never' would have been rewritten too) and used
# `df['company_size'].replace(..., inplace=True)`, an in-place change on a
# selected column that pandas no longer propagates to the parent frame under
# copy-on-write.
ordinal_maps = {
    'relevent_experience': {
        'Has relevent experience': '1',
        'No relevent experience': '0',
    },
    'experience': {'<1': '0', '>20': '21'},
    'last_new_job': {'never': '0', '>4': '5'},
    'company_size': {
        '<10': '0',
        '10/49': '1',
        '50-99': '2',
        '100-500': '3',
        '500-999': '4',
        '1000-4999': '5',
        '5000-9999': '6',
        '10000+': '7',
    },
    'education_level': {
        'Primary School': '0',
        'High School': '1',
        'Graduate': '2',
        'Masters': '3',
        'Phd': '4',
    },
}

for column, mapping in ordinal_maps.items():
    df[column] = df[column].replace(mapping)


**Converting variables to the right type**  

The columns are still of type "object", so they need to be converted to an integer type. Int64 allows for NaN values.

In [None]:
# Cast each encoded column to float first and then to pandas' nullable Int64,
# in the same column order as before.
for column in (
    'relevent_experience',
    'experience',
    'last_new_job',
    'company_size',
    'education_level',
    'training_hours',
):
    df[column] = df[column].astype(float).astype("Int64")


# Missing values  
There are some missing values in the data that need to be imputed or dropped.

In [None]:
# Percentage of missing values per column of df.
# Built in one chain so nothing is assigned into a filtered slice (the
# previous version did, which triggers SettingWithCopyWarning).
missing = (
    df.isna()
    .mean()
    .mul(100)
    .rename("Missing")
    .reset_index()
)
# Keep every column that has at least one missing value. The former `> 1`
# filter on the raw count silently hid columns with exactly one missing entry.
missing = missing.loc[missing["Missing"] > 0]

fig = px.bar(
    missing,
    y="index",
    x="Missing",
    template="plotly_white",
    width=700,
    height=400,
    color_discrete_sequence=px.colors.qualitative.Vivid,
)

fig.update_layout(
    title="Percentage of missing values",
    xaxis_title="",
    yaxis_title="",
    yaxis={'categoryorder': 'total ascending'},
    font=dict(family="Segoe UI", size=13),
)

fig.show()

### Imputing missing values  
I'm using two imputers, one for the numerical and one for the categorical variables. After different attempts, I found that the most effective strategy is to impute every numerical variable with the mean of its column, and every categorical variable with its most frequent value.

In [None]:
# Default SimpleImputer for the numeric columns, most-frequent value for the
# categorical ones.
num_imputer = SimpleImputer()
cat_imputer = SimpleImputer(strategy="most_frequent")

# Feature columns only. 'target' is deliberately NOT listed: it is the label,
# and imputing it would write a fabricated value into the rows that come from
# the unlabeled test file.
df_num = ['city_development_index', 'relevent_experience', 'education_level', 'experience', 'company_size', 'last_new_job', 'training_hours']
df_cat = ['city', 'gender', 'enrolled_university', 'major_discipline', 'company_type']

df[df_num] = num_imputer.fit_transform(df[df_num])
df[df_cat] = cat_imputer.fit_transform(df[df_cat])

**One hot encoding categorical variables**  
After the one hot encoding there are 37 features in total.

In [None]:
# Expand every column listed in df_cat into indicator columns, then show the
# resulting (rows, columns) shape.
df = pd.get_dummies(data=df, columns=df_cat)
df.shape

**Dividing train and submission sets again**  
After the feature engineering the sets are divided again into the same splits in which they were provided. I also dropped the enrollee_id variable because it's not a feature.  
The X_sub dataset will be used later for the submission.

In [None]:
# Row counts of the two provided files, in the order they were stacked:
# training rows first, then the rows to predict for the submission.
N_TRAIN = 19158
N_SUB = 2129
assert len(df) == N_TRAIN + N_SUB, "unexpected number of rows in the merged frame"

features = df.drop(columns=['target', 'enrollee_id'])
feature_names = features.columns

# Take the submission rows from the FULL feature frame. The previous version
# truncated X to the training rows first and then called X.tail(2129), so
# X_sub silently contained the last 2129 training rows instead of the test rows.
X_sub = features.iloc[N_TRAIN:]
X = features.iloc[:N_TRAIN]
y = df['target'].iloc[:N_TRAIN]

# Splitting test and train datasets  
 
I am splitting a test/train dataset with a ratio of 80/20. The test dataset will be kept to evaluate performance on the model after.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labeled rows; the seed is fixed so the split is repeatable.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

# Model building

**LightGBM Classifier**  
![](https://repository-images.githubusercontent.com/64991887/dc855780-e34b-11ea-9ab8-e08ca33288b0)

My algorithm of choice is the Light Gradient Boosting for its speed and performance. LightGBM is a gradient boosting framework made by Microsoft that uses tree based learning algorithms.

![](https://miro.medium.com/max/1618/0*4nrDSJJcTHNjMjmb.png)

One of the main changes from all the other GBMs, like XGBoost, is the way tree is constructed. In LightGBM, a leaf-wise tree growth strategy is adopted. In LightGBM, the leaf-wise tree growth finds the leaves which will reduce the loss the maximum, and split only that leaf and not bother with the rest of the leaves in the same level. This results in an asymmetrical tree where subsequent splitting can very well happen only on one side of the tree.

Leaf-wise tree growth strategy tends to achieve lower loss compared to the level-wise growth strategy, but it also tends to overfit, especially on small datasets.

[For more information](https://deep-and-shallow.com/2020/02/21/the-gradient-boosters-iii-lightgbm/)

We will tune the parameters to assure that the model is not overfitting.


## Baseline model

This is the baseline model. I will compare it to the tuned model to see if parameter tuning improves it. We can immediately see that the recall score is not good, so the predictions for the Yes class are not very reliable. It's expected for imbalanced datasets, but it will be improved later.

Every score will be computed with an 8 fold cross validation with stratified splits.

**Defining CV splits**

In [None]:
# Splitter shared by every cross-validated score below: 8 stratified folds, no shuffling.
cv = StratifiedKFold(n_splits=8, shuffle=False)

**Training the model**

In [None]:
# Baseline: LightGBM with default parameters, evaluated on the hold-out split
# and with cross-validation on the training split.
lgbm = LGBMClassifier().fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
# ROC AUC ranks observations by score, so it must be computed from the
# predicted probability of the positive class. Feeding it the hard 0/1
# predictions (as before) collapses the curve to a single point and
# under-reports the AUC.
y_proba = lgbm.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
plot_confusion_matrix(lgbm, X_test, y_test)

scores = cross_val_score(lgbm, X_train, y_train, cv=cv, scoring='roc_auc')
# Built-in round() works whether the metric comes back as a Python or a NumPy float.
print("AUC score on test data ", round(roc_auc_score(y_test, y_proba), 3))
print("AUC score in each fold of a 8 fold cross validation: ")
print(scores)
print("Mean: ", scores.mean().round(3), "\nStandard deviation: ", scores.std().round(5))


The 8 fold cross validation shows some slight overfitting.

# Dealing with the unbalanced dataset

In [None]:
# Rows per target answer. The columns are named explicitly because the names
# produced by value_counts().reset_index() differ between pandas versions
# ('index'/'target' before 2.0, 'target'/'count' from 2.0 on).
balance = (
    df_eda['target']
    .value_counts()
    .rename_axis('answer')
    .reset_index(name='observations')
)

fig = px.pie(
    balance,
    names="answer",
    values="observations",
    template="plotly_white",
    width=700,
    height=400,
    color_discrete_sequence=px.colors.qualitative.Vivid,
)

# A pie chart has no axes, so only the title and font are configured.
fig.update_layout(
    title="Proportion of yes and no",
    font=dict(family="Segoe UI", size=13),
)

fig.show()

The dataset is unbalanced. Only 24.9% of the data scientists in the sample are looking to change their job.

## SMOTE + K Fold cross validation

Since the dataset is unbalanced, it might be beneficial to run SMOTE, which balances the classes by oversampling the minority class with synthetic observations.

**IMPORTANT: SMOTE must be applied only to the training portion of each fold of the K-fold CV. Resampling before splitting lets synthetic samples derived from validation rows leak into the training data, making the evaluation invalid.**

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.metrics import f1_score

kf = StratifiedKFold(n_splits=8)

for fold, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    X_train_fold = X_train.iloc[train_index]
    y_train_fold = y_train.iloc[train_index]
    X_valid_fold = X_train.iloc[valid_index]
    y_valid_fold = y_train.iloc[valid_index]

    # Oversample the training part of the fold only; the validation part is
    # left untouched. The seed makes the synthetic samples repeatable.
    sm = SMOTE(random_state=42)
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train_fold, y_train_fold)

    model = LGBMClassifier()
    model.fit(X_train_oversampled, y_train_oversampled)

    # Score on this fold's own validation rows. The previous version scored
    # every fold on the global hold-out set, so the validation folds were
    # never used and the hold-out set was evaluated eight times.
    fold_pred = model.predict(X_valid_fold)
    fold_proba = model.predict_proba(X_valid_fold)[:, 1]

    print(f'For fold {fold}:')
    print(f'Accuracy: {round(model.score(X_valid_fold, y_valid_fold), 3)}')
    print(f'f-score: {round(f1_score(y_valid_fold, fold_pred), 3)}')
    # AUC is computed from probabilities, not from hard class predictions.
    print(f'AUC: {round(roc_auc_score(y_valid_fold, fold_proba), 3)}')

There is apparently no improvement. We'll go on with the unbalanced dataset using the **is_unbalance = True** parameter in LightGBM.

### A parameter for unbalanced datasets: `is_unbalance`

In [None]:
# Same evaluation as the baseline, with LightGBM's is_unbalance option switched on.
# y_train is passed as is; the former `.ravel()` call is not needed and
# Series.ravel is deprecated in recent pandas.
lgbm = LGBMClassifier(is_unbalance=True).fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
# ROC AUC needs the predicted probability of the positive class; computing it
# from the hard 0/1 predictions (as before) under-reports it.
y_proba = lgbm.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
plot_confusion_matrix(lgbm, X_test, y_test)

scores = cross_val_score(lgbm, X_train, y_train, cv=cv, scoring='roc_auc')
print("AUC score on test data ", round(roc_auc_score(y_test, y_proba), 3))
print("AUC score in each fold of a 8 fold cross validation: ")
print(scores)
print("Mean: ", scores.mean().round(3), "\nStandard deviation: ", scores.std().round(5))


Adding this parameter yielded very good results. The AUC on test data went from 0.71 to 0.76. But most importantly, the recall score improved a lot: it went from 0.55 to 0.73. The precision for the majority class also increased.  
The AUC on the KFold splits is to see if the model overfits, while the AUC on test data is useful to understand how the model behaves on out of sample data. 

# Features importance  

## Which variables influence data scientists the most?  

In the following plot we can see which are the most important features for the model predictions. 

In [None]:
# (importance, feature name) pairs of the fitted lgbm model, sorted ascending.
importance_pairs = sorted(zip(lgbm.feature_importances_, X.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])

fig = px.bar(
    feature_imp,
    x="Feature",
    y="Value",
    template="plotly_white",
    width=700,
    height=500,
    color_discrete_sequence=px.colors.qualitative.Vivid,
)

# Bars are ordered from the largest to the smallest total.
fig.update_layout(
    title="Feature importance",
    xaxis_title="",
    yaxis_title="",
    xaxis={'categoryorder': 'total descending'},
    font=dict(family="Segoe UI", size=11),
)

fig.show()


It looks like the variables that most influence data scientists in their decision to change jobs are training hours, experience, and city development index.

# Feature selection

It looks like there are features with very low importance. With the following loop I'm testing each threshold for removing the least important features. It will make the model faster and potentially more precise.

In [None]:
# For each importance threshold 0..99, keep the features selected by
# SelectFromModel on the already fitted `lgbm` and record the mean and
# standard deviation of the cross-validated ROC AUC.
#
# The previous version also fitted a model on the full training split and
# predicted the hold-out set on every iteration without using the result
# (and overwrote `y_pred` while doing so); only the cross-validation matters
# here. cross_val_score clones the estimator it is given, so an unfitted
# classifier with the same settings is passed.
sweep_records = []
for x in range(0, 100):
    selection = SelectFromModel(lgbm, threshold=x, prefit=True)
    select_X_train = selection.transform(X_train)

    selection_model = LGBMClassifier(is_unbalance=True)
    score = cross_val_score(selection_model, select_X_train, y_train, cv=cv, scoring='roc_auc')

    sweep_records.append({'Threshold': x, 'AUC': score.mean(), 'SD': score.std()})

scores = pd.DataFrame(sweep_records, columns=['Threshold', 'AUC', 'SD'])

In [None]:
import plotly.express as px

# Mean cross-validated AUC for each feature-selection threshold.
fig = px.line(
    data_frame=scores,
    x="Threshold",
    y="AUC",
    width=800,
    height=400,
    template="plotly_white",
    color_discrete_sequence=px.colors.qualitative.Vivid,
)

layout_options = dict(
    title="AUC change with different feature selection thresholds",
    xaxis_title="",
    yaxis_title="",
    font=dict(family="Segoe UI", size=10),
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
import plotly.express as px

# Standard deviation of the fold AUCs for each feature-selection threshold.
fig = px.line(
    data_frame=scores,
    x="Threshold",
    y="SD",
    width=800,
    height=400,
    template="plotly_white",
    color_discrete_sequence=px.colors.qualitative.Vivid,
)

layout_options = dict(
    title="8 fold standard deviation change with different feature selection thresholds",
    xaxis_title="",
    yaxis_title="",
    font=dict(family="Segoe UI", size=10),
)
fig.update_layout(**layout_options)

fig.show()

The graphs show the best threshold for removing features. Doing this will result in a faster and better-performing model.

**Filtering the features and retraining the model with 18 as feature threshold**

In [None]:
# Selector built on the already fitted lgbm model with an importance threshold
# of 18, applied to both the training and the hold-out features.
selection = SelectFromModel(estimator=lgbm, threshold=18, prefit=True)
select_X_train, select_X_test = (
    selection.transform(features_part) for features_part in (X_train, X_test)
)

In [None]:
# Retrain on the selected features and evaluate exactly like the earlier models.
selection_model = LGBMClassifier(is_unbalance=True)
selection_model.fit(select_X_train, y_train)

y_pred = selection_model.predict(select_X_test)
# ROC AUC needs the predicted probability of the positive class; computing it
# from the hard 0/1 predictions (as before) under-reports it.
y_proba = selection_model.predict_proba(select_X_test)[:, 1]

plot_confusion_matrix(selection_model, select_X_test, y_test)
print(classification_report(y_test, y_pred))

scores = cross_val_score(selection_model, select_X_train, y_train, cv=cv, scoring='roc_auc')
print("AUC score on test data ", round(roc_auc_score(y_test, y_proba), 3))
print("AUC score in each fold of a 8 fold cross validation: ")
print(scores)
print("Mean: ", scores.mean().round(3), "\nStandard deviation: ", scores.std().round(5))

Removing less important features didn't decrease the AUC, but decreased the variance and made the model faster. 

# Hyperparameters optimization

## Tuning `max_depth` and `num_leaves`

Those are the most sensitive parameters in gradient boosting and need to be tuned first.
`max_depth` is the parameter that dictates the maximum depth that each tree in a boosting round can grow to. 
`num_leaves` is the maximum number of leaves a tree can have.  

Both are useful to control overfitting.

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over max_depth = 1..19, scored by cross-validated ROC AUC.
clf = LGBMClassifier(is_unbalance=True)
param_grid = {'max_depth': range(1, 20)}

rs_clf = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    refit=False,
    n_jobs=-1,
    verbose=2,
)
rs_clf.fit(select_X_train, y_train)

best_score, best_params = rs_clf.best_score_, rs_clf.best_params_
print(f"Best score: {best_score}")
print("Best params: ")
for param_name in sorted(best_params):
    print(f"{param_name}: {best_params[param_name]!r}")

The LightGBM documentation suggests keeping `num_leaves` below $2^{\text{max\_depth}}$, i.e. at most $2^{\text{max\_depth}} - 1$.

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over num_leaves = 2..14 with max_depth fixed at 4,
# scored by cross-validated ROC AUC.
clf = LGBMClassifier(max_depth=4, is_unbalance=True)
param_grid = {'num_leaves': range(2, 15)}

rs_clf = GridSearchCV(
    clf,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    refit=False,
    n_jobs=-1,
    verbose=2,
)
rs_clf.fit(select_X_train, y_train)

best_score, best_params = rs_clf.best_score_, rs_clf.best_params_
print(f"Best score: {best_score}")
print("Best params: ")
for param_name in sorted(best_params):
    print(f"{param_name}: {best_params[param_name]!r}")

## Tuning other hyperparameters with RandomizedSearchCV


Randomized search is faster than grid search because it doesn't try every possible value; it takes random values from the distributions I specified. It also performs a built-in 8 fold cross validation.

I'm manually setting the parameters we've found so far, and I'll search for the others. For the search I will use the default learning rate of 0.1 so it will be faster.

The parameters I choose to tune are:
- `min_data_in_leaf`: This is a way to reduce overfitting. It requires each leaf to have the specified number of observations so that the model does not become too specific.
- `min_gain_to_split`: When adding a new tree node, LightGBM chooses the split point that has the largest gain. Gain is basically the reduction in training loss that results from adding a split point. 
- Adding some regularization through `lambda_l2`.

In [None]:
# Random search over the split-gain, L2 and leaf-size parameters, with the
# tree-shape parameters fixed to the values chosen above.
clf = LGBMClassifier(is_unbalance=True, max_depth=4, num_leaves=9, feature_fraction_seed=42)

param_grid = {
    'min_gain_to_split': scipy.stats.uniform(0, 3),
    'lambda_l2': scipy.stats.uniform(0, 3),
    'min_data_in_leaf': scipy.stats.randint(30, 150),
}

# scoring='roc_auc' was missing here, so this search ranked candidates by the
# estimator's default score (accuracy) while every other search in the
# notebook optimises ROC AUC. refit=False matches the other searches and skips
# an unused final fit; best_score_ and best_params_ are still available.
rs_clf = RandomizedSearchCV(
    clf,
    param_grid,
    n_jobs=-1,
    verbose=2,
    cv=cv,
    scoring='roc_auc',
    refit=False,
    random_state=42,
    n_iter=1000,
)
print("Randomized search..")
search_time_start = time.time()
rs_clf.fit(select_X_train, y_train)
print("Randomized search time:", time.time() - search_time_start)

best_score = rs_clf.best_score_
best_params = rs_clf.best_params_
print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))

## Tuning `learning_rate`

The learning rate is a tuning parameter in an optimization algorithm that determines the step size at each iteration while moving toward a minimum of a loss function. A higher learning rate means a faster model, at the risk of being less precise and of overfitting. Setting the learning rate too low with a fixed number of boosting rounds might lead to underfitting.

In [None]:
# Random search over learning_rate only; every other parameter is fixed to the
# values found so far.
fixed_params = dict(
    is_unbalance=True,
    max_depth=4,
    num_leaves=9,
    feature_fraction_seed=42,
)
clf = LGBMClassifier(**fixed_params, **best_params)

param_grid = {'learning_rate': scipy.stats.uniform(0, 0.3)}

rs_clf = RandomizedSearchCV(
    clf,
    param_grid,
    n_iter=100,
    scoring='roc_auc',
    cv=cv,
    refit=False,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
print("Randomized search..")
search_time_start = time.time()
rs_clf.fit(select_X_train, y_train)
print("Randomized search time:", time.time() - search_time_start)

best_score, best_learning_rate = rs_clf.best_score_, rs_clf.best_params_
print(f"Best score: {best_score}")
print("Best params: ")
for param_name in sorted(best_learning_rate):
    print(f"{param_name}: {best_learning_rate[param_name]!r}")

# The final model  

Random search optimization further improved the mean AUC score but slightly increased the standard deviation. Let's retrain it with the best parameters found.

In [None]:
# Final classifier: fixed tree-shape settings plus the parameters found by the
# two random searches.
final_model = LGBMClassifier(
    is_unbalance=True,
    max_depth=4,
    num_leaves=9,
    feature_fraction_seed=42,
    **best_params,
    **best_learning_rate,
)
final_model.fit(select_X_train, y_train)

# Hold-out predictions and cross-validated AUC, reported in the evaluation cell.
y_pred = final_model.predict(select_X_test)
scores = cross_val_score(final_model, select_X_train, y_train, cv=cv, scoring='roc_auc')


The hyperparameter tuning resulted in some improvement in the recall score for the minority class: this is important for an imbalanced dataset.

The mean AUC on the cross validation folds, and the AUC on test data also increased.

# Model evaluation

In [None]:
# Evaluate the FINAL model on the hold-out split.
# The confusion matrix was previously drawn for `selection_model` (the
# untuned model) while the report below described `final_model`; both now use
# the final model. Predictions are recomputed here so the cell does not depend
# on whatever `y_pred` last held.
y_pred = final_model.predict(select_X_test)
final_proba = final_model.predict_proba(select_X_test)[:, 1]

plot_confusion_matrix(final_model, select_X_test, y_test)
print(classification_report(y_test, y_pred))
# ROC AUC is computed from the positive-class probabilities, not from the
# hard 0/1 predictions.
print("AUC score on test data ", round(roc_auc_score(y_test, final_proba), 3))
print("AUC score in each fold of a 8 fold cross validation: ")
print(scores)
print("Mean: ", scores.mean().round(3), "\nStandard deviation: ", scores.std().round(5))

The report shows that:
- 91% of the predicted No are actually No. 53% of the predicted Yes are actually Yes. 
- 78% of the total No are predicted as No. 75% of the total Yes are predicted Yes.  

While the precision for the Yes class might seem low, it's expected since our test set is unbalanced. Since we have far more negatives than positives, even a modest false positive rate produces a number of False Positives that is large relative to the True Positives, resulting in a lower precision.  

We will see that our model with 0.8 AUC is substantially better than the dummy classifier.

### Comparison with the dummy model  
Since the test set is unbalanced, it makes sense to compare our model to the dummy model which classifies every instance at random.

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

# Reference classifier that ignores the features and picks a class uniformly
# at random. The seed makes its predictions (and the report below) repeatable.
dummy = DummyClassifier(strategy="uniform", random_state=42).fit(X_train, y_train)

y_pred_dummy = dummy.predict(X_test)

# AUC from the predicted probabilities, consistent with the other models,
# instead of from the random hard predictions.
dummy_proba = dummy.predict_proba(X_test)[:, 1]
print('AUC: ', round(roc_auc_score(y_test, dummy_proba), 3))

print(classification_report(y_test, y_pred_dummy))

plot_confusion_matrix(dummy, X_test, y_test)

When the dataset is unbalanced, looking at a single metric can be misleading. For example, the dummy model has a precision score for the 0 class of 0.75. This means that 75% of its "No" predictions are correct even though it guesses completely at random. In this case a precision score for my model of 0.7 would be bad.

However, since the 1 class is the minority, guessing at random can only get 25% of the minority class results correctly.

## Model evaluation on the test set

Testing on the test set shows an AUC score of 0.76

In [None]:
# roc curve and roc auc on an imbalanced dataset
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
 
# plot no skill and model roc curves
def plot_roc_curve(test_y, naive_probs, model_probs):
    """Draw the ROC curves of a no-skill reference and of the model on one chart.

    test_y      -- true labels, passed to roc_curve for both curves
    naive_probs -- scores of the no-skill reference (dashed line, 'No Skill')
    model_probs -- scores of the model (dotted markers, 'LightGBM Model')

    Returns nothing; the chart is shown with pyplot.show().
    """
    curve_specs = (
        (naive_probs, {'linestyle': '--', 'label': 'No Skill'}),
        (model_probs, {'marker': '.', 'label': 'LightGBM Model'}),
    )
    for curve_scores, line_style in curve_specs:
        fpr, tpr, _ = roc_curve(test_y, curve_scores)
        pyplot.plot(fpr, tpr, **line_style)

    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.show()
 
# Positive-class probabilities of the no-skill reference and of the final model.
naive_probs = dummy.predict_proba(X_test)[:, 1]
# The curve and the printed AUC now come from the same scores. Previously the
# AUC was computed from hard class predictions.
roc_auc = roc_auc_score(y_test, naive_probs)
print('No Skill ROC AUC %.2f' % roc_auc)

# Use the tuned final model. The previous version plotted the probabilities of
# `selection_model` while printing an AUC derived from `y_pred`, i.e. from a
# different model's hard predictions.
model_probs = final_model.predict_proba(select_X_test)[:, 1]
roc_auc = roc_auc_score(y_test, model_probs)
print('LightGBM ROC AUC on test data %.3f' % roc_auc)

# plot roc curves
plot_roc_curve(y_test, naive_probs, model_probs)

The blue line represents the "no skill" model, while the orange line is the LightGBM model. The area under the ROC curve is 0.77, which is very good compared to the no skill.

# Making submission

In [None]:
# Reduce the submission features to the selected columns. The result gets its
# own name: overwriting X_sub (as before) made this cell fail when run a
# second time, because the selector would then receive already reduced data.
# NOTE(review): X_sub is expected to hold the engineered features of the
# aug_test.csv rows, in file order; confirm against the cell that builds it.
X_sub_selected = selection.transform(X_sub)

# Predicted probability of the positive class for every submission row.
predict = final_model.predict_proba(X_sub_selected)[:, 1]

enrollee = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_test.csv")
# Guard against pairing ids with predictions for a different number of rows.
assert len(predict) == len(enrollee), "number of predictions does not match the test file"

submission = pd.DataFrame({'enrollee_id': enrollee['enrollee_id'], 'target': predict})

submission.to_csv('submission.csv', index=False)


# Comments and suggestions are very welcome!