In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import pylab as pl


import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier 
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV, validation_curve, cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, roc_curve, accuracy_score, confusion_matrix, log_loss, plot_roc_curve, auc, precision_recall_curve, classification_report
from sklearn.utils import shuffle
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.class_weight import compute_sample_weight
from eli5.sklearn import PermutationImportance
import eli5
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import pickle
import warnings
import numpy as np
from sklearn import linear_model, metrics, pipeline, preprocessing
from sklearn.impute import KNNImputer

warnings.filterwarnings("ignore")

In [None]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Features

* enrollee_id: Unique ID for the enrollee
* city: City code
* city_development_index: Development index of the city (scaled)
* gender: Gender of the enrollee
* relevent_experience: Relevant experience of the enrollee
* enrolled_university: Type of university course enrolled in, if any
* education_level: Education level of the enrollee
* major_discipline: Major discipline of the enrollee's education
* experience: Enrollee's total experience in years
* company_size: Number of employees in the current employer's company
* company_type: Type of current employer
* last_new_job: Difference in years between the previous job and the current job
* training_hours: Training hours completed
* target: 0 – Not looking for a job change, 1 – Looking for a job change



# Training Data

In [None]:
raw_train = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
raw_test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
# sub = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv')

In [None]:
raw_train.head()

In [None]:
raw_test.head()

Let's check whether any enrollee IDs appear in both the train and the test data.

In [None]:
# IDs shared by the two splits; an empty set means the splits do not overlap.
set(raw_train.enrollee_id).intersection(raw_test.enrollee_id)

# Target

* 0 – Not looking for job change, 
* 1 – Looking for a job change

We have a big imbalance in data.  

In [None]:
graf = sns.countplot(y="target", data=raw_train, alpha=0.8)
plt.xlabel('Number of Data', fontsize=12)
plt.ylabel('target', fontsize=12)

plt.show()

# Bar plots

I think that having relevant experience is important if you want to change jobs. Let's look at the data from this point of view.

In [None]:
# One count plot per feature, each split by relevent_experience, laid out on a 4x2 grid.
fft = ["gender", "relevent_experience", "education_level", "major_discipline", "experience", "company_size", "company_type", "target"]

plt.figure(figsize=[15,17])
for position, f in enumerate(fft, start=1):
    # Subplot slots are 1-based, hence start=1 above.
    plt.subplot(4, 2, position)
    sns.countplot(
        x=f,
        hue='relevent_experience',
        edgecolor="black",
        alpha=0.7,
        data=raw_train,
    )
    sns.despine()
    plt.title(f"Countplot of {f}  by relevent_experience")
plt.tight_layout()
plt.show()


    
plt.figure(figsize=[15,4])
sns.countplot(x='experience', hue='education_level',edgecolor="black", alpha=0.7, data=raw_train)
sns.despine()
plt.title("Countplot of experience by education_level")
plt.show()

I note that these are the largest groups:
- men
- students
- STEM specialization, maybe people have more choice
- current company type PVT LTD
- over 20 years of experience

# Education level

In [None]:
education_graf = sns.countplot(x='education_level', alpha=0.8, data=raw_train)
plt.ylabel("Number of Data", fontsize=12)
plt.xlabel("Education level", fontsize=10)
plt.show()

# City development index

<img src="https://www.researchgate.net/profile/Lubna_Hasan/publication/24115086/figure/tbl4/AS:668624478019607@1536423906970/Calculation-of-CDI-by-UN-HABITAT-GUIP-Index-Formula.png" width="600">

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

cd = raw_train['city_development_index'].value_counts().reset_index()
cd.columns = [
    'city_development_index', 
    'count'
]
cd['city_development_index'] = cd['city_development_index'].astype(str) + '-'
cd = cd.sort_values(['count']).tail(50)

fig = px.bar(
    cd, 
    x='count', 
    y='city_development_index', 
    orientation='h', 
    title='Count: City development index', 
    width=1000,
    height=900 
)

fig.show()

# Distribution of the city development index: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented replacement.
f, axes = plt.subplots(1, 1, figsize=(16, 5))
g1 = sns.histplot(
    raw_train["city_development_index"],
    color="red",
    kde=True,
    stat="density",
    ax=axes,
)
plt.title("Distributional of city_development_index")
plt.show()

In [None]:
sns.boxplot(x='target', y='city_development_index', data=raw_train)
plt.show()

The probability of a new job search increases when **city development index** is lower.

In [None]:
plt.figure(figsize=[10, 5])
sns.boxplot(x='company_size', y='city_development_index', data=raw_train)
plt.xlabel('Company size')
plt.ylabel('City development index')
plt.show()

Big companies usually are placed in cities with a high rating of **city development index**.

# Training hours

In [None]:
sns.displot(x='training_hours',
            hue='target',
            data=raw_train,
            stat="probability")
plt.show()

**Training hours** don't provide new correlations from the data.

# Feature Engineering

Young students are likely to be more inclined to look for a new job. The dataset does not contain a person's age, but we can approximate it from several other features.

In [None]:
# Number of enrollees for every (education_level, experience, last_new_job) combination.
# The previous version summed enrollee_id, which adds up arbitrary identifiers and
# carries no meaning; the group size is what describes each combination.
# Rows with a missing value in any grouping column are excluded (groupby default).
raw_data_age = (
    raw_train
    .groupby(['education_level', 'experience', 'last_new_job'])
    .size()
    .reset_index(name='n_enrollees')
)

In [None]:
raw_data_age[:5]

People with a PhD and extensive experience cannot be in the under-20 group, because PhD and Master's degrees are not held by such young specialists.

Let's take approximate boundaries for the age categories, based on education and experience. With more data, these could be made more accurate.

- young < 20 y.o.
- adult - 20-40 y.o.
- middle - 40-60 y.o.
- old - >60 y.o.

In [None]:
def get_age_category(x, y):
    """Derive a coarse age bucket from experience and education level.

    Parameters
    ----------
    x : experience label as it appears in the data ('<1', '1' ... '20', '>20').
    y : education level label (e.g. 'Graduate', 'Masters', 'Phd').

    Returns
    -------
    'Adult', 'Young', 'Middle' or 'Old'; ``None`` when no rule matches
    (e.g. 4-8 years of experience without a degree, or a missing experience).
    """
    holds_degree = y in ('Phd', 'Masters', 'Graduate')

    # Up to 8 years of experience with a degree.
    if x in ('<1', '1', '2', '3', '4', '5', '6', '7', '8') and holds_degree:
        return 'Adult'

    # At most 3 years of experience and no degree.
    if x in ('<1', '1', '2', '3') and not holds_degree:
        return 'Young'

    # From here on only experience matters.
    if x in ('9', '10', '11', '12', '13', '14', '15'):
        return 'Middle'
    if x in ('16', '17', '18', '19', '20', '>20'):
        return 'Old'

    return None

In [None]:
raw_train['age'] = raw_train.apply(lambda row: get_age_category(row['experience'], row['education_level']), axis=1)

In [None]:
raw_train.age.value_counts()

# Correlation in Data

Here I try to measure correlation in data using Correlation coefficients.

Correlation coefficients are used to measure how strong the relationship between two variables is. The formulas return a value between -1 and 1, where:
 
* 1 indicates a strong positive relationship.
* -1 indicates a strong negative relationship.
* A result of zero indicates no relationship at all.

In [None]:
# Pearson correlation of every numeric column with the target, weakest first.
# Restricting to numeric columns explicitly keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer silently skips object columns.
corr = raw_train.select_dtypes(include='number').corr()["target"]

# Drop the trivial self-correlation (target vs. target == 1) and sort by label,
# instead of indexing a label-indexed Series with positional integers.
corr.drop("target").sort_values()

Below we are plotting heatmap showing nullity correlation between various columns of dataset.

The nullity correlation ranges from -1 to 1.

* -1 - Exact Negative correlation represents that if the value of one variable is present then the value of other variables is definitely absent.
* 0 - No correlation represents that variables values present or absent do not have any effect on one another.
* 1 - Exact Positive correlation represents that if the value of one variable is present then the value of the other is definitely present.

In [None]:
import missingno
missingno.heatmap(raw_train, cmap="RdYlGn", figsize=(10,5), fontsize=12);

# Preprocessing

Here I find unique names of columns of objects and create functions that convert all values into numbers.

In [None]:
# For every object-typed column (in column order) collect its distinct non-missing
# values in order of first appearance. dropna() removes both NaN and None, so no
# identity test against np.nan and no special-casing of the last column is needed.
cols_object = [
    raw_train[column].dropna().unique().tolist()
    for column in raw_train.dtypes[raw_train.dtypes == 'object'].index
]

In [None]:
print(cols_object)

In [None]:
cols = list(raw_train.dtypes[raw_train.dtypes =='object'].index)

print(cols)

In [None]:
# Position of each categorical column inside `cols_object`. Both `cols` and
# `cols_object` are built from the object-typed columns in column order, so the
# positions are derived here rather than hard-coded.
names = {column: position for position, column in enumerate(cols)}

# Per-column lookup table: category value -> integer code (order of first appearance).
_dict = {
    key: {value: code for code, value in enumerate(cols_object[names[key]])}
    for key in cols
}

for key in cols:
    # Only encode columns that still hold raw labels. Mapping an already-encoded
    # column through a label->code table would turn every value into NaN, which is
    # what happened when this cell was run twice.
    if raw_train[key].dtype == object:
        raw_train[key] = raw_train[key].map(_dict[key])

In [None]:
raw_train.info()

# Fill missing values

In [None]:
print("Any missing sample in training set:",raw_train.isnull().values.any())

In [None]:
raw_train.isna().mean()[raw_train.isna().mean() > 0] * 100

In [None]:
missing_cols = raw_train.columns[raw_train.isna().any()].tolist()

missing_cols

I choose to use a k-nearest neighbours method to fill the missing values. To do this, I divide the dataframe into columns with and without missing values.

In [None]:
#dataframe having features with missing values
df_missing = raw_train[['enrollee_id'] + missing_cols]

#dataframe having features without missing values
df_non_missing = raw_train.drop(missing_cols, axis = 1)

In [None]:
knn = KNNImputer(n_neighbors = 3)

# enrollee_id is an arbitrary identifier: it must not take part in the neighbour
# search (it would dominate the distance) and must not be rounded or cast to float.
impute_cols = [column for column in df_missing.columns if column != 'enrollee_id']

# Imputed values are averages of neighbouring category codes; round them back to
# the nearest valid integer code.
X = np.round(knn.fit_transform(df_missing[impute_cols]))

# Re-attach the untouched ids in front so the later merge on enrollee_id still works.
df_missing = pd.concat(
    [
        df_missing[['enrollee_id']].reset_index(drop=True),
        pd.DataFrame(X, columns=impute_cols),
    ],
    axis=1,
)

In [None]:
#Let's join both dataframes
train = pd.merge(df_missing, df_non_missing, on = 'enrollee_id')

In [None]:
train

# Testing Data

In [None]:
raw_test.head()

In [None]:
raw_test['age'] = raw_test.apply(lambda row: get_age_category(row['experience'], row['education_level']), axis=1)

In [None]:
raw_test.age.value_counts()

In [None]:
for key in cols:
    raw_test[key] = raw_test[key].map(_dict[key]) 


raw_test

In [None]:
print("Any missing sample in test set:",raw_test.isnull().values.any(), "\n")

In [None]:
raw_test.isna().mean()[raw_test.isna().mean() > 0] * 100

I choose to use KNN method for defining missing values.

In [None]:
#dataframe having features with missing values
df_missing_test = raw_test[['enrollee_id'] + missing_cols]

#dataframe having features without missing values
df_non_missing_test = raw_test.drop(missing_cols, axis = 1)

In [None]:
# NOTE(review): this fits a second imputer on the test rows only; reusing the
# imputer fitted on the training data would keep both sets consistent.
knn = KNNImputer(n_neighbors = 3)

# enrollee_id is an arbitrary identifier: keep it out of the neighbour search and
# do not round or cast it.
impute_cols_test = [column for column in df_missing_test.columns if column != 'enrollee_id']

# Round imputed averages back to the nearest integer category code.
X = np.round(knn.fit_transform(df_missing_test[impute_cols_test]))

# Re-attach the untouched ids in front so the later merge on enrollee_id still works.
df_missing_test = pd.concat(
    [
        df_missing_test[['enrollee_id']].reset_index(drop=True),
        pd.DataFrame(X, columns=impute_cols_test),
    ],
    axis=1,
)

In [None]:
test = pd.merge(df_missing_test, df_non_missing_test, on = 'enrollee_id')

In [None]:
print("Any missing sample in train set:",train.isnull().values.any(), "\n")
print("Any missing sample in test set:",test.isnull().values.any(), "\n")


# Model

In [None]:
from sklearn.model_selection import train_test_split

# Target and feature matrix. The id and the two continuous columns are left out
# of the model inputs.
y = train['target']
X = train.drop(columns=['target', 'enrollee_id', 'city_development_index', 'training_hours'])

# The target is imbalanced, so stratify to keep the class ratio identical in the
# training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=9, stratify=y
)


In [None]:
from xgboost import XGBClassifier

# Baseline gradient-boosted model with default hyperparameters.
# eval_metric is set on the estimator: passing it to fit() is deprecated and
# rejected by recent XGBoost releases.
clf_XGB = XGBClassifier(eval_metric='logloss')
clf_XGB.fit(X_train, y_train)

# Probability of the positive class (column 1) on both parts of the split.
y_train_pred = clf_XGB.predict_proba(X_train)
y_train_pred_pos = y_train_pred[:,1]

y_val_pred = clf_XGB.predict_proba(X_val)
y_val_pred_pos = y_val_pred[:,1]

auc_train = roc_auc_score(y_train, y_train_pred_pos)
auc_test = roc_auc_score(y_val, y_val_pred_pos)  # measured on the validation split


print('Model params:')
print(clf_XGB.get_params())
print(f"Train AUC Score {auc_train}")
print(f"Test AUC Score {auc_test}")

# ROC points of the validation split, reused by the ROC plot further down.
fpr, tpr, _ = roc_curve(y_val, y_val_pred_pos)

In [None]:
def get_scores(report_df, model, X_val, y_val, name):
    """Append one row of validation metrics for ``model`` to ``report_df``.

    Parameters
    ----------
    report_df : DataFrame of previously collected rows (may be empty).
    model     : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_val     : validation features.
    y_val     : validation labels (binary, 0/1).
    name      : index label for the new row.

    Returns
    -------
    A new DataFrame: ``report_df`` followed by one row labelled ``name`` with the
    columns ROC-AUC, F1, precision_0, precision_1, recall_0, recall_1.
    """
    # Predict once and reuse; the model was previously queried for every metric.
    y_pred = model.predict(X_val)
    y_score = model.predict_proba(X_val)[:, 1]

    report = pd.DataFrame(
        {
            'ROC-AUC': roc_auc_score(y_val, y_score),
            'F1': f1_score(y_val, y_pred),
            'precision_0': precision_score(y_val, y_pred, pos_label=0),
            'precision_1': precision_score(y_val, y_pred, pos_label=1),
            'recall_0': recall_score(y_val, y_pred, pos_label=0),
            'recall_1': recall_score(y_val, y_pred, pos_label=1),
        },
        index=[name],
    )

    # DataFrame.append was removed in pandas 2.0; concatenate instead. An empty
    # accumulator is skipped so it cannot influence the resulting dtypes.
    if report_df is None or report_df.empty:
        return report
    return pd.concat([report_df, report])


In [None]:
df_report = pd.DataFrame()
df_report = get_scores(df_report, clf_XGB, X_val,
                       y_val, 'XGBClassifier KNN')

In [None]:
df_report

As we can see, the model is overfitting the data. We can address this in several ways: for example, by increasing the data set size in a balanced manner, or by tuning the model's hyperparameters.

Let's plot AUC Curve.

In [None]:
def plot_auc_curve(fpr, tpr, auc):
    """Draw a ROC curve with the area beneath it shaded.

    Parameters
    ----------
    fpr : false positive rates (x values).
    tpr : true positive rates (y values).
    auc : area under the curve, shown in the title.
    """
    _, ax = plt.subplots(figsize=(16, 6))
    # Blue solid line with '+' markers at every ROC point.
    ax.plot(fpr, tpr, color='b', marker='+', linestyle='-')
    ax.fill_between(fpr, tpr, alpha=0.5)
    ax.set_ylabel('True Postive Rate')
    ax.set_xlabel('False Postive Rate')
    ax.set_title(f'ROC Curve Having AUC = {auc}')

In [None]:
plot_auc_curve(fpr, tpr, auc_test)

# Learning curve

It is a tool to find out how much a machine model benefits from adding more training data and whether the estimator suffers more from a variance error or a bias error. If both the validation score and the training score converge to a value that is too low with increasing size of the training set, it will not benefit much from more training data.

In [None]:
# Function to plot learning curves

def plot_learning_cuve(model, X, Y, step=200):
    """Plot training and hold-out log-loss as the training set grows.

    The data is split 80/20 (fixed seed). ``model`` is then refitted on the first
    ``step``, ``2*step``, ... rows of the training part, and the log-loss on those
    rows and on the hold-out part is recorded after every fit.

    Parameters
    ----------
    model : classifier with ``fit(..., verbose=...)`` and ``predict_proba``
            (an XGBoost estimator in this notebook). It is refitted in place.
    X     : feature DataFrame (sliced with ``.iloc``).
    Y     : labels aligned with ``X``.
    step  : number of rows added between two consecutive fits (default 200).
    """
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 11)

    # Passing eval_metric to fit() is deprecated/removed in recent XGBoost
    # releases; set it on the estimator when the estimator exposes the parameter.
    if 'eval_metric' in model.get_params():
        model.set_params(eval_metric='logloss')

    train_loss, test_loss = [], []
    for m in range(step, len(x_train), step):
        x_subset, y_subset = x_train.iloc[:m, :], y_train[:m]

        model.fit(x_subset, y_subset, verbose=False)

        train_loss.append(log_loss(y_subset, model.predict_proba(x_subset)))
        test_loss.append(log_loss(y_test, model.predict_proba(x_test)))

    plt.figure(figsize = (15,8))
    plt.plot(train_loss, 'r-+', label = 'Training Loss')
    plt.plot(test_loss, 'b-', label = 'Test Loss')
    plt.xlabel('Number Of Batches')
    plt.ylabel('Log-Loss')
    plt.legend(loc = 'best')

    plt.show()

In [None]:
plot_learning_cuve(XGBClassifier(), X, y)

There is a high variance problem and I need to make more training data.

In [None]:
# Class balance of the target. The series is passed by keyword: positional data
# arguments are rejected or misinterpreted by seaborn >= 0.12.
sns.countplot(x=y, edgecolor = 'black')
plt.show()

Let's try to increase data in balanced manner using Synthetic Minority Oversampling Technique (SMOTE)

# Oversampling (SMOTE)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with synthetic rows until both classes are equal.
# NOTE(review): X_smote is built from the full data set, so any hold-out split
# taken from it afterwards shares synthetic neighbours with the training rows.
# NOTE(review): the features are integer category codes; SMOTE interpolates
# between them and can produce fractional codes - SMOTENC may be a better fit.
smote = SMOTE(random_state = 402)
X_smote, y_smote = smote.fit_resample(X, y)

# Class balance after resampling (keyword argument required by seaborn >= 0.12).
sns.countplot(x=y_smote, edgecolor = 'black')
plt.show()

In [None]:
# Split FIRST, then oversample only the training part. Splitting the already
# oversampled data puts synthetic neighbours of validation rows into the training
# set and inflates every validation metric.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)
X_train, y_train = SMOTE(random_state = 402).fit_resample(X_train, y_train)

# eval_metric belongs on the estimator; fit(eval_metric=...) is deprecated and
# rejected by recent XGBoost releases.
clf_XGB_smote = XGBClassifier(eval_metric='logloss')
clf_XGB_smote.fit(X_train, y_train)

# Probability of the positive class on the (oversampled) training part and on
# the untouched validation part.
y_train_pred = clf_XGB_smote.predict_proba(X_train)
y_train_pred_pos = y_train_pred[:,1]

y_val_pred = clf_XGB_smote.predict_proba(X_val)
y_val_pred_pos = y_val_pred[:,1]

auc_train = roc_auc_score(y_train, y_train_pred_pos)
auc_test = roc_auc_score(y_val, y_val_pred_pos)  # measured on the validation split

fpr, tpr, _ = roc_curve(y_val, y_val_pred_pos)


print('Model params:')
print(clf_XGB_smote.get_params())
print(f"Train AUC Score {auc_train}")
print(f"Test AUC Score {auc_test}")

In [None]:
df_report = get_scores(df_report, clf_XGB_smote, X_val,
                       y_val, 'XGBClassifier KNN(SMOTE)')


In [None]:
df_report

In [None]:
plot_learning_cuve(XGBClassifier(), X_smote, y_smote)

# CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier

# Hold out validation rows before oversampling so that no synthetic neighbour of
# a validation row ends up in the training data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)
X_train, y_train = SMOTE(random_state = 402).fit_resample(X_train, y_train)

CB_CLASS = CatBoostClassifier(iterations=100,
                              learning_rate=0.1,
                              depth=8,
                              loss_function='Logloss',
                              custom_loss=['AUC', 'Accuracy'],
                              )

# Fit on the training part only. Fitting on the complete resampled set, as
# before, trained on the very rows used for validation, so the reported
# validation AUC was an in-sample score.
# NOTE(review): eval_set is also used by CatBoost to pick the best iteration,
# which makes the validation score slightly optimistic.
CB_CLASS.fit(X_train, y_train,
             eval_set=(X_val, y_val),
             verbose=False)

y_train_pred = CB_CLASS.predict_proba(X_train)
y_train_pred_pos = y_train_pred[:,1]

y_val_pred = CB_CLASS.predict_proba(X_val)
y_val_pred_pos = y_val_pred[:,1]

auc_train = roc_auc_score(y_train, y_train_pred_pos)
auc_test = roc_auc_score(y_val, y_val_pred_pos)  # measured on the validation split

print('Model is fitted:' + str(CB_CLASS.is_fitted()))
print('Model params:')
print(CB_CLASS.get_params())
print(f"Train AUC Score {auc_train}")
print(f"Test AUC Score {auc_test}")

In [None]:
df_report = get_scores(df_report, CB_CLASS, X_val,
                       y_val, 'CB_CLASS KNN(SMOTE)')

df_report

# Cross validation

In [None]:
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Oversampling must happen inside every fold: the imblearn pipeline applies
# SMOTE to the training folds only and leaves each validation fold untouched.
# Cross-validating on data that was oversampled beforehand scores the model on
# synthetic rows interpolated from its own training data.
cv_model = make_pipeline(
    SMOTE(random_state = 402),
    XGBClassifier(eval_metric='logloss'),
)
accuracies = cross_val_score(estimator = cv_model, X = X, y = y, verbose=False, cv = 5)
print("Accuracy:{:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation:{:.2f} %".format(accuracies.std()*100))

In [None]:
df_report

# Hyperparameters of model

In [None]:
# Exhaustive 5-fold grid search over tree depth, learning rate and number of
# boosting iterations for a CatBoost classifier (3 x 3 x 3 = 27 candidates).
CB_CLASS = CatBoostClassifier()

parameters = {'depth'         : [5, 7, 9],
              'learning_rate' : [0.01, 0.05, 0.1],
               'iterations'    : [30, 50, 100]
                 }
# NOTE(review): the search runs on X_smote/y_smote, i.e. data oversampled before
# the folds are made, so the fold scores are presumably optimistic - confirm.
grid = GridSearchCV(estimator=CB_CLASS, param_grid = parameters, cv = 5, n_jobs=-1)
# verbose=False is forwarded to CatBoostClassifier.fit for every candidate.
grid.fit(X_smote, y_smote, verbose=False)
# Despite its name this holds the refitted best estimator, not a parameter dict
# (the parameters themselves are in grid.best_params_).
best_param = grid.best_estimator_

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

In [None]:
# Rebuild the classifier from the grid-search winner instead of hand-copied
# values, so this cell always reflects the search that was actually run.
CB_CLASS = CatBoostClassifier(loss_function='Logloss', **grid.best_params_)

# Fit on the training part only; fitting on the complete resampled set would
# include the validation rows and turn the "test" AUC into an in-sample score.
CB_CLASS.fit(X_train, y_train,
             eval_set=(X_val, y_val),
             verbose=False)

y_train_pred = CB_CLASS.predict_proba(X_train)
y_train_pred_pos = y_train_pred[:,1]

y_val_pred = CB_CLASS.predict_proba(X_val)
y_val_pred_pos = y_val_pred[:,1]

auc_train = roc_auc_score(y_train, y_train_pred_pos)
auc_test = roc_auc_score(y_val, y_val_pred_pos)  # measured on the validation split

print('Model is fitted:' + str(CB_CLASS.is_fitted()))
print('Model params:')
print(CB_CLASS.get_params())
print(f"Train AUC Score {auc_train}")
print(f"Test AUC Score {auc_test}")

In [None]:
df_report = get_scores(df_report, CB_CLASS, X_val,
                       y_val, 'CB_CLASS GRID')

df_report

# Prediction

In [None]:
features =["city", "city_development_index", "gender", "relevent_experience", "enrolled_university", "education_level", "major_discipline", "experience", "company_size", "company_type", "last_new_job", "training_hours", "age"]
target = 'target'

In [None]:
#Make predictions using XGB
y_predict_XGB = clf_XGB_smote.predict(X_val)

y_predict_XGB

In [None]:
#Make predictions using CB
y_predict_CB = CB_CLASS.predict(X_val)

y_predict_CB

# Measure AUC


The AUC is an estimate of the probability that a classifier will rank a randomly chosen positive instance higher than a randomly chosen negative instance. Reference : https://www.kdnuggets.com/2010/09/pub-is-auc-the-best-measure.html#:~:text=www.riceanalytics.com-,The%20area%20under%20the%20curve%20(AUC)%20that%20relates%20the%20hit,a%20randomly%20chosen%20negative%20instance.

In [None]:
from sklearn import metrics

# AUC measures ranking quality, so it must be computed from predicted
# probabilities. Hard 0/1 labels give a ROC "curve" with a single interior point
# and understate the model.
fpr, tpr, thresholds = metrics.roc_curve(y_val, clf_XGB_smote.predict_proba(X_val)[:, 1])
metrics.auc(fpr, tpr)

In [None]:
from sklearn import metrics

# Use the positive-class probabilities, not the hard class labels, so the ROC
# curve covers every decision threshold.
fpr, tpr, thresholds = metrics.roc_curve(y_val, CB_CLASS.predict_proba(X_val)[:, 1])
metrics.auc(fpr, tpr)

to be continued :)