# Imports and reading in data

In [None]:
#For handling data
import pandas as pd
import numpy as np
import regex as re

#For vizualization of data
import seaborn as sns
import matplotlib.pyplot as plt

#Our ML algos
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb

#SMOTE for imbalanced data
from imblearn.over_sampling import SMOTE

#Imputing
from sklearn.impute import SimpleImputer

#Encoders
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import MEstimateEncoder

#Splitting data into training and test
from sklearn.model_selection import train_test_split

#Eval functions and model analysis
import eli5
import shap
from eli5.sklearn import PermutationImportance
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#Hyperparameter tuning
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

#Get rid of futurewarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



In [None]:
#Locations of the training and test CSV files
fp_train = "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
fp_test = "../input/hr-analytics-job-change-of-data-scientists/aug_test.csv"

#Load both splits
train, test = pd.read_csv(fp_train), pd.read_csv(fp_test)

#Print the summary statistics of each split, training data first
for frame in (train, test):
    print(frame.describe())

# Exploratory data analysis (EDA)

Start by taking a general look at the columns

In [None]:
#Keep the column index of each split around for inspection
train_cols, test_cols = train.columns, test.columns

#Show both so differences between the splits are easy to spot
print("Training columns are:\n {}".format(train_cols))
print("Testing columns are: \n {}".format(test_cols))

Let's take a look at the data to examine data types and also see what they've been classified as by pandas

In [None]:
train

In [None]:
train.dtypes

Using this info we'll manually make a list of the categorical columns and numerical columns

In [None]:
#Columns we treat as categorical
cat_cols = [
    "city",
    "gender",
    "relevent_experience",
    "enrolled_university",
    "education_level",
    "major_discipline",
    "experience",
    "company_size",
    "company_type",
    "last_new_job",
]
#Columns we treat as numerical
num_cols = [
    "enrollee_id",
    "city_development_index",
    "training_hours",
]

Let's also take a look at our missing values and assess whether we should drop them or impute them.

In [None]:
#Report, for each split, how many cells and how many rows contain missing data
for label, frame in (("Training data:", train), ("\nTesting data:", test)):
    print(label)

    #Cell-level counts
    nulls = frame.isnull().sum().sum()
    nn = frame.notnull().sum().sum()
    #Rows that contain at least one missing value
    null_rows = frame.shape[0] - frame.dropna().shape[0]
    #The share of missing cells is relative to ALL cells, not just the non-null ones
    total = nulls+nn

    print("Total null values: {null}" "\n" "Percent of all values: {prc:.2%}".format(null=nulls,prc=nulls/total))
    print("Total null rows: {nr}" "\n" "Percent of all rows: {rprc:.2%}".format(nr=null_rows,rprc=null_rows / frame.shape[0]))

We can see that both of the datasets have a massive amount of rows with missing data. \
This means that just dropping missing data would harm the model quite a lot.

What we'll do instead is look at each column and how much nan values affect it and how we can impute them.

In [None]:
# As a reminder take a look at the columns
train.columns

`enrollee_id` should have no nan values so we can skip that column. \
Next we'll see how many nulls each column has in both datasets

In [None]:
#The label only exists in the training data, so it is left out of the comparison
feature_columns = [name for name in train.columns if name != "target"]

#Print the number of missing entries of every feature for both splits
for column in feature_columns:
    n_train = train[column].isnull().sum()
    n_test = test[column].isnull().sum()
    print("train.{col} nulls: {training}" "\ntest.{col} nulls {test}\n".format(
        training=n_train,
        test=n_test,
        col=column))

The worst offenders on the list are: \
`gender` \
`major_discipline` \
`company_size` \
`company_type` 

`gender` has only a few distinct values and its missing values are a minority, so we can simply fill the missing values with a placeholder value. \

Next, `major_discipline`. Looking at the data, we can see that most of the values are "STEM" and that the next most common value is missing. Because of this we'll just drop the column. \

`company_size` and `company_type` are both linked to each other quite strongly. Indeed we see that they have almost the exact same amount of missing values. What we will do, is first encode `company_size`. Then we will impute the missing values. For `company_type` we'll simply fill "missing"

In [None]:
# First fill gender and company type missing
# The filled column is assigned back to the frame: calling fillna(inplace=True)
# on a selected column is a chained assignment, which pandas warns about and
# which does not update the parent frame once Copy-on-Write is enabled.
for frame in (train, test):
    frame["gender"] = frame["gender"].fillna("Missing")
    frame["company_type"] = frame["company_type"].fillna("Missing")

#Drop major_discipline
train.drop("major_discipline",axis=1,inplace=True)
test.drop("major_discipline",axis=1,inplace=True)

# Labeling and OneHotEncoding categorical data

Some of the categorical columns are already very close to being numerical (such as `last_new_job`) \
We'll do some regex to turn these into numerical values. \
For the rest, we'll use OH-encoding. 

## LabelEncoding

The only column we'll LabelEncode is `company_size`. We can label encode this column because the company size does follow a relationship from a smaller size to a bigger one. LabelEncoding this lets the model learn that relationship better.

In [None]:
#Work on the rows that actually have a company size, so nulls are never encoded
train_dropped, test_dropped = (
    frame.dropna(subset=["company_size"]) for frame in (train, test)
)

For the model to learn from the label-encoded values, we need to make sure that a higher encoded value corresponds to a larger company size. That means that we need to set up the classes in the correct order for the encoder.

We could type them out manually, but in a real-world application that would mean any new value for `company_size` would break the code.

We're going to automate the process of creating the list, so that it will always be up to date and in order.

In [None]:
#Start by creating a dict to store the values and their numerical version in
#Keys are sortable integers, values are the original company_size labels
values = {}

#Loop through the unique values
for value in train_dropped.company_size.unique():
    #The first run of digits in the label is the number we sort by
    #(for a range this is the start of the range)
    n = int(re.search(r"\d+", value).group())

    if "<" in value:
        #"Less than N" has to sort just below a label that starts at N
        n -= 1
    elif ">" in value:
        #"Greater than N" has to sort just above a label that starts at N
        n += 1

    #Set the value into the dict, with the numerical version as key
    values[n] = value


values

In [None]:
#Build the list of classes we'll feed the encoder:
#the labels ordered by their numerical key, smallest first
classes = [values[key] for key in sorted(values)]

#The lookup dict is used up by this step, so it ends up empty
values.clear()

classes

Now we have a list of the classes ordered from the smallest company size to the largest.

In [None]:
#The encode number of each class is its position in the ordered class list
size_codes = {cl: code for code, cl in enumerate(classes)}

#Setup the encoded columns
#map() builds the encoded column in one pass instead of repeatedly replacing
#values in place on a selected column; a label that is not in `classes`
#becomes NaN rather than staying behind as a string among the numbers
train_dropped = train_dropped.assign(
    company_size_encoded=train_dropped.company_size.map(size_codes))
test_dropped = test_dropped.assign(
    company_size_encoded=test_dropped.company_size.map(size_codes))

In [None]:
#The raw labels are no longer needed on the helper frames once they are encoded
train_dropped = train_dropped.drop(columns="company_size")
test_dropped = test_dropped.drop(columns="company_size")

In [None]:
# Joining on the index brings the encoded values back into the full frames:
# rows that had a company size get their code, rows that had none get null
train_codes = train_dropped["company_size_encoded"]
test_codes = test_dropped["company_size_encoded"]

train = train.join(train_codes)
test = test.join(test_codes)

In [None]:
# Fill the missing encoded company sizes with the median

# The imputer expects a 2D input, hence the single-column reshape
train_sizes = train["company_size_encoded"].to_numpy().reshape(-1, 1)
test_sizes = test["company_size_encoded"].to_numpy().reshape(-1, 1)

# The median is learned on the training set only and reused for the test set
si = SimpleImputer(strategy="median")
train["company_size_encoded"] = si.fit_transform(train_sizes)
test["company_size_encoded"] = si.transform(test_sizes)

# The original company_size column is not needed any more
train = train.drop(columns="company_size")
test = test.drop(columns="company_size")

## OneHotEncoding manually

We're going to manually OneHotEncode `experience`, `relevent_experience` and `last_new_job`. \
There's no point in encoding every single value, since these columns are a combination of categorical and numerical data. \
They have only a few values that need encoding; we'll turn the rest into numerical values.

### Experience

Start by looking at the unique values within the column.

In [None]:
train.experience.unique()

We then setup a function to manually OH-encode experience. \
We do this so we can easily apply this to the train and test data

In [None]:
def experience_oh(df):
    """
    Split the open-ended experience labels into their own indicator columns.

    Adds the boolean columns "experience_>20" and "experience_<1" to df,
    replaces those two labels in "experience" with nan and returns the
    updated "experience" column. df itself is modified.
    """
    edge_labels = (">20", "<1")

    #One boolean flag column per open-ended label
    for label in edge_labels:
        df["experience_" + label] = df["experience"] == label

    #Blank out the flagged labels, pass everything else through unchanged
    def _blank_edge(entry):
        return np.nan if entry in edge_labels else entry

    df["experience"] = df["experience"].apply(_blank_edge)

    return df["experience"]

In [None]:
#Run the manual encoding on both dataframes
for frame in (train, test):
    frame["experience"] = experience_oh(frame)

#The ">20" and "<1" labels should no longer show up among the unique values
for frame in (train, test):
    print(frame.experience.unique())

### last_new_job

Similar process to experience. \
Start by checking unique values

In [None]:
train.last_new_job.unique()

In [None]:
def last_job_oh(df):
    """
    Split the non-numeric last_new_job labels into their own indicator columns.

    Adds the boolean columns "last_new_job_>4" and "last_new_job_never" to df,
    replaces those two labels in "last_new_job" with nan and returns the
    updated "last_new_job" column. df itself is modified.
    """
    special_labels = (">4", "never")

    #One boolean flag column per special label
    for label in special_labels:
        df["last_new_job_" + label] = df["last_new_job"] == label

    #Blank out the flagged labels, pass everything else through unchanged
    def _blank_special(entry):
        return np.nan if entry in special_labels else entry

    df["last_new_job"] = df["last_new_job"].apply(_blank_special)

    return df["last_new_job"]

In [None]:
#Run the manual encoding on both dataframes
for frame in (train, test):
    frame["last_new_job"] = last_job_oh(frame)

#The ">4" and "never" labels should no longer show up among the unique values
for frame in (train, test):
    print(frame.last_new_job.unique())

### Dropping nan values
We drop nan values instead of imputing them because we created nan values in the manual encoding. Imputing these would yield bad results.

In [None]:
#Drop nan from both dataframes
#This removes every row that has a nan in ANY column, including the nan values
#that the manual encoding above put in place of ">20", "<1", ">4" and "never"
train.dropna(inplace=True)
#Rows are removed from the test data as well, so fewer test rows remain to predict on
test.dropna(inplace=True)
#Preview the remaining training data
train.head()

### relevent_experience

Similar to earlier steps, but a little simpler. This column only has two values, which we can describe with just 0 or 1. \
As always start with checking unique values

In [None]:
train.relevent_experience.unique()

We give a value of 0 if they have no experience and a value of 1 if they do

In [None]:
#Turn the column into a flag: 1 for "Has relevent experience", 0 for anything else
def _experience_flag(entry):
    if str(entry) == 'Has relevent experience':
        return 1
    return 0

#Apply to both dataframes
for frame in (train, test):
    frame["relevent_experience"] = frame.relevent_experience.apply(_experience_flag)

In [None]:
#Check to make sure we only have 0 and 1
train.relevent_experience.unique()

### Target Encoding

We'll take a look at the amount and rarity of the remaining columns.

In [None]:
train.select_dtypes(["object"]).nunique()

We can see that city has a large amount of different values. Let's take a look at the distribution between them.

In [None]:
train["city"].value_counts()

We have quite a few rarely occurring values. Because of this, this feature is a good fit for target encoding.

In [None]:
#Work on a copy and separate the target from the features
train_c = train.copy()
target_train = train_c.pop("target")

#Hold out 15% of the rows to fit the target encoder on
#The sample is seeded so the split (and everything trained on it) is reproducible,
#in line with the fixed seeds used in the rest of the notebook
enc_train = train_c.sample(frac=0.15, random_state=1)
enc_target = target_train[enc_train.index]

#The remaining rows are the ones that get encoded and used for training
train_pre = train_c.drop(enc_train.index)
target_pre = target_train[train_pre.index]

In [None]:
#Fit the m-estimate encoder for city on the held-out encoding rows only
enc = MEstimateEncoder(cols=["city"],m=6).fit(enc_train,enc_target)

#Encode the remaining rows and attach their targets again
#(the assignment aligns on the index, so only the matching targets are used)
train_aff = enc.transform(train_pre)
train_aff["target"] = target_train

#From here on the training data is the encoded frame
train = train_aff

### Updating lists

We need to update the categorical and numerical columns lists.

In [None]:
#List the columns we've encoded in the last steps
encoded = ["last_new_job","experience","relevent_experience","company_size","city"]
#Also list the new ones we've created
new = ["last_new_job_>4","last_new_job_never","experience_>20","experience_<1","company_size_encoded"]
#Columns that were dropped from the dataframes and belong in neither list
removed = ["company_size","major_discipline"]

#Everything that was encoded or dropped is no longer categorical
#The lists are updated in place, and running this cell again leaves them
#unchanged instead of raising on an already removed entry
no_longer_categorical = set(encoded) | set(removed)
cat_cols[:] = [col for col in cat_cols if col not in no_longer_categorical]

#The new columns and the encoded columns that still exist are numerical now
for col in new + encoded:
    if col not in removed and col not in num_cols:
        num_cols.append(col)

In [None]:
#Show the updated lists (the stray "]" at the end of the message was a typo)
print("cat_cols: {}\nnum_cols: {}".format(cat_cols,num_cols))

## Algo OH-Encoding

Now we'll OneHot-encode the rest of the categorical columns. 

We start by creating the OH-Encoder and creating copies of our dataframes.

In [None]:
#The encoder is left at its default sparse output and densified with toarray()
#The constructor flag for dense output was renamed between scikit-learn
#releases (sparse -> sparse_output), so this form works with either
oh = OneHotEncoder()
#Create copys to work on
train_c = train.copy()
test_c = test.copy()
#Fit and transform on training set
train_c_cats = pd.DataFrame(oh.fit_transform(train_c[cat_cols]).toarray())
#Transform test set
test_c_cats = pd.DataFrame(oh.transform(test_c[cat_cols]).toarray())

#Put column names back
#get_feature_names was replaced by get_feature_names_out in newer releases
if hasattr(oh, "get_feature_names_out"):
    oh_names = oh.get_feature_names_out(cat_cols)
else:
    oh_names = oh.get_feature_names(cat_cols)
train_c_cats.columns = oh_names
test_c_cats.columns = oh_names

#Put the index back
train_c_cats.index = train_c.index
test_c_cats.index = test_c.index

#Drop the old categorical values
train_c.drop(cat_cols,axis=1,inplace=True)
test_c.drop(cat_cols,axis=1,inplace=True)

#Create the new dataframes
train_oh = pd.concat([train_c,train_c_cats],axis=1)
test_oh = pd.concat([test_c,test_c_cats],axis=1)

### Checking data types

We can check to make sure we only have numerical data by looking at data types

In [None]:
train_oh.dtypes

We see here that the columns we manually encoded still have `object` datatype. We can fix this by converting all columns to floats.

In [None]:
# Map every numerical column to float
dtypes = {col: float for col in num_cols}

# Convert the training data with the full mapping
train_oh = train_oh.astype(dtypes)

# The test data's city column is only target encoded at the end of the file,
# once the targets are available, so it is left out of the conversion here
dtypes.pop("city")
test_oh = test_oh.astype(dtypes)

In [None]:
train_oh.dtypes

In [None]:
test_oh.dtypes

# Examining the target and fixing imbalance

We separate the target and check how many true values we have compared to false

In [None]:
#Take the target out of the training data in one step:
#pop returns the column and removes it from the frame
y = train_oh.pop("target")

In [None]:
#Now check for imbalance
sns.countplot(x=y)
#Count the rows of each class
f = int((y == 0).sum())
t = int((y == 1).sum())
#The share of true values is relative to ALL rows, not to the false rows
print("False: {f}" "\n" "True: {t}" "\n" "Percent of true: {p:.2%}".format(f = f, t=t,p= t/(f+t)))

The dataset is imbalanced enough that using SMOTE will help us train the algo

In [None]:
#Oversample the training data with SMOTE, using a fixed seed
#NOTE(review): the resampling is done before the train/test split further down,
#so the hold-out part is drawn from the resampled data as well - confirm
#whether SMOTE should only be fitted on the training part of the split
sm = SMOTE(random_state=1)
sm_train, sm_target = sm.fit_resample(train_oh,y)

# Model creation, prediction and assessment

## Model creation

We start by splitting the data, so that we can measure the models performance.

In [None]:
#Hold out 30% of the resampled data for measuring performance, with a fixed seed
x_train,x_test,y_train,y_test = train_test_split(sm_train, sm_target,random_state=1,test_size=0.3)

### Hyperparameter tuning

We'll use hyperparameter tuning to create and tune our model

In [None]:
#Setup the tuner
def bayes_tuner(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6,n_estimators=10000, output_process=False):
    """
    Search LightGBM hyperparameters with Bayesian optimization.

    Every candidate parameter set is scored by the best mean AUC reached in a
    stratified cross validation on (X, y).

    Parameters:
        X, y: features and binary labels the cross validation runs on
        init_round: number of random points evaluated before optimizing
        opt_round: number of optimization steps
        n_folds: number of cross validation folds
        random_seed: seed passed to the cross validation
        n_estimators, output_process: accepted for compatibility, currently unused

    Returns:
        A tuple (best_auc, best_params) of the best trial. The params are the
        raw optimizer values, so integer parameters still have to be rounded.
    """
    #Prep our data
    #Use the data passed in by the caller, not whatever the globals happen to hold
    lgb_train = lgb.Dataset(data=X, label=y, free_raw_data=False)

    #Setup the parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        #The optimizer proposes floats: clamp the fractions, round the integer params
        params = {'application':'binary', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        #max_bin has its own search dimension (it used to be set from max_depth)
        params['max_bin'] = int(round(max_bin))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)

        #Get the results with cv and return them
        cv_result = lgb.cv(params, lgb_train, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])

    #Setup the optimizer
    optimizer = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 200),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)}, random_state=0)

    #Maximize score
    optimizer.maximize(init_points=init_round, n_iter=opt_round)

    #Fetch the best trial (the first one if several share the top score)
    best = max(optimizer.res, key=lambda trial: trial['target'])

    #Return its score and params
    return best['target'], best['params']

#Finally run the optimizer; it returns (best score, best params)
#and only the params are needed for our model
opt_params = bayes_tuner(
    x_train,
    y_train,
    init_round=5,
    opt_round=10,
    n_folds=4,
    random_seed=0,
    n_estimators=10000,
)[1]

### Model setup and training

Now we create the model. \
We'll be using LGBMClassifier

In [None]:
#The optimizer returns floats for the integer params
#Round them to the nearest integer, exactly like the tuner did when it scored
#them (int(round(x, 2)) would truncate instead, e.g. 24.6 -> 24 instead of 25)
for name in ('num_leaves', 'max_depth', 'min_data_in_leaf', 'max_bin'):
    opt_params[name] = int(round(opt_params[name]))

#Fixed settings for the binary classification task
opt_params['metric'] = 'auc'
opt_params['objective'] = "binary"
opt_params

In [None]:
#Wrap both parts of the split into LightGBM datasets
training_data = lgb.Dataset(x_train,y_train)
testing_data = lgb.Dataset(x_test,y_test)

#Upper limit for the boosting rounds; early stopping ends the training sooner
num_rounds = 15000

#Train with the tuned params, evaluating on both datasets every 500 rounds
clf = lgb.train(
    params=opt_params,
    train_set=training_data,
    num_boost_round=num_rounds,
    valid_sets=[training_data,testing_data],
    verbose_eval=500,
    early_stopping_rounds=250,
)

In [None]:
#Predict with the best iteration found during training
best_iter = clf.best_iteration

#Predictions for the testing data
y_pred = clf.predict(x_test,num_iteration=best_iter)
#Predictions for the training data as well, for comparison
y_train_pred = clf.predict(x_train,num_iteration=best_iter)

## Assessment

We'll assess the model's performance using multiple different metrics, starting with roc_auc_score
### roc_auc_score

In [None]:
#Fetch the ROC AUC score for test and training predictions
acc_test = roc_auc_score(y_test, y_pred)
acc_train = roc_auc_score(y_train,y_train_pred)

#The metric is ROC AUC, so it is reported under that name rather than as accuracy
print("Test ROC AUC: {test}" "\n" "Train ROC AUC: {train}".format(test=acc_test,train=acc_train))

### Confusion matrix

We'll also use a confusion matrix to see how many false positives and false negatives we got

In [None]:
#Round the predicted probabilities to hard 0 / 1 labels
y_pred_rounded = y_pred.round()

#Build the confusion matrix, normalized over all predictions
#so every cell shows a share of the whole test set
cm = confusion_matrix(y_test,y_pred_rounded,normalize="all")

#Wrap the matrix in a display object and plot it
cm_disp = ConfusionMatrixDisplay(confusion_matrix = cm)
cm_disp.plot()

# Final submission

Now we'll create the final submission file for the task.\
We then store the predictions in a dataframe with the IDs

In [None]:
# We read in the final targets
final_targets = pd.DataFrame(np.load("../input/job-change-dataset-answer/jobchange_test_target_values.npy"),columns=["target"])

# We need to take out targets for rows that were dropped
# Every index label that is missing from the test data is dropped in a single
# call instead of removing the rows one at a time
final_targets.drop(index=final_targets.index.difference(test_oh.index),inplace=True)

final_targets

In [None]:
# Make a copy of the dataframe to work on
test_c = test_oh.copy()
# We also make a copy of the targets
targets = final_targets.copy()

# Take the targets out to index by them
target_test = targets.pop("target")
# Take out a sample of the dataframe for encoding
# The sample is seeded so the split (and the score computed on the rest)
# is reproducible, in line with the fixed seeds used in the rest of the notebook
enc_test = test_c.sample(frac=0.3, random_state=1)

# Take the targets that match the encoding sample's index
enc_target = target_test[enc_test.index]

# Take out the data used for encoding
test_pre = test_c.drop(enc_test.index)
# Take out the targets used for encoding
target_pre = target_test[test_pre.index]

In [None]:
# Setup the encoder
# This is a new encoder, separate from the one fitted on the training data
enc = MEstimateEncoder(cols=["city"],m=6)

# Fit the encoder with the data we split for encoding
# NOTE(review): enc_target comes from the test answers, so the city encoding of
# the test rows is learned from test labels and differs from the encoding the
# model was trained on - confirm this is intended, it looks like label leakage
enc.fit(enc_test,enc_target)

# Use the encoder to transform the pretest data
test_aff = enc.transform(test_pre)
# Set the the test data to the transformed one
# (the rows that were used to fit the encoder are no longer part of test)
test = test_aff

In [None]:
# Predict on the encoded test data
preds = clf.predict(test)

# Pair every prediction with the id of the row it belongs to
submission_columns = {
    'enrollee_id': test.enrollee_id,
    'target': preds,
}
submission_preds = pd.DataFrame(submission_columns)
submission_preds

In [None]:
# Save the predictions for submission
submission_preds.to_csv('submission_preds.csv',index=False)

In [None]:
# We need to take out targets for rows that were used in encoding
# Every index label that is missing from the remaining test data is dropped in
# a single call instead of removing the rows one at a time
final_targets.drop(index=final_targets.index.difference(test.index),inplace=True)

final_targets

In [None]:
#Fetch the ROC AUC score of the final predictions
acc = roc_auc_score(final_targets, preds)

#The metric is ROC AUC, so it is reported under that name rather than as accuracy
print("Test ROC AUC: {}".format(acc))

# Examining our model

Now that we've set the model up we can take a look at some different metrics to see what parts of the data actually drive the model. \
In a real life scenario this would be the most valuable part.

## SHAP

We'll start by looking at shap values

In [None]:
# Set up a variable to easily look at different samples of data
# x is the position of the test row to explain
x = 2
# Setup an explainer
exp = shap.TreeExplainer(clf)

# Fetch a part of the data to look at
# The slice keeps a one-row dataframe rather than a single series
plot_data = test[x:x+1]

# Get the shap values for the selected data
shaps = exp.shap_values(plot_data)
# Init JavaScript so we can disp. the plot
shap.initjs()

# Create a plot to see how each feature affects the prediction
# NOTE(review): indexing with [1] assumes shap returns one entry per class and
# picks the positive class - this depends on the shap version, confirm
shap.force_plot(exp.expected_value[1], shaps[1], plot_data)

## Permutation importance

We can also check which features the model relies on the most by plotting the feature importance that LightGBM reports.

In [None]:
# Plot the feature importance scores that LightGBM itself reports for the model
# Note: this is the booster's own importance, not a permutation importance
# (eli5's PermutationImportance is imported at the top but not used here)
lgb.plot_importance(clf)

## Decision tree
We can also plot our model into a decision tree.

In [None]:
# Setup a tree graph for the model
# NOTE(review): no tree index is passed, so presumably only one of the
# booster's trees is drawn (the library default) - confirm
graph = lgb.create_tree_digraph(clf)
# Set the graph size to slightly bigger
graph.graph_attr.update(size="110,110")
# Display the graph
graph

From all of these, we can figure that the experience of the applicant is quite important. Based on shap values, the more experience an applicant has, the lower the chance that they're looking for a job change. It also seems that the training hours spent on an applicant heavily increase the chance that they're looking for a job change. 