In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.inspection import permutation_importance

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
train_df = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')

## Quick look at the data

In [None]:
# Preview the first rows to see the column layout.
train_df.head()

In [None]:
# Column dtypes and non-null counts.
train_df.info()

In [None]:
# Summary statistics for every column, numeric or not (include='all').
train_df.describe(include='all')

### Questions for data

1. Does a higher city development index mean that employees have more options and are therefore more likely to look for other jobs?
2. We have both a relevant-experience feature and an experience feature. Are these features related to each other?
3. Does a particular education level have more employees willing to look for other jobs?
4. How is company size related to whether an employee is looking for other jobs?

## EDA

 Looking for null values

In [None]:
# Share of missing values per column, expressed as a percentage.
train_df.isna().mean().mul(100)

It looks like the enrollee_id column is just a unique id for each employee and therefore contributes nothing to predicting the target feature. Let's drop it.

In [None]:
# Drop the identifier column. errors='ignore' keeps this cell safe to re-run:
# without it, a second execution raises KeyError because the column is gone.
train_df.drop(columns='enrollee_id', inplace=True, errors='ignore')

In [None]:
# Grid of pairwise relationships between the columns of train_df.
sns.pairplot(train_df)
plt.show()

In [None]:
def bar_plot(data, col, title=None, display_pct=False, hue=None):
    """Draw a count plot of ``data[col]`` with bars ordered by frequency.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to plot.
    col : str
        Column whose categories are counted.
    title : str, optional
        Plot title; defaults to ``'Distribution of ' + col``.
    display_pct : bool, default False
        When truthy, annotate each category with its count and its share of
        the non-null values in ``col``.
    hue : str, optional
        Column used to split each bar, forwarded to ``sns.countplot``.
    """
    # Computed once: it drives both the bar order and the annotations.
    counts = data[col].value_counts()
    ax = sns.countplot(data=data, x=col, order=counts.index, hue=hue)
    ax.set_title('Distribution of ' + col if title is None else title)
    ax.set_xlabel(col)
    if display_pct:
        total = counts.sum()
        for position, count in enumerate(counts):
            label = "{} ({:.1%})".format(count, count / total)
            # Anchor the text just above the category's total count instead of
            # using a fixed data-unit offset, so it scales with the y-axis.
            ax.text(position, count, label, ha='center', va='bottom', size=14)
    plt.show()

Let's check out the relation of city development index with the target feature

In [None]:
# Horizontal box plots of city_development_index, one box per target value.
fig, ax = plt.subplots(figsize=(10, 3))
sns.boxplot(data=train_df, x='city_development_index', y='target', orient='h', ax=ax)
ax.set_title('Relation of city development index with target feature')
plt.show()

Well, the observations are quite contrary to what we assumed. It turns out that the majority of people looking for a job are in cities with a low city development index. If you think about it, this makes sense: people in less developed cities are likely to look for jobs so they can move to more developed cities.

Looking at some other columns

In [None]:
# Gender counts, with each bar split by target value.
bar_plot(train_df, 'gender', hue='target')

In [None]:
# Relevant-experience counts, with each bar split by target value.
bar_plot(train_df, 'relevent_experience', hue='target')

Does experience beyond a certain number of years count as relevant, or do the experience and relevant-experience features carry different information?

In [None]:
# Experience distribution among rows labelled 'No relevent experience'.
no_relevant_exp = train_df.loc[train_df['relevent_experience'].eq('No relevent experience')]
bar_plot(no_relevant_exp, 'experience', title='No relevent experience')

In [None]:
# Experience distribution among every row NOT labelled 'No relevent experience'.
relevant_exp = train_df.loc[train_df['relevent_experience'].ne('No relevent experience')]
bar_plot(relevant_exp, 'experience', title='Relevent experience')

False alarm! The two features look unrelated: employees with more than 20 years of experience appear both among those with relevant experience and among those without.

In [None]:
# Split the rows by target value (1 -> looking_for_job, 0 -> not_looking).
looking_for_job = train_df.loc[train_df['target'].eq(1)]
not_looking = train_df.loc[train_df['target'].eq(0)]

In [None]:
def pieplot(data, col, title):
    """Show a 6x6-inch pie chart of the value shares of ``data[col]``.

    Slices are labelled with the category names and annotated with their
    percentage to one decimal place.
    """
    shares = data[col].value_counts()
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.pie(x=shares, autopct="%.1f%%", labels=shares.index)
    ax.set_title(title)
    plt.show()
    

Does being enrolled in a full-time university course mean the employee is more likely to look for other jobs?


In [None]:
# University-enrolment shares among the target == 1 rows.
pieplot(looking_for_job, 'enrolled_university', 'Looking for jobs')

In [None]:
# University-enrolment shares among the target == 0 rows.
pieplot(not_looking, 'enrolled_university', 'Not looking for jobs')

Proportion of employees looking for jobs is more for full time course.

We can also see that the proportion of employees not looking for jobs is higher among those with no enrolment in a university.

Proportion of employees having part time course doesn't change much.

In [None]:
# Education-level shares among the target == 1 rows.
pieplot(looking_for_job, 'education_level', 'Looking for jobs')

In [None]:
# Education-level shares among the target == 0 rows.
pieplot(not_looking, 'education_level', 'Not looking for jobs')

Are company size and company type related? Can a Pvt. Ltd. company be as small as <10 employees?

In [None]:
# Share of each company type within every company-size bucket, with the
# buckets ordered by their 'Pvt Ltd' share (largest first).
size_by_type = (
    train_df.groupby(['company_size'])['company_type']
    .value_counts(normalize=True)
    .unstack()
    .sort_values(by='Pvt Ltd', ascending=False)
)

size_by_type.plot(kind='bar', stacked=True)
plt.title('Distribution of company size by company type')
plt.xticks(rotation=45, horizontalalignment='right', fontweight='light')
plt.legend(loc=(1.04, 0))  # place the legend outside the axes, to the right
plt.show()

They do not look strongly related, although a few patterns show up; for example, startups do not have more than 5000 employees.

In [None]:
# Company-size counts, with each bar split by target value.
bar_plot(train_df, 'company_size', hue='target')

In [None]:
# last_new_job counts, with each bar split by target value.
bar_plot(train_df, 'last_new_job', hue='target')

## Cleaning Data

Since enrolled_university, education_level, experience and last_new_job features have less than 3% missing values, we can drop these rows and still not lose much data.

In [None]:
# Remove, in place, every row that is missing a value in any of these columns.
required_cols = ['enrolled_university', 'education_level', 'experience', 'last_new_job']
train_df.dropna(subset=required_cols, inplace=True)

In [None]:
# Re-check the share of missing values per column (percentage) after the row drop.
train_df.isna().mean().mul(100)

In [None]:
# Separate the label column from the feature columns.
y = train_df['target']
X = train_df.drop(columns='target')

Looks like data is missing at random. So let's use the mode to replace missing values

In [None]:
# Fill the remaining gaps with each column's most frequent value, then rebuild
# a DataFrame that keeps the original column names and row index.
mode_imputer = SimpleImputer(strategy="most_frequent")
X_imp = mode_imputer.fit_transform(X)
X_pd = pd.DataFrame(data=X_imp, index=X.index, columns=X.columns)

Label encoding the relevent_experience, education_level, company_size, experience and last_new_job features

In [None]:
# Positional indices of the three ordinal columns within X_pd.
ordinal_cols = ['relevent_experience', 'education_level', 'company_size']
rel_exp_idx, edu_idx, comp_size_idx = (X_pd.columns.get_loc(name) for name in ordinal_cols)

In [None]:
def label_encode(X):
    """Ordinal-encode the ordered categorical columns of ``X`` in place.

    ``relevent_experience``, ``education_level`` and ``company_size`` are
    replaced by integer ranks. In ``experience`` and ``last_new_job`` only the
    open-ended labels are rewritten ('>20' -> 21, '<1' -> 0, 'never' -> 0,
    '>4' -> 5); every other value is left untouched.

    Columns are addressed by name, so the function does not depend on column
    order or on index variables computed in another cell.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the five columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place. Code that keeps
        using the original frame therefore sees the encoded values.

    Raises
    ------
    ValueError
        If one of the three rank-encoded columns holds a value (including a
        missing value) that has no rank in the tables below.
    """
    ordinal_ranks = {
        'relevent_experience': {'No relevent experience': 0,
                                'Has relevent experience': 1},
        'education_level': {'Primary School': 0,
                            'High School': 1,
                            'Graduate': 2,
                            'Masters': 3,
                            'Phd': 4},
        'company_size': {'<10': 0,
                         '10/49': 1,
                         '50-99': 2,
                         '100-500': 3,
                         '500-999': 4,
                         '1000-4999': 5,
                         '5000-9999': 6,
                         '10000+': 7},
    }
    for col, ranks in ordinal_ranks.items():
        encoded = X[col].map(ranks)
        unmapped = encoded.isna()
        if unmapped.any():
            # Name the offending labels instead of failing later with an
            # opaque NaN-to-int cast error.
            raise ValueError("Unexpected value(s) in '{}': {}".format(
                col, X.loc[unmapped, col].unique().tolist()))
        X[col] = encoded.astype(int)

    # Only the open-ended labels are rewritten; other entries are left as they are.
    X.loc[X['experience'] == '>20', 'experience'] = 21
    X.loc[X['experience'] == '<1', 'experience'] = 0
    X.loc[X['last_new_job'] == 'never', 'last_new_job'] = 0
    X.loc[X['last_new_job'] == '>4', 'last_new_job'] = 5
    return X

# Wrap label_encode as a scikit-learn transformer and apply it to the imputed frame.
# NOTE(review): later cells keep using X_pd rather than `encoded`; that works only
# because label_encode modifies its argument in place -- confirm that
# FunctionTransformer hands the frame over without copying it.
my_encoder = FunctionTransformer(label_encode)
encoded = my_encoder.fit_transform(X_pd)

In [None]:
# Cast the numeric columns to numeric dtypes, column by column.
numeric_dtypes = {
    'experience': int,
    'last_new_job': int,
    'city_development_index': float,
    'training_hours': int,
}
for column, dtype in numeric_dtypes.items():
    X_pd[column] = X_pd[column].astype(dtype)

City has many unique values and city development index has the information for city. So we can drop city feature.

In [None]:
# Drop the high-cardinality city column. errors='ignore' keeps this cell safe
# to re-run: without it, a second execution raises KeyError.
X_pd.drop(columns=['city'], inplace=True, errors='ignore')

In [None]:
# Check the resulting dtypes and non-null counts.
X_pd.info()

Creating pipeline for transformation

In [None]:
# Columns routed to StandardScaler (num) and to OneHotEncoder (cat) by the
# ColumnTransformer built in the next cell.
num_attribs = ['city_development_index', 'training_hours']
cat_attribs = ['gender', 'enrolled_university', 'major_discipline', 'company_type']

In [None]:
# Scale the numeric columns, one-hot encode the nominal ones (dropping the first
# level of each), and pass every remaining column through unchanged.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` -- confirm against the installed version.
preprocessing = ColumnTransformer([
    ("num", StandardScaler(), num_attribs),
    ("cat", OneHotEncoder(drop='first', sparse=False), cat_attribs)
], remainder="passthrough")

# NOTE(review): the transformer is fitted on the full data set, before the
# train/test split performed further down, so test rows influence the scaling.
hr_processed = preprocessing.fit_transform(X_pd)
hr_processed.shape

In [None]:
# Rebuild the output column names of `preprocessing`: names produced by the
# explicit transformers first, then the passthrough (remainder) columns.
col_names = []
for _name, transformer, cols in preprocessing.transformers_[:-1]:
    # Prefer get_feature_names_out and fall back to the legacy
    # get_feature_names; transformers exposing neither keep their input names.
    if hasattr(transformer, 'get_feature_names_out'):
        cols = transformer.get_feature_names_out(cols)
    elif hasattr(transformer, 'get_feature_names'):
        cols = transformer.get_feature_names(cols)
    col_names += list(cols)

# The remainder entry is the last one in transformers_. Its column spec is
# handled both as integer positions and as column names.
# NOTE(review): which of the two forms appears depends on the scikit-learn version.
remainder_cols = preprocessing.transformers_[-1][2]
col_names += [c if isinstance(c, str) else X_pd.columns[c] for c in remainder_cols]

Retrieving column names after transformation

In [None]:
# Stratified 80/20 split. random_state pins the shuffle so the split, and every
# score computed from it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    hr_processed, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

## Training models

In [None]:
def evaluate_model(model, X, y, cv=5, random_state=None):
    """Fit ``model`` behind a SMOTE oversampling step and report its ROC AUC.

    The estimator is wrapped in an imblearn ``Pipeline`` whose first step is
    ``SMOTE``. The pipeline is fitted on ``X``/``y``, scored with
    ``cross_val_score`` using ``scoring='roc_auc'``, and the mean score is
    printed.

    Parameters
    ----------
    model : estimator
        Classifier placed after the SMOTE step.
    X, y : array-like
        Training features and labels.
    cv : int, default 5
        Value forwarded to ``cross_val_score`` as ``cv``.
    random_state : int or None, default None
        Seed forwarded to ``SMOTE``; ``None`` keeps the resampling unseeded.

    Returns
    -------
    imblearn.pipeline.Pipeline
        The pipeline fitted on the full ``X``/``y``.
    """
    clf = Pipeline([
        # Named after what the step does; it resamples, it is not a no-op.
        ('smote', SMOTE(random_state=random_state)),
        ('model', model),
    ])

    clf.fit(X, y)

    auc = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
    print("AUC score of " + model.__class__.__name__ + " = " + str(auc.mean()))

    return clf
    

In [None]:
# Fit and cross-validate each candidate classifier on the training split.
# Every call prints the mean cross-validated ROC AUC and returns the fitted pipeline.
clf_Log_reg = evaluate_model(LogisticRegression(max_iter=1000), X_train, y_train)
clf_SGD = evaluate_model(SGDClassifier(early_stopping=True, warm_start=True), X_train, y_train)
clf_KNN = evaluate_model(KNeighborsClassifier(), X_train, y_train)
clf_RF = evaluate_model(RandomForestClassifier(), X_train, y_train)
CLF_ADA = evaluate_model(AdaBoostClassifier(n_estimators=200), X_train, y_train)
CLF_XGB = evaluate_model(XGBClassifier(), X_train, y_train)

In [None]:
# Overlay the test-set ROC curve of every fitted pipeline on one set of axes.
ax = plt.gca()
roc_candidates = [
    (clf_Log_reg, "Logistic Regression"),
    (clf_SGD, "SGD Classifier"),
    (clf_KNN, "KNN Classifier"),
    (clf_RF, "Random Forest Classifier"),
    (CLF_ADA, "AdaBoost Classifier"),
    (CLF_XGB, "XGBoost Classifier"),
]
for fitted_clf, curve_label in roc_candidates:
    plot_roc_curve(fitted_clf, X_test, y_test, ax=ax, alpha=0.8, name=curve_label)
plt.show()

It looks like the XGBoost Classifier performs well on both our training set and our test set.

## Conclusion and model inference

In [None]:
# Permutation importance of each feature for the XGBoost pipeline on the test
# split, averaged over 30 shuffles. random_state makes the shuffles repeatable.
importances = permutation_importance(CLF_XGB, X_test, y_test, n_repeats=30, random_state=42)

In [None]:
# Pair each mean importance with its feature name and plot them as a bar chart.
importances_df = pd.DataFrame({
    'Average Importance': importances.importances_mean,
    'Features': col_names,
})

fig = px.bar(importances_df, x='Features', y='Average Importance')
fig.show()

### To conclude

* As we visualized, city development plays the most important role in determining whether an employee is looking for jobs.
* Our model is not very accurate, so I would think twice before using it in an actual HR department. We could tune hyperparameters to increase its accuracy, but at best this model can be used to validate the findings of the HR department rather than to replace them.
* Also looking at the model inference tells us that we can still do some feature selection to improve performance of our model.
* On a lighter note, feels good to see that gender has least role in determining whether employee can look for other jobs. Seems like everyone has equal opportunities these days.

## Please do upvote if you like this notebook, it encourages me to share more notebooks and suggestions are always welcomed.