# **HR Analytics: Predict the probability of a Data Scientist leaving the job and understand the features impacting it**

In [None]:
#Importing required packages
import pandas as pd
import numpy as np

In [None]:
#Installing the PyCaret library used for model training below, and importing it as `pc`
# NOTE(review): the later setup(..., silent=True) call and the 'Score' prediction column
# look like the PyCaret 2.x API — consider pinning the installed version; confirm.
!pip install pycaret
import pycaret as pc

In [None]:
#Reporting which PyCaret version this notebook is running against
pycaret_version = pc.__version__
print('PyCaret: %s' % pycaret_version)

In [None]:
#Loading the training split of the dataset into a DataFrame
df_train = pd.read_csv(
    filepath_or_buffer='../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
)

In [None]:
#Checking the (rows, columns) dimensions of the training data
df_train.shape

In [None]:
#Previewing the first 5 rows (DataFrame.head() defaults to n=5)
df_train.head()

In [None]:
#Checking column names, dtypes and non-null counts
df_train.info()

# **Visualization to Understand the data**

**Female Data Scientists have higher chances of leaving a Job compared to other Genders**

In [None]:
import plotly.express as px

# Number of enrollees for every (target, gender) pair; groupby leaves out rows
# whose gender is missing.
gender_counts = df_train.groupby(['target', 'gender']).size()

# Share of each target class *within* a gender, in percent.
# transform('sum') returns a Series on the same (target, gender) index as the
# counts, so the division is index-aligned. The previous
# groupby(level=1).apply(...).values relied on positional order, which is
# gender-major on pandas >= 2.0 and therefore paired percentages with the
# wrong rows of the target-major count table.
gender_pct = 100 * gender_counts / gender_counts.groupby(level='gender').transform('sum')

# Columns come out as: target, gender, Counts, Percentage
df_gender = (
    gender_counts.to_frame('Counts')
    .assign(Percentage=gender_pct)
    .reset_index()
)

# Bar height = count, bar label = within-gender percentage
px.bar(df_gender, x='gender', y=['Counts'], color='target', text=df_gender['Percentage'].apply(lambda x: '{0:1.2f}%'.format(x)))

**Data Scientists with no relevant experience find it difficult to continue in the job**

In [None]:
import plotly.express as px

# Number of enrollees for every (target, relevent_experience) pair.
exp_counts = df_train.groupby(['target', 'relevent_experience']).size()

# Share of each target class *within* an experience group, in percent.
# transform('sum') keeps the (target, relevent_experience) index of the counts,
# so the division is index-aligned. The previous
# groupby(level=1).apply(...).values relied on positional order, which is
# group-major on pandas >= 2.0 and therefore paired percentages with the
# wrong rows of the target-major count table.
exp_pct = 100 * exp_counts / exp_counts.groupby(level='relevent_experience').transform('sum')

# Columns come out as: target, relevent_experience, Counts, Percentage
df_exp = (
    exp_counts.to_frame('Counts')
    .assign(Percentage=exp_pct)
    .reset_index()
)

# Bar height = count, bar label = within-group percentage
px.bar(df_exp, x='relevent_experience', y=['Counts'], color='target', text=df_exp['Percentage'].apply(lambda x: '{0:1.2f}%'.format(x)))

In [None]:
#Checking count of nulls in each column before imputation
df_train.isna().sum()

**Missing Values Imputation**

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

In [None]:
#Imputing null values with each column's mode (most frequent value), not the mean:
#these columns are filled with a value that already occurs in the column.
mode_impute_columns = [
    'gender',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]

for column in mode_impute_columns:
    most_frequent = df_train[column].mode(dropna=True)[0]
    # Assign the result back instead of df_train[column].fillna(..., inplace=True):
    # the in-place call on a selected column is chained assignment, which warns
    # on pandas 2.x and leaves df_train unchanged under Copy-on-Write.
    df_train[column] = df_train[column].fillna(most_frequent)

In [None]:
#Re-checking count of nulls in each column after imputation
df_train.isna().sum()

In [None]:
#Building PyCaret model with default hyperparameters
from pycaret.classification import *

# session_id fixes the experiment's random seed so the train/hold-out split and
# the models trained in the following cells are reproducible on a fresh run.
pycar = setup(df_train, target = 'target', silent=True, session_id=42)

# Optional setup() arguments kept for experimentation (not active):
#   train_size = 0.6, categorical_features = ['gender'],
#   ignore_features = ['company_size'],
#   ordinal_features = {'experience' : ['low', 'medium', 'high']},
#   high_cardinality_features = ['city'],
#   normalize = True, normalize_method = 'minmax',
#   transformation = True, feature_interaction = True, feature_ratio = True,
#   polynomial_features = True

In [None]:
#Comparing the performance of the different candidate models
compare_models()

**We can see lightgbm has best performance hence choosing this model**

In [None]:
# NOTE(review): the markdown above picks lightgbm, but 'gbc' looks like PyCaret's ID for the
# Gradient Boosting Classifier (LightGBM's ID is 'lightgbm') — confirm which model is intended.
gbc=create_model('gbc')

In [None]:
#Plotting the model's decision boundary
plot_model(gbc, plot = 'boundary')

In [None]:
#Calibrate the model
# NOTE(review): calib_lgbm is not referenced by any later cell visible here — evaluation and
# prediction below still use the uncalibrated gbc; confirm whether that is intended.
calib_lgbm = calibrate_model(gbc)

In [None]:
#Evaluate the (uncalibrated) gbc model
evaluate_model(gbc)

**Preparing the model output on test population**

In [None]:
# read the test data
test_data_classification = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
# make predictions
predictions = predict_model(gbc, data=test_data_classification)
# build the submission table: enrollee id plus the model score, renamed to 'target'
# NOTE(review): in PyCaret 2.2+ 'Score' appears to be the probability of the predicted
# Label rather than of class 1 — confirm; predict_model(..., raw_score=True) exposes
# per-class scores if the positive-class probability is what should be submitted.
# rename() returns a new frame, which avoids the SettingWithCopyWarning that
# rename(inplace=True) raised on the column slice.
final_sub = predictions[['enrollee_id', 'Score']].rename(columns={'Score': 'target'})
# index=False keeps the row index out of the file, so the CSV holds exactly the
# enrollee_id and target columns.
final_sub.to_csv('final_submission.csv', index=False)

**Interpreting the Impact of features on Output probability**

In [None]:
#interpret_model(gbc)

In [None]:
# Calling predict_model without `data` scores gbc on the hold-out split created by setup()
gbc_holdout_pred = predict_model(gbc)

In [None]:
#Credits: https://towardsdatascience.com/machine-learning-made-easier-with-pycaret-907e7124efe6