In [None]:

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



## Read Data

In [None]:
# Load the training split of the HR analytics data set and preview the first rows.
df = pd.read_csv(
    "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
)
df.head()

## First look and Some Data Controls

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Use enrollee_id as the row index.
# The guard keeps this cell re-runnable: once the column has been moved into
# the index, a second set_index('enrollee_id') would raise KeyError.
if df.index.name != 'enrollee_id':
    df = df.set_index('enrollee_id')
df.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

## Missing Data Control

In [None]:
# Percentage of missing values per column, rounded to two decimals.
(df.isnull().sum() / df.shape[0] * 100).round(2)

In [None]:
# Drop every row that has a missing value in any of these four columns.
low_missing_columns = [
    "enrolled_university",
    "education_level",
    "experience",
    "last_new_job",
]
df2 = df.dropna(subset=low_missing_columns)
df2.head()

Dropped the rows with missing values in the columns whose missing ratio is lower than 5%.

## Exploratory Data Analysis

In [None]:
# Class balance of the target.
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df2, x="target")

The target is imbalanced, so I will use an oversampling method (SMOTE).

## City

In [None]:
# Share (in %) of all job changers (target == 1) that come from each city.
# NOTE(review): this feature is derived from the target on the full data set,
# before the train/test split further down, so it leaks label information
# into the test rows - consider computing it on the training split only.
job_changers = df2[df2["target"] == 1]
df_city_freq = (
    job_changers.groupby("city")["target"].count()
    .div(job_changers.shape[0])
    .mul(100)
    .round(2)
    .rename("city_freq")
    .reset_index()
)

# Left-merge so cities without any job changer are kept; they get NaN here.
df3 = df2.reset_index().merge(df_city_freq, how='left', on='city').set_index('enrollee_id')
# Assign the filled column back instead of `df3["city_freq"].fillna(0, inplace=True)`:
# the chained in-place form operates on a temporary Series and does not update
# df3 when pandas copy-on-write is active.
df3["city_freq"] = df3["city_freq"].fillna(0)

# Three quantile bins of city_freq, encoded as integer labels.
df3["ordinal_city"] = pd.qcut(df3["city_freq"], 3)

enc = preprocessing.LabelEncoder()
df3["ordinal_city"] = enc.fit_transform(df3["ordinal_city"])

plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="ordinal_city", data=df3, hue="target")

Cities are divided into 3 groups based on each city's share of all job changers (`city_freq`).

## city_development_index

In [None]:
# Density of city_development_index, one curve per target class.
plt.figure(figsize=(8, 6))
for target_value, curve_color in ((1, "Red"), (0, "Blue")):
    ax = sns.kdeplot(
        df3.loc[df3["target"] == target_value, "city_development_index"],
        color=curve_color,
    )
# Legend entries follow the drawing order above: target == 1 first, then target == 0.
ax.legend(["Looking for a job change","Not looking for job change"], loc='upper right')
ax.set(
    ylabel='Density',
    xlabel="city_development_index",
    title="Distribution of city_development_index by Job change",
);

In [None]:
# Box plot of city_development_index for each target class.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df3, x="target", y="city_development_index");

In [None]:
# Four quantile bins of city_development_index, encoded as integer labels.
df3["ordinal_development_index"] = pd.qcut(df3["city_development_index"], 4)

enc = preprocessing.LabelEncoder()
df3["ordinal_development_index"] = enc.fit_transform(df3["ordinal_development_index"])

plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="ordinal_development_index", data=df3, hue="target")

city_development_index is divided into 4 groups. Data scientists living in cities with a low development index tend to change their job more often.

## gender

In [None]:
# Job-change counts per gender.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="gender", data=df3, hue="target")

In [None]:
# Keep every row whose gender is not "Other" (rows with a missing gender are kept).
# .copy() makes df4 an independent frame, so the column fills applied to it
# later do not act on a slice of df3.
df4 = df3[df3["gender"] != "Other"].copy()
df4.head()

In [None]:
# Fill missing gender with "Male".
# Building a new frame replaces `df4.gender.fillna(..., inplace=True)`, which
# fills a temporary Series taken from a slice and is not guaranteed to
# propagate to df4 (it never does under pandas copy-on-write).
df4 = df4.assign(gender=df4["gender"].fillna("Male"))

Dropped Other gender. Filled missing values with most frequent value.

## relevent_experience

In [None]:
# Job-change counts per relevent_experience category.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="relevent_experience", data=df4, hue="target")

## enrolled_university

In [None]:
# Job-change counts per enrolled_university category.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="enrolled_university", data=df4, hue="target")

## education_level

In [None]:
# Job-change counts per education_level category.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="education_level", data=df4, hue="target")

In [None]:
# Ordinal encoding of education_level. Every value that is not listed in the
# mapping (including a missing value) falls into the top bucket, 4.
education_codes = {
    "Primary School": 0,
    "High School": 1,
    "Graduate": 2,
    "Masters": 3,
}

# A direct map + assign replaces the nested np.where and the positional
# reset_index/merge round trip: the new column is aligned on the existing
# enrollee_id index, so it cannot be attached to the wrong rows.
df5 = df4.assign(
    ordinal_education_level=df4["education_level"].map(education_codes).fillna(4).astype(int)
)

plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="ordinal_education_level", data=df5, hue="target")

Converted to ordinal values.

## major_discipline

In [None]:
# Job-change counts per major_discipline category.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="major_discipline", data=df5, hue="target")
ax.set_xticklabels(ax.get_xticklabels(), rotation=20);

In [None]:
# Fill missing major_discipline with "STEM".
# Assigning the filled column back replaces the chained in-place fillna, which
# acts on a temporary Series and does not update df5 under pandas copy-on-write.
df5["major_discipline"] = df5["major_discipline"].fillna("STEM")

Filled missing values with most frequent value.

## experience

In [None]:
# Job-change counts per experience value.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="experience", data=df5, hue="target")
ax.set_xticklabels(ax.get_xticklabels(), rotation=20);

In [None]:
# Experience buckets: code -> raw experience labels that belong to it.
# Every label that is not listed (including a missing value) gets the top bucket, 4.
experience_buckets = {
    0: ["<1", "1", "2", "3", "4", "5", "6", "7"],
    1: ["8", "9", "10", "11"],
    2: ["12", "13", "14", "15"],
    3: ["16", "17", "18", "19", "20"],
}
experience_codes = {
    label: code for code, labels in experience_buckets.items() for label in labels
}

# A direct map + assign replaces the nested np.where and the positional
# reset_index/merge round trip: the new column is aligned on the existing
# enrollee_id index, so it cannot be attached to the wrong rows.
df6 = df5.assign(
    ordinal_experience_cat=df5["experience"].map(experience_codes).fillna(4).astype(int)
)

plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="ordinal_experience_cat", data=df6, hue="target")

Experience is divided into 5 groups. Data scientists with little experience tend to change their job more often.

## company_size

In [None]:
# Job-change counts per company_size category.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="company_size", data=df6, hue="target")
ax.set_xticklabels(ax.get_xticklabels(), rotation=20);

In [None]:
# Fill missing company_size with "50-99".
# Assigning the filled column back replaces the chained in-place fillna, which
# acts on a temporary Series and does not update df6 under pandas copy-on-write.
df6["company_size"] = df6["company_size"].fillna("50-99")

Filled missing values with most frequent value.

## company_size (ordinal encoding)

In [None]:
# Ordinal encoding of company_size, smallest to largest. Every value that is
# not listed in the mapping (including a missing value) gets the top bucket, 7.
company_size_codes = {
    "<10": 0,
    "10/49": 1,
    "50-99": 2,
    "100-500": 3,
    "500-999": 4,
    "1000-4999": 5,
    "5000-9999": 6,
}

# A direct map + assign replaces the nested np.where and the positional
# reset_index/merge round trip: the new column is aligned on the existing
# enrollee_id index, so it cannot be attached to the wrong rows.
df7 = df6.assign(
    ordinal_company_size=df6["company_size"].map(company_size_codes).fillna(7).astype(int)
)

plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="ordinal_company_size", data=df7, hue="target")

Converted to ordinal values.

## company_type

In [None]:
# Job-change counts per company_type category.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="company_type", data=df7, hue="target")
ax.set_xticklabels(ax.get_xticklabels(), rotation=25);

In [None]:
# Fill missing company_type with "Pvt Ltd".
# Assigning the filled column back replaces the chained in-place fillna, which
# acts on a temporary Series and does not update df7 under pandas copy-on-write.
df7["company_type"] = df7["company_type"].fillna("Pvt Ltd")

Filled missing values with most frequent value.

## training_hours

In [None]:
plt.figure(figsize=[8,6])
ax = sns.kdeplot(df7["training_hours"][(df7["target"] == 1) ],color="Red")
ax = sns.kdeplot(df7["training_hours"][(df7["target"] == 0) ],color="Blue")
ax.legend(["Looking for a job change","Not looking for job change"],loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel("training_hours")
ax.set_title("Distribution of "+ "training_hours" +" by Job change");

In [None]:
# Box plot of training_hours for each target class.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df7, x="target", y="training_hours");

It looks like training hours have no effect on the target, so I'll drop this feature later.

## last_new_job

In [None]:
# Job-change counts per last_new_job category.
plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="last_new_job", data=df7, hue="target")
ax.set_xticklabels(ax.get_xticklabels(), rotation=25);

In [None]:
# Ordinal encoding of last_new_job. Every value that is not listed in the
# mapping (including a missing value) gets the top bucket, 5.
last_new_job_codes = {
    "never": 0,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
}

# A direct map + assign replaces the nested np.where and the positional
# reset_index/merge round trip: the new column is aligned on the existing
# enrollee_id index, so it cannot be attached to the wrong rows.
df8 = df7.assign(
    ordinal_last_new_job=df7["last_new_job"].map(last_new_job_codes).fillna(5).astype(int)
)

plt.figure(figsize=[8,6])
# x must be passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
ax = sns.countplot(x="ordinal_last_new_job", data=df8, hue="target")

Converted to ordinal values.

In [None]:
# Preview the frame with all engineered columns before the raw ones are dropped.
df8.head()

In [None]:
# Remove the raw columns that now have an ordinal counterpart, together with
# training_hours and the helper column city_freq.
raw_feature_columns = [
    "city",
    "city_development_index",
    "education_level",
    "experience",
    "last_new_job",
    "company_size",
    "training_hours",
    "city_freq",
]
df8 = df8.drop(columns=raw_feature_columns)

Dropped the original features that have been replaced by engineered ones, along with training_hours, which showed no effect on the target.

In [None]:
# Remaining columns, dtypes and non-null counts after the drop.
df8.info()

## Correlation Heatmap

In [None]:
# One-hot encode the remaining categorical columns.
df9 = pd.get_dummies(df8)

# Correlation of every column with the target, sorted ascending.
target_correlation = df9.corr()[["target"]].sort_values("target")

fig, ax = plt.subplots(figsize=(12, 12))
fig.suptitle('Correlation between Target and features', fontsize=20)
ax = sns.heatmap(
    target_correlation,
    vmax=1,
    vmin=-1,
    cmap="YlGnBu",
    annot=True,
    ax=ax,
);
# Flip the y axis so the row order of the sorted frame is reversed on screen.
ax.invert_yaxis()

## XGBClassifier with Parameter Tuning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from imblearn.pipeline import Pipeline as imbPipe
from imblearn.over_sampling import SMOTE
from xgboost import  XGBClassifier

import warnings
# Suppresses every warning category from this point on, which also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Separate the label column from the feature matrix.
y = df9["target"]
X = df9.drop(columns="target")

In [None]:
# 80/20 shuffled split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# SMOTE followed by XGBoost in one imblearn pipeline, so the oversampling is
# part of the estimator that GridSearchCV fits on each training fold.
# SMOTE is created without n_jobs: that parameter was deprecated in
# imbalanced-learn 0.10 and is scheduled for removal, so passing it breaks the
# cell on newer releases.
XGBoost_pipe = imbPipe([
    ("smote", SMOTE(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42, n_jobs=-1, tree_method="hist")),
])

# Hyper-parameter grid; the "XGBoost__" prefix routes each entry to the
# pipeline step of that name.
params = {
    "XGBoost__max_depth": [20, 21],
    "XGBoost__min_child_weight": [22, 23],
    "XGBoost__n_estimators": [25, 27],
    "XGBoost__subsample": [0.4, 0.5, 0.6],
    "XGBoost__colsample_bytree": [0.4, 0.5, 0.6],
    "XGBoost__gamma": [1, 2, 3],
}

# NOTE(review): both the grid search and the classifier request all cores
# (n_jobs=-1); this may oversubscribe the CPU - confirm it is intended.
XGBoost_grid = GridSearchCV(XGBoost_pipe, params, cv=3, n_jobs=-1, scoring="roc_auc")
XGBoost_grid.fit(X_train, y_train)
print("Best Parameters for Model:  ",XGBoost_grid.best_params_)

# Report on the training data itself (the held-out report is in the next cell).
y_pred = XGBoost_grid.predict(X_train)
print("\n")
print(classification_report(y_train, y_pred))

In [None]:
# Classification report on the held-out test split.
y_pred = XGBoost_grid.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
# Positive-class probabilities on the test split, computed once and reused for
# both the ROC curve and the AUC (the original called predict_proba twice).
test_proba = XGBoost_grid.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, test_proba, pos_label=1)
roc_auc = roc_auc_score(y_test, test_proba)

plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (AUC = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()