## HR Analytics: Job Change of Data Scientists

## Importing Data and Exploratory Analysis

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load aug_train.csv into a DataFrame and preview its first rows.
df= pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
df.head()

In [None]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

In [None]:
# Print the column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (pandas' default column selection).
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# For every column, print its distinct values followed by the percentage
# share of its ten most frequent values.
for column in df.columns:
    distinct = df[column].unique()
    print('Uniques values in {} are: {}\n'.format(column, distinct))
    shares = df[column].value_counts(normalize=True).head(10) * 100
    print(shares.round(2))
    print('-' * 100)

In [None]:
# Missing Values
# Each listed column has its NaNs replaced by one fixed default category.
fill_defaults = {
    'gender': 'Unknow',
    'enrolled_university': 'no_enrollment',
    'education_level': 'Graduate',
    'major_discipline': 'STEM',
    'experience': '20',
    'company_size': '50-99',
    'company_type': 'Pvt Ltd',
    'last_new_job': '1',
}
for column, default in fill_defaults.items():
    df[column] = df[column].fillna(default)

In [None]:
# Re-count the missing values per column after the fill step.
df.isna().sum()

## Data Visualization

In [None]:
# Global plot styling used by the charts that follow.
sns.set_style('darkgrid')
sns.set_palette('YlOrRd')

# Pie chart of how the rows are distributed over the target classes.
df_target = df.target.value_counts()

fig, ax = plt.subplots(figsize=(12, 7))
ax.pie(
    df_target,
    labels=df_target.index,
    autopct='%.1f%%',
    startangle=90,
    explode=[0.05, 0.05],
    shadow=True,
)
ax.legend(['Not looking for job change', 'Looking for a job change'])
ax.set_title('Target', fontsize=16)

plt.show()

In [None]:
# Categorical features to compare against the target. Each feature is listed
# exactly once so that no count plot is drawn twice.
lista = ['gender', 'relevent_experience', 'enrolled_university', 'education_level',
         'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

plt.figure(figsize=(20, 35))

# One count plot per feature on a 5x2 grid, bars split by target class.
# Nine features leave the last grid cell empty.
for n, i in enumerate(lista, start=1):
    plt.subplot(5, 2, n)
    sns.countplot(x=i, hue='target', data=df)
    plt.title("Countplot of {}  by {}".format(i, 'target'), fontsize=16)
    plt.legend(['Not looking for job change', 'Looking for a job change'])


In [None]:
# 2x2 grid of pie charts showing the category shares of four features.
# Each spec is (column, chart title, per-wedge explode offsets); the number of
# offsets has to equal the number of distinct values in that column.
pie_specs = [
    ('gender', 'Gender', [0.05, 0.05, 0.05, 0.1]),
    ('relevent_experience', 'Relevent Experience', [0.05, 0.05]),
    ('education_level', 'Education Level', [0.05, 0.05, 0.05, 0.05, 0.2]),
    ('company_type', 'Company Type', [0.05, 0.05, 0.05, 0.05, 0.2, 0.3]),
]

plt.figure(figsize=(15, 15))

for position, (column, heading, offsets) in enumerate(pie_specs, start=1):
    plt.subplot(2, 2, position)
    counts = df[column].value_counts()
    plt.pie(counts, labels=counts.index, autopct='%.1f%%', startangle=90,
            explode=offsets, shadow=True)
    plt.title(heading, fontsize=16)

plt.show()

In [None]:
# Histogram of training_hours with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(x=df.training_hours, kde=True, color='r', ax=ax)
ax.set_title('Training Hours', fontsize=16)
plt.show()

In [None]:
# Histogram of city_development_index (60 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(x=df.city_development_index, kde=True, color='r', bins=60, ax=ax)
ax.set_title('City Development Index', fontsize=16)
plt.show()

## Transforming Data

In [None]:
_GENDER_CODES = {'Unknow': 3, 'Female': 2, 'Male': 1, 'Other': 0}


def gender_to_numeric(x):
    """Return the integer code of a gender label, or None if it is not recognised."""
    return _GENDER_CODES.get(x)
    
_REL_EXPERIENCE_CODES = {
    'Has relevent experience': 1,
    'No relevent experience': 0,
}


def rel_experience(x):
    """Return 1 / 0 for the 'has' / 'no' relevant-experience labels, else None."""
    return _REL_EXPERIENCE_CODES.get(x)
    
_ENROLLMENT_CODES = {
    'no_enrollment': 0,
    'Full time course': 1,
    'Part time course': 2,
}


def enrollment(x):
    """Return the integer code of a university-enrollment label, or None if unknown."""
    return _ENROLLMENT_CODES.get(x)
    
_EDU_LEVEL_CODES = {
    'Graduate': 0,
    'Masters': 1,
    'High School': 2,
    'Phd': 3,
    'Primary School': 4,
}


def edu_level(x):
    """Return the integer code of an education-level label, or None if unknown."""
    return _EDU_LEVEL_CODES.get(x)
    
_MAJOR_CODES = {
    'STEM': 0,
    'Business Degree': 1,
    'Arts': 2,
    'Humanities': 3,
    'No Major': 4,
    'Other': 5,
}


def major(x):
    """Return the integer code of a major-discipline label, or None if unknown."""
    return _MAJOR_CODES.get(x)
    
# '1' .. '20' map to their own integer value; the two open-ended labels are
# placed just below and just above that range.
_EXPERIENCE_CODES = {str(years): years for years in range(1, 21)}
_EXPERIENCE_CODES['<1'] = 0
_EXPERIENCE_CODES['>20'] = 21


def experience(x):
    """Return the integer code (0-21) of an experience label, or None if unknown."""
    return _EXPERIENCE_CODES.get(x)
    
_COMPANY_TYPE_CODES = {
    'Pvt Ltd': 0,
    'Funded Startup': 1,
    'Early Stage Startup': 2,
    'Other': 3,
    'Public Sector': 4,
    'NGO': 5,
}


def company_t(x):
    """Return the integer code of a company-type label, or None if unknown."""
    return _COMPANY_TYPE_CODES.get(x)
    
_COMPANY_SIZE_CODES = {
    '<10': 0,
    '10/49': 1,
    '100-500': 2,
    '1000-4999': 3,
    '10000+': 4,
    '50-99': 5,
    '500-999': 6,
    '5000-9999': 7,
}


def company_s(x):
    """Return the integer code of a company-size label, or None if unknown."""
    return _COMPANY_SIZE_CODES.get(x)
    
_LAST_JOB_CODES = {
    'never': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '>4': 5,
}


def last_job(x):
    """Return the integer code (0-5) of a last_new_job label, or None if unknown."""
    return _LAST_JOB_CODES.get(x)
    
# Numeric suffixes of the 'city_<n>' labels; a label's code is the position of
# its suffix in this tuple (ten suffixes per row, codes 0-122).
_CITY_SUFFIXES = (
    103, 40, 21, 115, 162, 176, 160, 46, 61, 114,      # 0-9
    13, 159, 102, 67, 100, 16, 71, 104, 64, 101,       # 10-19
    83, 105, 73, 75, 41, 11, 93, 90, 36, 20,           # 20-29
    57, 152, 19, 65, 74, 173, 136, 98, 97, 50,         # 30-39
    138, 82, 157, 89, 150, 70, 175, 94, 28, 59,        # 40-49
    165, 145, 142, 26, 12, 37, 43, 116, 23, 99,        # 50-59
    149, 10, 45, 80, 128, 158, 123, 7, 72, 106,        # 60-69
    143, 78, 109, 24, 134, 48, 144, 91, 146, 133,      # 70-79
    126, 118, 9, 167, 27, 84, 54, 39, 79, 76,          # 80-89
    77, 81, 131, 44, 117, 155, 33, 141, 127, 62,       # 90-99
    53, 25, 2, 69, 120, 111, 30, 1, 140, 179,          # 100-109
    55, 14, 42, 107, 18, 139, 180, 166, 121, 129,      # 110-119
    8, 31, 171,                                        # 120-122
)
_CITY_CODES = {
    'city_{}'.format(suffix): code for code, suffix in enumerate(_CITY_SUFFIXES)
}


def city(x):
    """Return the integer code (0-122) of a 'city_<n>' label, or None if unknown."""
    return _CITY_CODES.get(x)

In [None]:
# Replace each categorical column with the integer codes produced by its
# matching encoder function.
encoders = {
    'gender': gender_to_numeric,
    'relevent_experience': rel_experience,
    'enrolled_university': enrollment,
    'education_level': edu_level,
    'major_discipline': major,
    'experience': experience,
    'company_type': company_t,
    'company_size': company_s,
    'last_new_job': last_job,
    'city': city,
}
for column, encode in encoders.items():
    df[column] = df[column].apply(encode)

In [None]:
# Preview the first rows after the encoding step.
df.head()

In [None]:
# Print the distinct values of every column, one separator line per column.
for column in df.columns:
    distinct = df[column].unique()
    print('Unique values {} are {}\n'.format(column, distinct))
    print('-' * 100)

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Pairwise correlation between the DataFrame's columns, drawn as an
# annotated heatmap.
corr = df.corr()

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr, annot=True, linewidths=1, cmap='YlOrRd', ax=ax)
ax.set_title('Correlation Matrix', fontsize=16)

plt.show()

## Machine Learning

### Train Data

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fixed seed shared by the split and the later randomised steps.
seed = 123
np.random.seed(seed)

# Features are every column except the label.
# NOTE(review): if the data carries a row-identifier column (e.g. an enrollee
# id), it is still part of x here -- consider dropping it; confirm.
x = df.drop(columns='target')
y = df.target

# 70/30 split, stratified on the label. random_state is passed explicitly so
# the partition does not depend on how much of NumPy's global RNG state was
# consumed by whichever cells ran before this one.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=seed)

In [None]:
# Candidate classifiers, each later wrapped in a pipeline that standardises
# the features first. The pipeline step is named after the classifier.
classifiers = [
    ('AdaBoost', AdaBoostClassifier()),
    ('RandomForest', RandomForestClassifier(max_depth=3)),
    ('DecisionTree', DecisionTreeClassifier(max_depth=3)),
    ('Logistic', LogisticRegression()),
    ('KNeighbors', KNeighborsClassifier()),
]
pipelines = [
    (label, Pipeline([('Scaler', StandardScaler()), (label, clf)]))
    for label, clf in classifiers
]
pipelines

In [None]:
np.random.seed(seed)

results = []  # per-model arrays of fold accuracies (in percent)
names = []    # model names, in the same order as `results`

# One splitter with a fixed random_state, built outside the loop, so every
# model is scored on the same ten shuffled folds. An unseeded shuffling KFold
# draws a new partition on each use, which would make the comparison between
# models depend on fold luck.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

for name, model in pipelines:
    cv_results = cross_val_score(estimator=model, X=x_train, y=y_train,
                                 scoring='accuracy', cv=kfold) * 100
    results.append(cv_results)
    names.append(name)
    print('{:15} : Average Accuracy {:.2f}% - Std {:.2f}%'.format(
        name, cv_results.mean(), cv_results.std()))

In [None]:
# Box plot of the per-fold accuracies collected for each model.
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(results, labels=names)
ax.set_title('Models Comparison', fontsize=16)

In [None]:
np.random.seed(seed)

# Hyper-parameter grid searched for the decision tree.
params_values = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [2, 3, 4, 5, 6],
}

# Exhaustive search over the grid, scored by 10-fold cross-validated accuracy.
model = DecisionTreeClassifier()
kfold = KFold(n_splits=10, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=params_values, scoring='accuracy', cv=kfold)
grid_results = grid.fit(x_train, y_train)
print('BEST: Accuracy {:.2f}% - Params {}'.format(grid_results.best_score_*100, grid_results.best_params_))

# Report mean and standard deviation of the accuracy for every combination.
cv_summary = grid_results.cv_results_
for mean, std, param in zip(cv_summary['mean_test_score'],
                            cv_summary['std_test_score'],
                            cv_summary['params']):
    print('Average Accuracy {:.2f}% - Std {:.2f}% - Params {}'.format(mean*100, std*100, param))


In [None]:
np.random.seed(seed)

# Fit a decision tree with fixed hyper-parameters on the training split and
# predict the labels of the held-out test split.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    splitter='best',
).fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Text summary: per-class precision/recall/F1 and overall accuracy on the test split.
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

print("Accuracy: {:.2f}%\n".format(accuracy_score(y_test, y_pred)*100))

# Confusion Matrix
plt.figure(figsize=(20, 7))

# Both panels share the same tick labels so they can be read side by side.
# confusion_matrix puts true classes on rows and predictions on columns.
xlabels = ['Predicted Not looking for job change', 'Predicted Looking for a job change']
ylabels = ['Not looking for job change', 'Looking for a job change']

# Normal: raw counts
plt.subplot(1, 2, 1)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, linewidths=2, fmt='d', xticklabels=xlabels, yticklabels=ylabels)
plt.title('Confusion Matrix - Normal', fontsize=14)

# Percent: each row normalised over the true class. The '.1%' format prints
# the fractions as percentages, matching the panel title.
plt.subplot(1, 2, 2)
cmp = confusion_matrix(y_test, y_pred, normalize='true')
sns.heatmap(cmp, annot=True, linewidths=2, fmt='.1%', xticklabels=xlabels, yticklabels=ylabels)
plt.title('Confusion Matrix - Percent', fontsize=14)

plt.show()