In [None]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

from xgboost import XGBClassifier, cv
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training split of the HR-analytics dataset and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer="../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
)
data.head(n=10)

In [None]:
# Quick structural overview of the raw frame: dtypes, summary stats, null counts.
print("<<<<<<<<<<          Data info          >>>>>>>>>>")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must be called on its own; wrapping it in print() emits a stray "None".
print()
data.info()
print()
print("<<<<<<<<<<          Data describe          >>>>>>>>>>")
print("\n", data.describe(), "\n")
print("<<<<<<<<<<          Count null          >>>>>>>>>>")
# Per-column count of missing values.
print("\n", data.isnull().sum(axis=0), "\n")

In [None]:
# Discard the columns that are not used in the rest of this notebook.
unused_columns = ['gender', 'major_discipline', 'company_size', 'company_type']
data = data.drop(columns=unused_columns)

In [None]:
# Distribution of rows per city, split by the target label.
px.histogram(data_frame=data, x='city', color='target')

In [None]:
# Distribution of the raw experience values, split by the target label.
px.histogram(data_frame=data, x='experience', color='target')

In [None]:
# Distribution of the relevant-experience flag, split by the target label.
px.histogram(data_frame=data, x='relevent_experience', color='target')

In [None]:
# Distribution of education level, split by the target label.
px.histogram(data_frame=data, x='education_level', color='target')

In [None]:
# Clean the `experience` column and derive a coarse `experience_level` band.

# Rows without an experience value are dropped; the index is reset so the frame
# keeps a contiguous 0..n-1 index afterwards.
data = data.dropna(subset=['experience']).reset_index(drop=True)

# Strip any '>' / '<' markers in one vectorised pass (instead of a per-row
# .loc loop) and convert the remaining digits to integers.
data['experience'] = (
    data['experience']
    .astype(str)
    .str.replace('>', '', regex=False)
    .str.replace('<', '', regex=False)
    .astype(int)
)

# Band the years of experience. The 'Mid' band includes its lower bound: with
# strict inequalities on both sides, a value of exactly 10 matched no band and
# was left as NaN. 'High' uses >= so any value at or above 20 is labelled.
data.loc[data['experience'] < 10, 'experience_level'] = 'Low'
data.loc[data['experience'].between(10, 19), 'experience_level'] = 'Mid'
data.loc[data['experience'] >= 20, 'experience_level'] = 'High'

In [None]:
# Convert `last_new_job` to an integer number of years.
# Missing values and the literal 'never' both become 0; '>' / '<' markers are
# stripped. Done with vectorised string ops rather than a per-row .loc loop,
# which was slow and only worked when the index was exactly 0..n-1.
data['last_new_job'] = (
    data['last_new_job']
    .fillna('0')
    .astype(str)
    .str.replace('>', '', regex=False)
    .str.replace('<', '', regex=False)
    .str.replace('never', '0', regex=False)
    .astype(int)
)

In [None]:
# Impute missing `enrolled_university` values as 'no_enrollment' and store the
# column as a categorical.
# fillna targets real missing values only; the earlier approach (stringify,
# then substring-replace 'nan' row by row) would also rewrite any label that
# merely contains the letters "nan", and relied on a 0..n-1 index.
data['enrolled_university'] = (
    data['enrolled_university']
    .fillna('no_enrollment')
    .astype('category')
)

In [None]:
# Impute missing `education_level` values as 'Graduate' and store the column as
# a categorical.
# The final cast previously targeted `enrolled_university` (a copy-paste slip),
# leaving `education_level` as plain strings in this cell; it now casts the
# column this cell actually cleans. fillna replaces the per-row loop that
# substring-replaced the text 'nan'.
data['education_level'] = (
    data['education_level']
    .fillna('Graduate')
    .astype(str)
    .astype('category')
)

In [None]:
# Store every label-like column, including the target, with the pandas
# 'category' dtype.
categorical_columns = [
    'city',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'target',
]
for column_name in categorical_columns:
    data[column_name] = data[column_name].astype('category')

In [None]:
# Preview the cleaned frame (first five rows).
data.head(n=5)

In [None]:
# One-hot encode the three categorical feature columns, one frame per column.
relevent_dummies, enrolled_dummies, education_dummies = (
    pd.get_dummies(data[column_name])
    for column_name in ('relevent_experience', 'enrolled_university', 'education_level')
)

In [None]:
# Display the one-hot frame built from `enrolled_university` to check its columns.
enrolled_dummies

In [None]:
# Assemble the modelling table: numeric features, then the one-hot blocks, with
# the target placed as the last column.
numeric_features = data[['city_development_index', 'experience', 'last_new_job', 'training_hours']]
train_parts = [
    numeric_features,
    relevent_dummies,
    enrolled_dummies,
    education_dummies,
    data['target'],
]
train_data = pd.concat(train_parts, axis=1)

In [None]:
# Display the assembled modelling table (features followed by `target`).
train_data

In [None]:
# Every column except the last is a feature; the last column is the label.
x_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

In [None]:
# Exhaustive hyper-parameter search for an XGBoost classifier, scored by ROC AUC
# with 5-fold cross-validation.
xgb = XGBClassifier()

# 'binary:logistic' makes the model output probabilities, which is what ROC AUC
# needs to rank examples. 'binary:hinge' emits hard 0/1 predictions, so the AUC
# was being computed on two-valued scores and could not separate candidates well.
xgb_param_grid = {
    'max_depth': [10, 20, 30],
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.1, 0.2, 0.3],
    'use_label_encoder': [False],
    'objective': ["binary:logistic"],
}

hr_grid = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    scoring='roc_auc',
    n_jobs=8,
    cv=5,
    refit=True,               # refit the best configuration on the full training set
    return_train_score=True,  # keep train scores to compare against the test folds
)

hr_grid.fit(x_train, y_train)

In [None]:
# Tabulate the cross-validation results, one row per parameter combination.
hr_grid_df = pd.DataFrame(data=hr_grid.cv_results_)
hr_grid_df

In [None]:
# Show the parameter combination(s) ranked first on the test score.
is_top_ranked = hr_grid_df['rank_test_score'].eq(1)
hr_grid_df.loc[is_top_ranked]