<img src = "https://media.giphy.com/media/Q6l0us2sAAAiL9RMnP/giphy.gif">

In [None]:
!pip install altair vega_datasets -q
!pip install dexplot -q
!pip install pycomp -q
!pip install datasist -q


## Step I Exploratory Data Analysis

In [None]:
from collections import Counter

import altair as alt
import datasist as ds
import dexplot as dxp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pycomp.viz.insights import *

plt.xkcd
alt.data_transformers.disable_max_rows()
%matplotlib inline


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,RobustScaler


import xgboost as xgb
import lightgbm as lgb


In [None]:
# Load the Kaggle train / test splits from the competition input directory.
DATA_DIR = '/kaggle/input/hr-analytics-job-change-of-data-scientists'
train_df = pd.read_csv(DATA_DIR + '/aug_train.csv')
test_df = pd.read_csv(DATA_DIR + '/aug_test.csv')

In [None]:
# Preview the last rows of the training frame (pandas default: 5 rows).
train_df.tail()

In [None]:
# Replace train_df with the output of datasist's categorical missing-value helper.
# NOTE(review): presumably imputes NaNs in the object columns — confirm the fill strategy in the datasist docs.
train_df = ds.feature_engineering.fill_missing_cats(train_df)

In [None]:
# Show datasist's missing-value report for the training frame (run after the categorical fill).
ds.structdata.display_missing(train_df)

In [None]:
# Show datasist's descriptive summary of the training frame.
ds.structdata.describe(train_df)

In [None]:
# Compare the train and test frames with datasist, using 'enrollee_id' as the index and 'gender' as the column.
# NOTE(review): the exact checks performed are defined by datasist — confirm against its docs.
ds.structdata.check_train_test_set(train_df, test_df, index='enrollee_id',col='gender'
                                )

In [None]:
# datasist box plots of the training frame with 'target' passed as the target column.
ds.visualizations.boxplot(train_df, target='target')

In [None]:
# dexplot count plot of the 'target' column (class balance), 'blackbody' colormap, 10x5 figure.
dxp.count('target',data=train_df,cmap='blackbody',figsize=(10,5))

In [None]:
# datasist 'catbox' plots of the training frame with 'target' passed as the target column.
# NOTE(review): presumably plots categorical features against the target — confirm in the datasist docs.
ds.visualizations.catbox(train_df, target='target')


In [None]:
# Green bar chart of training hours per education level; both fields appear in the tooltip.
edu_hours_chart = alt.Chart(train_df).mark_bar(color="green")
edu_hours_chart = edu_hours_chart.encode(
    x='education_level',
    y='training_hours',
    tooltip=['education_level', 'training_hours'],
)
edu_hours_chart.properties(height=400, width=600, title="Bar-Plot")

In [None]:
# Sum the development index per city, rank descending, and keep the ten largest totals.
cdi_by_city = train_df[['city', 'city_development_index']].groupby('city').agg('sum')
cdi_ranked = cdi_by_city.sort_values('city_development_index', ascending=False).reset_index()
city_grp = cdi_ranked.head(10)

In [None]:
# Both layers draw from the same top-10 city table.
city_base = alt.Chart(city_grp)

# Layer 1: one bar per city.
bar = city_base.mark_bar().encode(
    x='city',
    y='city_development_index',
    tooltip=['city', 'city_development_index'],
)

# Layer 2: red horizontal rule at the mean of the plotted values.
rule = city_base.mark_rule(color='red').encode(
    y='mean(city_development_index):Q',
    tooltip=['mean(city_development_index):Q'],
)

layered_chart = bar + rule
layered_chart.properties(height=400, width=600, title="Bar plot for City Developement Index")

In [None]:
# dexplot count plot of 'target', split by 'gender'.
dxp.count(val='target', data=train_df,split ='gender' ,figsize=(4,3))

In [None]:
# dexplot stacked horizontal count plot of 'education_level', split by 'enrolled_university',
# with 'major_discipline' passed as the col argument (presumably one panel per discipline — confirm in dexplot docs).
dxp.count(val='education_level', data=train_df, split='enrolled_university', 
          orientation='h', stacked=True, col='major_discipline')

In [None]:
# dexplot line plot: median 'training_hours' per 'company_size', one line per 'education_level'.
dxp.line(x='company_size', y='training_hours', data=train_df,split='education_level', aggfunc='median',figsize=(10,5))

In [None]:
# Horizontal bars of average training hours per education level, coloured by university enrollment.
avg_hours_chart = alt.Chart(train_df).mark_bar()
avg_hours_chart = avg_hours_chart.encode(
    x='average(training_hours)',
    y='education_level',
    color='enrolled_university',
)
avg_hours_chart.properties(height=400, width=600, title="Bar-Plot")

In [None]:
# Semi-transparent, unstacked bars of training hours per major discipline, coloured by education level.
hours_axis = alt.Y('training_hours:Q', stack=None)
discipline_chart = alt.Chart(train_df).mark_bar(opacity=0.7)
discipline_chart = discipline_chart.encode(
    x='major_discipline:O',
    y=hours_axis,
    color="education_level",
)
discipline_chart.properties(height=400, width=600, title="Bar-Plot")

In [None]:
# pycomp distribution plot of 'training_hours' with hue='target' and kind='kde'.
plot_distplot(df=train_df, col='training_hours', hue='target', kind='kde',
              title='Distribution of traning hrs')

In [None]:
# dexplot count plot of 'company_size', split by 'company_type'.
dxp.count('company_size',data = train_df,split = 'company_type',figsize=(10,8),title='Company Size with Company type')

In [None]:
# pycomp pie chart of the 'relevent_experience' column (column name is spelled this way in the data).
# The two-element explode tuple assumes the column has exactly two categories — TODO confirm.
plot_pie_chart(df=train_df, col='relevent_experience', explode=(0, 0.03),
               title='Pie chart for Experience')

In [None]:
# pycomp aggregation plot: mean 'city_development_index' grouped by 'company_type'.
plot_aggregation(df=train_df, group_col='company_type', value_col='city_development_index', aggreg='mean',
                 title='Plot for type of company with development index',figsize=(10,12))

In [None]:
# dexplot count plot of 'enrolled_university', split by 'education_level'.
dxp.count('enrolled_university',data = train_df,split = 'education_level',figsize=(10,8),title='Bar Plot for University Enrollment')

In [None]:
# Pairwise scatter / distribution grid over the training frame.
pair_grid = sns.pairplot(data=train_df)
plt.show()

In [None]:
# Frequency of each 'experience' value; plot the ten most common.
exp = train_df['experience'].value_counts()
top_exp = exp.iloc[:10]

plt.figure(figsize=(10, 7))
sns.barplot(x=top_exp.index, y=top_exp.values)
plt.xticks(rotation=45)
plt.title('Most Experienced Data Scientist', color='red', fontsize=15)

In [None]:
# Frequency of each 'last_new_job' value; plot the ten most common.
jd = train_df['last_new_job'].value_counts()
top_jd = jd.iloc[:10]

plt.figure(figsize=(10, 7))
sns.barplot(x=top_jd.index, y=top_jd.values)
plt.xticks(rotation=45)
plt.title('Last Job (in years)', color='blue', fontsize=15)

## Step II Data Preprocessing

In [None]:
# Re-check the datasist missing-value report for the training frame before preprocessing.
ds.structdata.display_missing(train_df)

In [None]:
# datasist missing-value report for the test frame (no imputation has been applied to test_df at this point).
ds.structdata.display_missing(test_df)

In [None]:
# Columns considered for the correlation matrix.
corr_columns = ["city","city_development_index","gender","relevent_experience","enrolled_university","education_level",
                "major_discipline","experience","company_size","company_type","last_new_job","training_hours","target"]

# The categorical columns are still strings here (label encoding happens later),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0. Restrict the
# frame to numeric columns explicitly, which matches what older pandas did silently.
coorelation = train_df[corr_columns].select_dtypes(include='number').corr()

sns.set(font_scale=1.10)
plt.figure(figsize=(15, 10))

sns.heatmap(coorelation,  linewidths=0.01,annot=True,linecolor="red",cmap='Wistia')
plt.title('Correlation between features');

In [None]:
# Integer-encode every object-typed column of the training frame, one fresh encoder per column.
train_object_cols = [col for col in train_df.columns if train_df[col].dtype == 'object']
for col in train_object_cols:
    train_df[col] = LabelEncoder().fit_transform(list(train_df[col].values))

In [None]:
# Run the test frame through datasist's categorical and then numerical missing-value helpers.
# NOTE(review): fill strategies are defined by datasist — confirm they match what was applied to train_df.
test_df = ds.feature_engineering.fill_missing_cats(test_df)
test_df = ds.feature_engineering.fill_missing_num(test_df)

In [None]:
# Integer-encode every object-typed column of the test frame, one fresh encoder per column.
# NOTE(review): these encoders are fit on test_df alone, so the integer assigned to a
# category is not guaranteed to match the code that category received in train_df.
test_object_cols = [col for col in test_df.columns if test_df[col].dtype == 'object']
for col in test_object_cols:
    test_df[col] = LabelEncoder().fit_transform(list(test_df[col].values))

In [None]:
# Features are everything except the row identifier and the label; the label is 'target'.
y = train_df['target']
x = train_df.drop(columns=['enrollee_id', 'target'])
train_data_cols = x.columns.values.tolist()

for label, part in (("Shape of x :", x), ("Shape of y :", y)):
    print(label, part.shape)

In [None]:
# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
split_parts = train_test_split(x, y, test_size=0.2, random_state=7)
X_train, X_test, y_train, y_test = split_parts

for label, part in (
    ("Shape of X_train :", X_train),
    ("Shape of X_test :", X_test),
    ("Shape of y_train :", y_train),
    ("Shape of y_test :", y_test),
):
    print(label, part.shape)

In [None]:
from imblearn.over_sampling import ADASYN

# Oversample the training split only; the hold-out split keeps its original class balance.
oversample = ADASYN(random_state=42)
X_train_resam, y_train_resam = oversample.fit_resample(X_train, y_train)

# Class counts before and after resampling.
counter, counter_ = Counter(y_train), Counter(y_train_resam)

print('before oversampling',counter)
print('after oversampling' ,counter_)

In [None]:
# Standardize features with statistics learned from the (non-resampled) training split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# The models below are fit on X_train_resam and scored on X_test, so the resampled
# training matrix must go through the same transform; otherwise training happens on
# raw values while evaluation happens on standardized ones.
X_train_resam = scaler.transform(X_train_resam)

## Step III Model Building

In [None]:
from catboost import CatBoostClassifier,Pool

# Baseline CatBoost classifier trained on the oversampled training data.
model = CatBoostClassifier(learning_rate=0.03)

# Silence per-iteration logging but render CatBoost's training plot.
fit_options = {'verbose': False, 'plot': True}
model.fit(X_train_resam, y_train_resam, **fit_options)


In [None]:
from sklearn.metrics import f1_score,roc_auc_score

# Hard class labels are what the F1 score needs.
y_pred_cat = model.predict(X_test)
# ROC AUC measures how well the scores rank positives above negatives, so it must be
# given the positive-class probability; with hard 0/1 labels the curve collapses to a
# single operating point.
y_proba_cat = model.predict_proba(X_test)[:, 1]

print('F1 Score',f1_score(y_test, y_pred_cat,average = 'weighted'))
roc_score = roc_auc_score(y_test,y_proba_cat)
print("ROC AUC Score - ",roc_score)

## Step IV Hyperparameter Search with Optuna

In [None]:
!pip install optuna -q

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: train one CatBoost configuration and return its hold-out ROC AUC.

    Args:
        trial: Optuna trial used to sample ``iterations``, ``learning_rate``,
            ``max_depth`` and ``l2_leaf_reg``.

    Returns:
        ROC AUC on ``(X_test, y_test)``, computed from positive-class
        probabilities. Higher is better.

    Notes:
        * Reads the notebook globals ``X_train_resam``, ``y_train_resam``,
          ``X_test`` and ``y_test``.
        * The same hold-out split serves as CatBoost's ``eval_set`` and as the
          scoring set, so the returned value is an optimistic estimate.
        * ``iterations`` is capped at 100, so an early-stopping patience of 100
          rounds cannot cut training short.
    """
    param = {
        'custom_loss': 'AUC',
        'random_state': 101,
        'iterations' : trial.suggest_int('iterations', 1, 100),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.001,0.008,0.01,0.02,0.03,0.04,0.05]),
        # Sampled under the name 'max_depth', which is the key study.best_params will carry.
        'depth': trial.suggest_categorical('max_depth', [1,2,3,4,5]),
        'l2_leaf_reg' : trial.suggest_int('l2_leaf_reg', 1, 10),
    }
    model = CatBoostClassifier(**param)

    model.fit(X_train_resam,y_train_resam,eval_set=[(X_test,y_test)],early_stopping_rounds=100,verbose=False)

    # ROC AUC is a ranking metric: score it on the positive-class probability rather
    # than on hard 0/1 predictions, which would reduce the curve to a single point.
    positive_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test,positive_proba)

    return auc

In [None]:
# The objective returns ROC AUC, where larger is better, so the study must maximize it;
# minimizing would select the worst-performing trial as "best".
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Keep the best trial's sampled hyperparameters and show them as the cell output.
tuned_params=study.best_params   
tuned_params

In [None]:
# Refit a CatBoost classifier on the oversampled training data using the best sampled hyperparameters.
tunedmodel = CatBoostClassifier(**tuned_params)

quiet_fit = {'verbose': False}
tunedmodel.fit(X_train_resam, y_train_resam, **quiet_fit)

In [None]:
# Hard class labels for the F1 score.
y_pred_tuned = tunedmodel.predict(X_test)
# Positive-class probabilities for ROC AUC (a ranking metric; hard labels would
# reduce the curve to a single operating point).
y_proba_tuned = tunedmodel.predict_proba(X_test)[:, 1]

print('F1 Score',f1_score(y_test, y_pred_tuned,average = 'weighted'))
# `average` is ignored by roc_auc_score for binary targets, so it is not passed.
roc_score = roc_auc_score(y_test,y_proba_tuned)
print("ROC AUC Score - ",roc_score)

<img src = "https://media.giphy.com/media/WvENXbe6sHAfC/giphy.gif">