# Import Libs and Data

In [None]:
!pip install pivottablejs

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from pivottablejs import pivot_ui

In [None]:
# Load the training set from the Kaggle input directory and preview the first rows.
data = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
data.head()

In [None]:
data.info()

In [None]:
data.isna().sum()*100/len(data)

In [None]:
data.shape

# Data Cleaning

In [None]:
data.columns

## enrollee_id

In [None]:
data['enrollee_id'].nunique()

In [None]:
# enrollee_id is only a row identifier, so it is removed from the frame.
data = data.drop(columns=['enrollee_id'])

## city

In [None]:
# Keep only the integer that follows the first underscore in each city code.
data['city'] = [int(city_code.split('_')[1]) for city_code in data['city']]

## gender

In [None]:
data.gender.isna().sum()

In [None]:
data.gender.value_counts()

In [None]:
# Replace missing gender values with the constant 'Male'.
data['gender'] = data['gender'].fillna('Male')

## enrolled_university

In [None]:
data.enrolled_university.isna().sum()

In [None]:
data.enrolled_university.value_counts()

In [None]:
# Replace missing enrolment values with the constant 'no_enrollment'.
data['enrolled_university'] = data['enrolled_university'].fillna('no_enrollment')

## education_level

In [None]:
data.education_level.isna().sum()

In [None]:
data.education_level.value_counts()

In [None]:
# Forward-fill missing education levels from the previous row.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling;
# as before, a missing value in the very first row stays missing.
data.education_level = data.education_level.ffill()

## major_discipline

In [None]:
data.major_discipline.isna().sum()

In [None]:
data.major_discipline.value_counts()

In [None]:
# Replace missing major_discipline values with the constant 'STEM'.
data['major_discipline'] = data['major_discipline'].fillna('STEM')

## experience

In [None]:
data.experience.isna().sum()

In [None]:
data.experience.value_counts()

In [None]:
# Forward-fill missing experience values from the previous row.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
data.experience = data.experience.ffill()

## company_size

In [None]:
data.company_size.isna().sum()

In [None]:
data.company_size.value_counts()

In [None]:
# Forward-fill missing company sizes from the previous row.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling;
# a missing value in the very first row has no predecessor and stays missing.
data.company_size = data.company_size.ffill()

In [None]:
# Set company_size of the first row to '50-99' (forward fill cannot reach row 0).
# The column is resolved by name instead of the magic position 8, so the
# assignment keeps targeting company_size even if the column order changes.
data.iloc[0, data.columns.get_loc('company_size')] = '50-99'

In [None]:
data.company_size.isna().sum()

## company_type

In [None]:
data.company_type.unique()

In [None]:
data.company_type.value_counts()

In [None]:
# Replace missing company types with the constant 'Pvt Ltd'.
data['company_type'] = data['company_type'].fillna('Pvt Ltd')

## last_new_job

In [None]:
data.last_new_job.unique()

In [None]:
data.last_new_job.value_counts()

In [None]:
# Backward-fill missing last_new_job values from the next row.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling;
# as before, a missing value in the very last row stays missing.
data.last_new_job = data.last_new_job.bfill()

# EDA

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
# Pairwise scatter/density plots, coloured by the target class.
sns.pairplot(data, hue='target', markers = ['s', 'o'])
# plt.show() renders the grid; the previous plt.plot() only drew an empty line.
plt.show()

In [None]:
# Clustered heatmap of the pairwise correlations between numeric columns.
# Object columns are excluded explicitly: recent pandas versions raise on
# DataFrame.corr() when non-numeric columns are present.
# clustermap creates its own figure, so the size is passed to it directly
# instead of opening a separate (and otherwise empty) plt.figure().
sns.clustermap(data.select_dtypes(include='number').corr(), annot=True, figsize=(10, 8))
plt.show()

In [None]:
# Bar chart of how many rows fall into each target class.
target_counts = data.target.value_counts()
fx = list(target_counts.index)
fy = target_counts.values

fig = px.bar(x=fx, y=fy)
fig.show()

In [None]:
def pie_chart(data,col):
    """Show a plotly pie chart of the value distribution of one column.

    Args:
        data: DataFrame holding the column.
        col: Name of the column to summarise; also used as the chart title.

    Each slice is the count of a distinct value expressed as a percentage of
    the total number of rows in ``data[col]``.
    """
    counts = data[col].value_counts()
    percentages = counts * 100 / len(data[col])

    fig = px.pie(
        values=list(percentages.values),
        names=list(counts.index),
        color_discrete_sequence=['darkcyan', 'lawngreen'],
        title=col,
    )
    fig.show()

In [None]:
# Names of the columns that are still stored as objects (categorical features).
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
print(categorical_columns)

In [None]:
# Draw one pie chart per categorical column.
for col in categorical_columns:
    pie_chart(data,col)

# Feature Engineering

In [None]:
# Names of every column that is not stored as an object (numeric features).
numeric_columns = data.select_dtypes(exclude=['object']).columns.tolist()
print(numeric_columns)

In [None]:
# Distribution of training_hours with a KDE and a fitted normal curve.
# sns.distplot is deprecated (and removed from newer seaborn releases), so the
# same picture is built from histplot plus an explicit normal fit.
hours = data['training_hours']
mu, sigma = norm.fit(hours)
grid = np.linspace(hours.min(), hours.max(), 200)

plt.figure(figsize=(8,6))
sns.histplot(hours, kde=True, stat='density')
plt.plot(grid, norm.pdf(grid, mu, sigma), color='k')  # fitted normal density
plt.title('skew: {}'.format(skew(hours)))

In [None]:
# Probability plot of training_hours (scipy.stats.probplot draws onto pyplot).
plt.figure(figsize = (8,6))
stats.probplot(data['training_hours'], plot = plt)
plt.show()

In [None]:
# Box-Cox transform of training_hours (boxcox1p with lambda = 0.15).
# NOTE: the column is overwritten in place, so re-running this cell applies
# the transform a second time.
data['training_hours'] = boxcox1p(data['training_hours'], 0.15)

transformed_hours = data['training_hours']
mu, sigma = norm.fit(transformed_hours)
grid = np.linspace(transformed_hours.min(), transformed_hours.max(), 200)

# Histogram + KDE + fitted normal curve; replaces the deprecated sns.distplot.
plt.figure(figsize=(8,6))
sns.histplot(transformed_hours, kde=True, stat='density')
plt.plot(grid, norm.pdf(grid, mu, sigma), color='k')
plt.show()

# Probability plot on its own explicitly sized figure.
plt.figure(figsize=(8,6))
stats.probplot(transformed_hours, plot = plt)
plt.show()
print('skew: {}'.format(skew(transformed_hours)))

In [None]:
# Box plot of the training_hours column.
fig = px.box(data, y="training_hours")
fig.show()

## Encoding

I don't want to do all the encoding in one for loop because I'm going to use two different encodings (Label Encoding and Ordinal Encoding).

In [None]:
data.head()

### gender

In [None]:
# Label-encode gender: fit the encoder on the column, then replace it with the integer codes.
le = LabelEncoder()
le.fit(data['gender'])
data['gender'] = le.transform(data['gender'])

In [None]:
data

### relevent_experience

In [None]:
# Label-encode relevent_experience with a fresh encoder.
le = LabelEncoder()
le.fit(data['relevent_experience'])
data['relevent_experience'] = le.transform(data['relevent_experience'])

### enrolled_university

In [None]:
# Label-encode enrolled_university with a fresh encoder.
le = LabelEncoder()
le.fit(data['enrolled_university'])
data['enrolled_university'] = le.transform(data['enrolled_university'])

### education_level

In [None]:
data.education_level.unique()

In [None]:
data.education_level.value_counts()

In [None]:
# Education levels listed from lowest to highest; the list position is the ordinal code.
education_order = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd']
ordinal_education_level = {level: rank for rank, level in enumerate(education_order)}
data['education_level'] = data['education_level'].map(ordinal_education_level)

### major_discipline

In [None]:
# Label-encode major_discipline with a fresh encoder.
le = LabelEncoder()
le.fit(data['major_discipline'])
data['major_discipline'] = le.transform(data['major_discipline'])

### experience

In [None]:
data.experience.value_counts()

In [None]:
data.experience.unique()

In [None]:
# Map the experience labels to integers: '<1' -> 0, '1'..'20' -> 1..20, '>20' -> 21.
ordinal_experience = {
    '<1': 0,
    **{str(years): years for years in range(1, 21)},
    '>20': 21,
}

data['experience'] = data['experience'].map(ordinal_experience)

### company_size

In [None]:
data.company_size.value_counts()

In [None]:
# Company-size buckets listed from smallest to largest; the list position is the ordinal code.
company_size_order = ['<10', '10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+']
ordinal_company_size = {bucket: rank for rank, bucket in enumerate(company_size_order)}
data['company_size'] = data['company_size'].map(ordinal_company_size)

### company_type

In [None]:
# Label-encode company_type with a fresh encoder.
le = LabelEncoder()
le.fit(data['company_type'])
data['company_type'] = le.transform(data['company_type'])

### last_new_job

In [None]:
data.last_new_job.value_counts()

In [None]:
# last_new_job labels listed in increasing order; the list position is the ordinal code.
last_new_job_order = ['never', '1', '2', '3', '4', '>4']
ordinal_last_new_job = {label: rank for rank, label in enumerate(last_new_job_order)}
data['last_new_job'] = data['last_new_job'].map(ordinal_last_new_job)

# Over Sampling & Train - Test Split

In [None]:
data.head()

In [None]:
# Separate the label from the features.
y = data.target
X = data.drop('target',axis=1)

# Split BEFORE oversampling. SMOTE synthesises new minority samples by
# interpolating between neighbours; running it on the full data set lets
# synthetic points derived from test rows end up in the training set (data
# leakage) and turns the test set into a partly synthetic, artificially
# balanced one. stratify=y keeps the original class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Oversample the minority class in the training portion only; the test set
# keeps the real class distribution.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

### Scale Data

In [None]:
# Learn the min-max ranges from the training features only, then apply the
# same scaling to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build Model

## Choose Base Model

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb

In [None]:
# Baseline candidates as (display name, unfitted estimator) pairs.
# The 'XGBoost' entry now really builds an XGBClassifier; it previously
# created a second GradientBoostingClassifier under the wrong label.
models = [
    ('Naive Bayes', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state = 42)),
    ('Random Forest', RandomForestClassifier(random_state = 42)),
    ('SVM', SVC(gamma='auto', random_state = 42)),
    ('XGBoost', XGBClassifier(random_state = 42)),
    ("CatBoost", CatBoostClassifier(random_state = 42, verbose = False)),
    ('GradientBoosting', GradientBoostingClassifier(random_state = 42)),
]

# evaluate each model in turn
results = []
names = []

In [None]:
# Fit every candidate on the training split and report its test accuracy.
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("{} : {}".format(name, accuracy_score(y_test, y_pred)))

In [None]:
# LightGBM baseline.
# - The classifier gets its own name: rebinding `lgb` shadowed the lightgbm
#   module, so re-running the cell failed.
# - `n_estimator` was a typo for `n_estimators` and was not applied as intended.
# - `silent` has been removed from LightGBM's scikit-learn API and is dropped.
lgbm_model = lgb.LGBMClassifier(n_estimators=100)
lgbm_model.fit(X_train,y_train)
y_pred = lgbm_model.predict(X_test)
print("LGBM Classifier Accuracy Score : {}".format(accuracy_score(y_test, y_pred)))

## Hyperparameter tuning

I chose CatBoostClassifier; next we will search for its best parameters.

In [None]:
# Candidate hyper-parameter values sampled by the randomized search below.
params = {
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [None]:
# Base estimator for the hyper-parameter search.
# NOTE: despite the variable name `xgb`, this is a CatBoost model.
xgb = CatBoostClassifier(learning_rate=0.02, n_estimators=600, verbose = False)

In [None]:
# Randomized hyper-parameter search, scored by ROC AUC.
folds = 3        # number of stratified CV folds
param_comb = 5   # number of parameter combinations to sample

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# The splitter object is passed directly. The previous skf.split(...) call
# produced a one-shot generator tied to the arrays it was created with; with
# the fixed random_state the folds are the same either way.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=42,
)

random_search.fit(X_train, y_train)

In [None]:
print(random_search.best_params_)

In [None]:
# Train the final CatBoost model with the parameters found by the search.
# Reading random_search.best_params_ (instead of hard-coding subsample and
# max_depth) keeps this cell in sync with the search result when the data,
# the grid or the seed change.
model = CatBoostClassifier(
    learning_rate=0.02,
    n_estimators=600,
    verbose = False,
    **random_search.best_params_
)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

# Inspiration

## Predict the probability that a candidate will work for the company

In [None]:
# Predicted class probabilities for the test set. Note that y_pred is
# overwritten here and no longer holds the hard class predictions.
# NOTE(review): presumably one column per class, with the last column being
# the probability of target == 1 — confirm against model.classes_.
y_pred = model.predict_proba(X_test)
print(y_pred)

## Which features affect a candidate's decision

In [None]:
print(model.feature_importances_)

In [None]:
# Feature-importance table.
importances = model.feature_importances_
# Every column except the label; no longer depends on there being exactly 12
# features in front of `target`.
features = data.columns.drop('target')
imp = pd.DataFrame({'Features': features, 'Importance': importances})

# Accumulate from the MOST important feature downwards and express the result
# as a fraction of the total, so it can be compared with the 0.50 / 0.90 / 0.99
# cut-offs. The cumulative sum was previously taken in column order, before
# sorting, and on the raw (un-normalised) importance scale.
imp = imp.sort_values(by = 'Importance', ascending = False)
imp['Sum Importance'] = imp['Importance'].cumsum() / imp['Importance'].sum()

# Ascending order, so the horizontal bar chart shows the most important feature on top.
imp = imp.iloc[::-1]
imp

In [None]:
# Horizontal bar chart of feature importances with three cut-off lines.
plt.figure(figsize=(12,8))
plt.barh(imp['Features'], imp['Importance'], color = 'g')
# Each line sits at len(imp) - (k + 1.5), where k is the number of features whose
# 'Sum Importance' is below the threshold.
# NOTE(review): this assumes 'Sum Importance' is a 0-1 fraction accumulated from the
# most important feature downwards — confirm against the cell that builds `imp`.
l1 = plt.axhline(len(imp) - (len(imp['Features'][imp['Sum Importance'] < 0.50]) + 1.5), linestyle='-.', color = 'r')
l2 = plt.axhline(len(imp) - (len(imp['Features'][imp['Sum Importance'] < 0.90]) + 1.5), linestyle='--', color = 'r')
l3 = plt.axhline(len(imp) - (len(imp['Features'][imp['Sum Importance'] < 0.99]) + 1.5), linestyle='-', color = 'r')
plt.legend(title = 'Cut-offs of acumulated importance', handles=(l1, l2, l3), labels = ('50%', '90%', '99%'))
plt.title('Feature importance in group assignment')
plt.show()

# Bonus

Interactive Pivot Table

In [None]:
pivot_ui(data)