In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory, then print them one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data Loading

In [None]:
# Read the two dataset CSVs into DataFrames.
train = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
# NOTE(review): in the visible notebook `test` is only referenced from commented-out code.
test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')

# 2. Data Preprocessing

## 2.1 Missing Value

### Count missing values by feature

In [None]:
# Per-column null counts (largest first) as a two-column table, plus the
# fraction of rows that are missing for each column.
missing = (
    train.isnull()
    .sum()
    .sort_values(ascending=False)
    .rename_axis('features')
    .reset_index(name='missing_num')
    .assign(percentage=lambda table: table['missing_num'] / train.shape[0])
)
missing

In [None]:
# Report how many columns have any missing values and how many are more than 30% missing.
# NOTE(review): the total of 13 is hard-coded in the message — confirm it matches the dataset.
n_with_missing = int((missing['missing_num'] > 0).sum())
n_over_30pct = int((missing['percentage'] > 0.3).sum())
print(f'Among a total of 13 features, {n_with_missing} features contain missing values.')
print(f'And {n_over_30pct} features contain over 30% missing values.')

### Remove features that have too many missing values

In [None]:
# Drop every column that is more than 30% missing, i.e. keep columns with at
# least 70% non-null values.
# `thresh` is the required *count* of non-null values, so round the 70% row
# count up to an integer instead of passing a float.
thr = int(np.ceil(0.7 * train.shape[0]))
# .copy() makes train2 an independent frame, so the later in-place `.loc`
# assignments on it are unambiguous and never touch `train`.
train2 = train.dropna(thresh=thr, axis=1).copy()
# Remaining null counts per column, largest first.
train2.isnull().sum().sort_values(ascending=False).reset_index()

### Fill in values for remaining features 

In [None]:
train2.dtypes

Since the features with missing values are all categorical, we can fill them in either with the mode or with a predictive model (such as a random forest). Here, we simply use the mode.

In [None]:
# Fill the nulls of each listed column with that column's most frequent value
# (the first entry of value_counts(), which is sorted by descending frequency).
mode_imputed_cols = [
    'gender',
    'major_discipline',
    'education_level',
    'last_new_job',
    'enrolled_university',
    'experience',
]
for col in mode_imputed_cols:
    most_frequent = train2[col].value_counts().index[0]
    is_null = train2[col].isnull()
    train2.loc[is_null, col] = most_frequent

In [None]:
train2.isnull().sum().sort_values(ascending=False).reset_index()

## 2.2 Outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
train2.dtypes

In [None]:
# Horizontal box plot of city_development_index, drawn on an explicit Axes.
_, cdi_ax = plt.subplots()
cdi_ax.boxplot(train2['city_development_index'], vert=False)
plt.show()

In [None]:
# Raise every city_development_index value below the lower fence
# (Q1 - 1.5 * IQR) up to the fence itself.
cdi = train2['city_development_index']
q1, q3 = cdi.quantile(0.25), cdi.quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
is_low_outlier = cdi < lower_fence
train2.loc[is_low_outlier, 'city_development_index'] = lower_fence

In [None]:
# Horizontal box plot of training_hours, drawn on an explicit Axes.
_, hours_ax = plt.subplots()
hours_ax.boxplot(train2['training_hours'], vert=False)
plt.show()

In [None]:
# Cap training_hours at the upper fence (Q3 + 1.5 * IQR).
# The mask must select the HIGH side: testing the lower fence while writing a
# large replacement value would leave high outliers untouched and turn any
# matching low value into a high one. Capping at the computed fence also
# replaces the hard-coded 200.
hours = train2['training_hours']
q1, q3 = hours.quantile(0.25), hours.quantile(0.75)
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr
# clip() leaves values at or below the fence (and NaN) unchanged.
train2['training_hours'] = hours.clip(upper=upper_fence)

In [None]:
train2.head()

# 3. Feature Engineering - Encoding

- One-hot encoding for gender, enrolled_university, major_discipline (nominal)
- Hash encoding for city (to deal with its high cardinality)
- Label encoding for relevent_experience, education_level, experience, last_new_job (ordinal)


More on Encoding: https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/

In [None]:
# One-hot encode the nominal columns: each listed column is replaced by one
# indicator column per category.
nominal_cols = ['gender', 'enrolled_university', 'major_discipline']
train2 = pd.get_dummies(train2, columns=nominal_cols)
# Test-set counterpart (currently disabled):
#ohe_test = pd.get_dummies(test, columns=['gender','enrolled_university','major_discipline'])
#train2,test = train2.align(ohe_test,join='left',axis=1)

In [None]:
#hash encoding
import category_encoders as ce
# Hash-encode `city`, then swap the original column for the encoder's output columns.
encoder_city = ce.HashingEncoder(cols=['city'])
city_he = encoder_city.fit_transform(train2['city'], train2['target'])
train2 = pd.concat(
    [train2.drop(columns=['city']), city_he],
    axis=1,
)

#city_he_test = encoder_city.transform(test['city'], test['target'])
#test = test.drop(columns=['city'])
#test = pd.concat([test, city_he_test], axis=1)

In [None]:
#label encoding
from sklearn.preprocessing import LabelEncoder as le
from collections import defaultdict
# `d` lazily creates one LabelEncoder per column name and keeps the fitted
# encoders so they can be reused later.
d = defaultdict(le)
ordinal_cols = ['relevent_experience', 'education_level', 'experience', 'last_new_job']


def _fit_encode(column):
    """Fit this column's own LabelEncoder (stored in `d`) and return the integer codes."""
    return d[column.name].fit_transform(column)


# NOTE(review): LabelEncoder numbers classes in sorted order of their values,
# which is not necessarily the domain ordering of these columns — confirm this
# is acceptable for features treated as ordinal.
le_train = train2[ordinal_cols].apply(_fit_encode, axis=0)
#le_test = test[['relevent_experience','education_level','experience','last_new_job']].apply(lambda x: d[x.name].transform(x) if type(x) == str else x)

In [None]:
# Replace the four raw label columns with their integer-encoded versions from le_train.
raw_label_cols = ['relevent_experience', 'education_level', 'experience', 'last_new_job']
train2_without_raw = train2.drop(columns=raw_label_cols)
train2 = pd.concat([train2_without_raw, le_train], axis=1)

#test = test.drop(columns=['relevent_experience','education_level','experience','last_new_job'])
#test = pd.concat([test, le_test], axis=1)

In [None]:
train2.head()

# 4. Oversampling Using SMOTE

In [None]:
from collections import Counter #summerize class distribution
from imblearn.over_sampling import SMOTE

# Split predictors from the label. enrollee_id is dropped as well
# (presumably a row identifier rather than a feature — confirm).
X = train2.drop(columns=['target', 'enrollee_id'])
y = train2['target']

# Summarize class distribution: before
counter = Counter(y)
print(counter)

# Oversample with SMOTE.
# `fit_resample` is the supported imbalanced-learn method; `fit_sample` was a
# deprecated alias that newer releases no longer provide.
# NOTE(review): resampling is done before the train/test split further down,
# so synthetic rows interpolated from future test rows can land in the
# training set and the test set itself contains synthetic rows. Reported test
# scores are likely optimistic; consider splitting first and resampling only
# the training portion.
smt = SMOTE(random_state=42)
X, y = smt.fit_resample(X, y)

# Summarize class distribution: after
counter = Counter(y)
print(counter)

# 5. Modeling

In [None]:
# Hold out 20% of the (already SMOTE-resampled) X, y as a test set; the seed is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#create models
from sklearn.metrics import confusion_matrix,accuracy_score

def Model(model, X_train, X_test, y_train, y_test, title):
    """Fit `model` on the training split and print its scores on both splits.

    Prints, in this order: training accuracy, test accuracy, training
    confusion matrix, test confusion matrix. Each line is prefixed with `title`.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`; it is fitted in place.
    X_train, y_train : features and labels used for fitting.
    X_test, y_test : features and labels used for the held-out evaluation.
    title : str label prepended to every printed line.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)

    train_pred, test_pred = model.predict(X_train), model.predict(X_test)

    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)
    train_matrix = confusion_matrix(y_train, train_pred)
    test_matrix = confusion_matrix(y_test, test_pred)

    print(title + ' - training set - accuracy score: ', train_accuracy)
    print(title + ' - test set - accuracy score: ', test_accuracy)
    print(title + ' - training set - confusion matrix: \n', train_matrix)
    print(title + ' - test set - confusion matrix: \n', test_matrix)
  
    
#find important features

def ImportantFeatures(model, X=None, y=None):
    """Fit `model` and return its feature importances, most important first.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and, once fitted, `feature_importances_`.
        It is fitted in place.
    X : DataFrame of features, optional. Defaults to the notebook-level `X_train`.
    y : labels aligned with `X`, optional. Defaults to the notebook-level `y_train`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features' and 'Importance', sorted by 'Importance' in
        descending order with a fresh 0..n-1 index.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    model.fit(X, y)
    imp = pd.DataFrame({
        'Features': X.columns.values,
        'Importance': model.feature_importances_,
    })
    # sort_values returns a new frame, so its result has to be the one returned.
    return imp.sort_values(by='Importance', ascending=False).reset_index(drop=True)

## 5.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (lbfgs solver, generous iteration cap, fixed seed).
log_reg = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=42)
Model(log_reg, X_train, X_test, y_train, y_test, 'Logistic Regression w/ SMOTE')

## 5.2 SVM

In [None]:
from sklearn.svm import SVC
# Support vector classifier with library-default settings and a fixed seed.
svm_clf = SVC(random_state=42)
Model(svm_clf, X_train, X_test, y_train, y_test, 'SVM w/ SMOTE')

## 5.3 GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with library-default settings.
gnb_clf = GaussianNB()
Model(gnb_clf, X_train, X_test, y_train, y_test, 'GaussianNB w/ SMOTE')

## 5.4 KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with library-default settings.
knn_clf = KNeighborsClassifier()
Model(knn_clf, X_train, X_test, y_train, y_test, 'KNN w/ SMOTE')

## 5.5 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree. The seed is fixed (like the other seeded models
# in this notebook) because scikit-learn permutes candidate features at each
# split, so an unseeded tree can differ between runs.
tree_clf = DecisionTreeClassifier(max_depth=8, random_state=42)
Model(tree_clf, X_train, X_test, y_train, y_test, 'Decision Tree w/ SMOTE')

In [None]:
# Seeded so the reported importances are reproducible across runs.
# NOTE(review): this tree uses max_depth=14 while the tree evaluated in the
# previous cell uses max_depth=8 — confirm which depth is intended.
ImportantFeatures(DecisionTreeClassifier(max_depth=14, random_state=42))

## 5.6 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest: 4000 trees, depth capped at 10, 8 candidate features per split, fixed seed.
forest_clf = RandomForestClassifier(
    max_features=8,
    n_estimators=4000,
    max_depth=10,
    random_state=42,
)
Model(forest_clf, X_train, X_test, y_train, y_test, 'RandomForest w/ SMOTE')

In [None]:
# Feature importances from a fresh forest with the same hyper-parameters as the evaluated one.
# ImportantFeatures fits the model it is given, so this trains the 4000 trees again.
importance_forest = RandomForestClassifier(
    max_features=8,
    n_estimators=4000,
    max_depth=10,
    random_state=42,
)
ImportantFeatures(importance_forest)

## 5.7 XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost) with library-default settings and a fixed seed.
xgb_clf = XGBClassifier(random_state=42)
Model(xgb_clf, X_train, X_test, y_train, y_test, 'XGBoost w/ SMOTE')

# 6. Conclusion

## Top 3 Models

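1. XGBoost (training accuracy: ~87%, test accuracy: ~83%)
2. Logistic Regression (training/test accuracy: ~82%)
3. Random Forest (training accuracy: ~84%, test accuracy: ~81%)

Note: these accuracies are measured on a train/test split taken *after* SMOTE oversampling, so the test set also contains synthetic samples; performance on untouched data may be lower.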


## The Most Important Factor - city_development_index (Based on Random Forest)


=================================

### Key Takeaway

- how to handle missing values 
- categorical encoding (https://www.analyticsvidhya.com/blog/2020/08/types-of-categorical-data-encoding/)
- deal with unbalanced data (SMOTE: https://towardsdatascience.com/5-smote-techniques-for-oversampling-your-imbalance-data-b8155bdbe2b5#:~:text=Borderline%2DSMOTE%20is%20a%20variation,boundary%20between%20the%20two%20classes.)


Lastly, huge thanks to Huynh Dong Nguyen's notebook! Learned a lot from it!