In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Edit (05-Jan-2021): dealt with the imbalanced data by oversampling -> accuracy, precision, recall, AUC score, and the ROC curve all improved.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
from imblearn.over_sampling import SMOTE

# Notebook-wide configuration.
warnings.simplefilter("ignore")             # silence every warning category for the rest of the session
np.set_printoptions(threshold=sys.maxsize)  # always print NumPy arrays in full, never summarised with "..."
sns.set()                                   # apply seaborn's default theme to all figures

In [None]:
# Load the training set and drop the candidate ID column in one chained step.
data = (
    pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
    .drop(columns='enrollee_id')
)
data.head(3)

- enrollee_id : Unique ID for candidate
- city: City code
- city_development_index: Development index of the city (scaled)
- gender: Gender of candidate
- relevent_experience: Relevant experience of candidate
- enrolled_university: Type of University course enrolled if any
- education_level: Education level of candidate
- major_discipline :Education major discipline of candidate
- experience: Candidate total experience in years
- company_size: No of employees in current employer's company
- company_type : Type of current employer
- last_new_job: Difference in years between previous job and current job
- training_hours: training hours completed

target: 0 – Not looking for job change, 1 – Looking for a job change

In [None]:
# Bar chart of the number of rows in each target class.
fig, ax = plt.subplots()
sns.countplot(x='target', data=data, ax=ax)
plt.show()

## Imbalanced dataset
A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary.

One way to solve this problem is to oversample the examples in the minority class. This can be achieved by simply duplicating examples from the minority class in the training dataset prior to fitting a model. This can balance the class distribution but does not provide any additional information to the model. 

Instead, new samples can be synthesized from the existing samples. This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or SMOTE

SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and creating a new sample at a point along that line.

In [None]:
from sklearn.datasets import make_classification
# Toy illustration of SMOTE on a synthetic two-feature dataset in which roughly
# 99% of the 10,000 samples belong to class 0.
X_ex, y_ex = make_classification(
    n_samples=10000, n_features=2, n_redundant=0,
    n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=111,
)

# Seed SMOTE too: it picks neighbours at random, so without a fixed seed the
# right-hand panel changes on every run even though the input data is seeded.
X_ex_oversampled, y_ex_oversampled = SMOTE(random_state=111).fit_resample(X_ex, y_ex)

# One frame per dataset: columns 0 and 1 are the features, column 2 is the class
# label (stored as float because it is stacked next to the float features).
data_ex = pd.DataFrame(np.column_stack([X_ex, y_ex]))
data_ex_oversampled = pd.DataFrame(np.column_stack([X_ex_oversampled, y_ex_oversampled]))

zero = data_ex[data_ex[2] == 0.0]
one = data_ex[data_ex[2] == 1.0]
zero_oversampled = data_ex_oversampled[data_ex_oversampled[2] == 0.0]
one_oversampled = data_ex_oversampled[data_ex_oversampled[2] == 1.0]

fig, ax = plt.subplots(1, 2, sharey=True)
fig.set_size_inches(13, 5)

# Draw both panels with the same code so they stay visually comparable.
panels = [
    (ax[0], 'Original Data', zero, one),
    (ax[1], 'Oversampled Data', zero_oversampled, one_oversampled),
]
for axis, title, class0, class1 in panels:
    axis.scatter(class0[0], class0[1], label='class 0')
    axis.scatter(class1[0], class1[1], label='class 1')
    axis.set_title(title)
    axis.set_xlabel('feature_0')
ax[0].set_ylabel('feature_1')  # the y axis is shared, so one label is enough

# Single legend, anchored outside the right-hand panel.
ax[1].legend(bbox_to_anchor=(1.3, 1))
plt.show()

"SMOTE first selects a minority class instance a at random and finds its k nearest minority class neighbors. The synthetic instance is then created by choosing one of the k nearest neighbors b at random and connecting a and b to form a line segment in the feature space. The synthetic instances are generated as a convex combination of the two chosen instances a and b."

[reference](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

## Keep the class imbalance in mind; we'll deal with it later.

**First, I'll separate the categorical and numerical columns into two DataFrames named `category` and `numeric`, respectively.**

In [None]:
# Partition the columns by dtype: object columns go to `category`, all other
# columns (including `target`) go to `numeric`.
category = data.select_dtypes(include=['object'])
numeric = data.select_dtypes(exclude=['object'])

# 1) Missing values

In [None]:
# Fraction of missing values per column (0 = complete, 1 = entirely missing).
missing = data.isnull().mean().to_frame(name='Missing')

# Shade each cell green in proportion to its missing ratio.
cm = sns.light_palette("green", as_cmap=True)
missing.style.background_gradient(cmap=cm)

### Deal with NaN in categorical features.

In [None]:
# Treat "missing" as its own category: every NaN becomes the placeholder string 'No'.
category_notNull = category.fillna(value='No')

### Clean the data a little bit.

In [None]:
# Rewrite the slash-formatted size bucket '10/49' as '10-49'.
category_notNull['company_size'] = category_notNull['company_size'].replace({'10/49': '10-49'})

# 2) Encode categorical features

## 2.1) Ordinal encode

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order for each ordinal column, lowest to highest. The order of
# the inner lists must match the column order of `category_notNull_ordinal` below.
# The leading 'No' entry in every list is therefore encoded as 0.
# NOTE(review): 'never' is ranked above '>4' for last_new_job — confirm this is intended.
ordinal_categories = [
    ['No', 'Primary School',  'High School', 'Graduate', 'Masters', 'Phd'],                           # education_level
    'No,<1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,>20'.split(','),                        # experience
    ['No','<10', '10-49', '50-99', '100-500', '500-999' ,'1000-4999', '5000-9999', '10000+'],         # company_size
    ['No','1', '2', '3', '4', '>4', 'never'],                                                         # last_new_job
]

# `categories` has to be passed by keyword: the encoder's constructor arguments are
# keyword-only, so the positional form raises a TypeError on current scikit-learn.
Ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)

category_notNull_ordinal = category_notNull[['education_level', 'experience', 'company_size', 'last_new_job']]

category_notNull_ordinalEncoded = Ordinal_encoder.fit_transform(category_notNull_ordinal)

## 2.2) One hot encode

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Every categorical column that is not ordinal-encoded gets one-hot encoded.
one_how_columns = [ col for col in category_notNull.columns if col not in ['education_level', 'experience', 'company_size', 'last_new_job']]

# Keep the encoder's default (sparse) output and densify explicitly. The `sparse=`
# keyword was renamed to `sparse_output=` in scikit-learn 1.2 and later removed, so
# neither spelling works on every version, whereas `.toarray()` always does.
ohe = OneHotEncoder().fit(category_notNull.loc[:, one_how_columns])

category_notNull_onehotEncoded = ohe.transform(category_notNull.loc[:, one_how_columns]).toarray()

# 3) Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Assemble the feature matrix: numeric columns first, then the one-hot columns,
# then the ordinal columns.
category_preprocessed = np.concatenate([category_notNull_onehotEncoded, category_notNull_ordinalEncoded], axis=1)

X = np.concatenate([numeric.drop('target', axis=1).values, category_preprocessed], axis=1)
y = numeric['target'].values

# Split BEFORE oversampling. Running SMOTE on the full data first leaks information:
# synthetic training points are interpolated from rows that end up in the test set,
# and the test set itself becomes partly synthetic and artificially balanced, which
# inflates every metric. The test split must keep the real class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Balance the classes in the training portion only (seeded for reproducibility).
# `X` and `y` are intentionally left un-resampled.
# NOTE(review): SMOTE also interpolates the encoded categorical columns, producing
# fractional values; a categorical-aware sampler may be a better fit — confirm.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# 4) Scale

In [None]:
# Show the top-left 5x5 corner of the training matrix.
X_train[0:5, 0:5]

The first two columns are numerical data; the others are encoded categorical data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise only the first two columns; all remaining columns are copied through
# unchanged. The scaler is fitted on the training split alone and then reused for
# the test split.
numeric_cols = slice(None, 2)
scaler = StandardScaler()
scaler.fit(X_train[:, numeric_cols])

X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[:, numeric_cols] = scaler.transform(X_train[:, numeric_cols])
X_test_scaled[:, numeric_cols] = scaler.transform(X_test[:, numeric_cols])

# 5) Building some models. <br>
I'll use the default parameters.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, roc_curve 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Candidate classifiers, each constructed with its default arguments.
# Every value is a one-element list, so a model is reached as estimators[name][0].
estimators = {
    label: [model]
    for label, model in [
        ('Logistic Regression', LogisticRegression()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier()),
        ('Gradient Boost', GradientBoostingClassifier()),
        ('XG Boost', XGBClassifier()),
    ]
}


def mfit(estimators, X_train, y_train):
    """Fit every model in ``estimators`` on the given training data.

    ``estimators`` maps a display name to a list whose first element is the
    model; each model is fitted in place and a "<name> fitted" line is printed
    after it finishes. Nothing is returned.
    """
    for name, models in estimators.items():
        models[0].fit(X_train, y_train)
        print(name + ' fitted')

# Train all candidate models on the scaled training split.
mfit(estimators, X_train=X_train_scaled, y_train=y_train)

# 6) Let's predict!

In [None]:

def mpredict(estimators, X_test, y_test):
    """Evaluate every fitted estimator on held-out data.

    Parameters
    ----------
    estimators : dict
        Maps a display name to a list whose first element is a fitted classifier.
    X_test : array-like
        Feature matrix to predict on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    outcome : dict
        ``name -> [y_pred, confusion_matrix, classification_report]``.
    r_a_score : dict
        ``name -> ROC AUC score``.
    """
    outcome = dict()
    r_a_score = dict()
    for name, models in estimators.items():
        model = models[0]
        y_pred = model.predict(X_test)

        # ROC AUC is a ranking metric and needs continuous scores. Feeding it hard
        # 0/1 predictions collapses the curve to a single threshold and understates
        # the model. Column 1 of predict_proba is taken as the positive class
        # (assumes a binary 0/1 target).
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            y_score = model.decision_function(X_test)
        else:
            y_score = y_pred  # last resort: no score available
        r_a_score[name] = roc_auc_score(y_test, y_score)

        # scikit-learn metrics take (y_true, y_pred) in that order. Passing them
        # swapped transposes the confusion matrix and exchanges precision and
        # recall in the report.
        outcome[name] = [
            y_pred,
            confusion_matrix(y_test, y_pred),
            classification_report(y_test, y_pred),
        ]
    return outcome, r_a_score

# Score all fitted models on the scaled test split.
outcome, r_a_score = mpredict(estimators, X_test=X_test_scaled, y_test=y_test)

In [None]:
# Print each model's confusion matrix followed by its classification report.
banner = '------------------------'
for name, results in outcome.items():
    print(banner + name + banner)
    print(results[1])  # confusion matrix
    print(results[2])  # classification report

In [None]:
# Print the ROC AUC score of every model under a name banner.
print('roc_auc_score')
for name, score in r_a_score.items():
    print('------------------------' + name + '------------------------')
    print(score)


# Looking at the ROC curve

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(13, 6)

# One ROC curve per model, built from the second column of predict_proba
# on the scaled test split.
for name, models in estimators.items():
    proba = models[0].predict_proba(X_test_scaled)
    fpr, tpr, _ = roc_curve(y_test, proba[:, 1])
    ax.plot(fpr, tpr, label=name)

ax.set_xlabel('False-Positive rate')
ax.set_ylabel('True-Positive rate')
ax.legend()
plt.show()