# Prediction of Data Scientists' Job Change

1. Binary Classification Problem: Predict "Change" or "Not Change"
2. The data contains a lot of missing values, and the target variable is imbalanced.
3. Apply various algorithms, combined with different methods of handling missing values and of rebalancing the data.
  - Methods for Handling missing data: 1) Dropping, 2) Mode imputation, 3) KNN imputation, 4) No imputation
  - Data Rebalancing a) Imbalanced data, b) SMOTE, c) Scale Pos Weight  
4. Algorithm:
 - Logistic Regression (1, 2, 3, a, b)
 - Random Forest (2, 3, a, b)
 - XGBoost (2, 3, 4, a, b, c)
 - LightGBM (4, a, c)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        # Print the full path of every file found below the input directory
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load and Explore Dataset

In [None]:
# Import the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
sns.set(color_codes=True)
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training and the test CSV files (in that order) from the Kaggle input directory
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv",
        "../input/hr-analytics-job-change-of-data-scientists/aug_test.csv",
    )
)

# Preview the first 5 rows of the training data
df_train.head()

In [None]:
# Report (rows, columns) of both datasets
for shape_label, frame in (("train shape: ", df_train), ("test shape : ", df_test)):
    print(shape_label, frame.shape)

In [None]:
# Descriptive statistics of the training data; as the last expression of the
# cell, the resulting table is rendered by the notebook
df_train.describe()

In [None]:
# Print each column's dtype and non-null count for the training data
# (a non-null count below the number of rows means the column has missing values)
df_train.info()

- "gender","enrolled_university","major_discipline","experience","company_size","company_type","last_new_job" contain missing values.
- The majority of the variables are categorical, so they need to be encoded.

## Data Preprocessing

In [None]:
# Feature columns used for modelling; each of them is passed through the
# encoding step below
columns = [
    'city', 'city_development_index', 'gender',
    'relevent_experience', 'enrolled_university', 'education_level',
    'major_discipline', 'experience', 'company_size',
    'company_type', 'last_new_job', 'training_hours',
]

In [None]:
# Encode categorical variables

# Import package
from sklearn.preprocessing import OrdinalEncoder

# Instantiate encoder
# A single shared instance: encode() below re-fits it on every column it is called with
encoder = OrdinalEncoder()

# Define a function for encoding
def encode(train_data, test_data):
    '''Ordinal-encode the non-null entries of one train/test column pair.

    The module-level ``encoder`` is re-fitted on the non-null training values
    and then applied to the non-null test values. Missing entries are left
    as they are so that they can be dropped or imputed later.

    Parameters
    ----------
    train_data : pandas.Series
        Column taken from the training data.
    test_data : pandas.Series
        The same column taken from the test data.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as encoded copies; the Series passed in
        are not modified.

    Note: ``encoder.transform`` is called with whatever values the test
    column holds, including values that never occur in the training column.
    '''
    # Work on copies so that the caller's Series (typically a column of a
    # DataFrame) is never written to in place
    train_data = train_data.copy()
    test_data = test_data.copy()

    train_mask = train_data.notnull()
    test_mask = test_data.notnull()

    # Nothing to fit on when the training column has no values at all
    if not train_mask.any():
        return train_data, test_data

    # The encoder expects a 2-D array, hence the reshape to one column
    train_values = np.array(train_data[train_mask]).reshape(-1, 1)
    train_data.loc[train_mask] = encoder.fit_transform(train_values).ravel()

    # Skip the transform when the test column is entirely missing
    if test_mask.any():
        test_values = np.array(test_data[test_mask]).reshape(-1, 1)
        test_data.loc[test_mask] = encoder.transform(test_values).ravel()

    return train_data, test_data

# Encode every modelling column of both datasets, one column at a time
for column in columns:
    encoded_pair = encode(df_train[column], df_test[column])
    df_train[column] = encoded_pair[0]
    df_test[column] = encoded_pair[1]

In [None]:
# Preview the first rows to check that the encoded columns now hold numeric codes
df_train.head()

The categorical variables were encoded to numeric.

In [None]:
# Count the samples in each target class to check how balanced the target is
df_train.target.value_counts()

- The data is imbalanced.

In [None]:
# Print each column's dtype and non-null count to see which columns are still stored as object
df_train.info()

The data still contains object-typed columns, so let's convert them to float manually.

In [None]:
# Cast the encoded columns listed here to float, one column at a time
for object_col in ('gender', 'enrolled_university', 'education_level', 'major_discipline',
                   'experience', 'company_size', 'company_type', 'last_new_job'):
    df_train[object_col] = df_train[object_col].astype(float)

In [None]:
# Print the dtypes again to verify the float conversion above
df_train.info()

Object variables were converted into float.

# 1. Logistic Regression with Data: Observations with missing values were dropped

- First, let's make prediction using logistic regression.
- Here, let's drop the observations with one or more missing values.

### Keep only rows without missing values

In [None]:
# Drop rows with one or more missing values
# Keep only rows that have no missing values
# axis=0 removes incomplete rows (axis=1 would remove incomplete columns instead)
df_train_nonmis = df_train.dropna(how='any', axis=0)
# info() prints its report itself and returns None, so it is not wrapped in print()
df_train_nonmis.info()

Only 6 variables remain.

In [None]:
# Target is the 'target' column; features are everything except the
# identifier, the city code and the target itself
ynonmis = df_train_nonmis['target']
Xnonmis = df_train_nonmis.drop(columns=['enrollee_id', 'city', 'target'])

# The provided test file has no target, so hold out 30% of the training data
# (stratified on the target) to evaluate the model performance
Xnonmis_train, Xnonmis_test, ynonmis_train, ynonmis_test = train_test_split(
    Xnonmis,
    ynonmis,
    test_size=.30,
    stratify=ynonmis,
    random_state=101,
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both splits
scaler = StandardScaler()
scaler.fit(Xnonmis_train)

Xnonmis_train, Xnonmis_test = scaler.transform(Xnonmis_train), scaler.transform(Xnonmis_test)

### Make Prediction

In [None]:
# Initiate and fit a logistic regression on the data without missing values
nonmis_lm = LogisticRegression()
nonmis_lm.fit(Xnonmis_train, ynonmis_train.ravel())

# Predict the class of the held-out samples
nonmis_lm_pred = nonmis_lm.predict(Xnonmis_test)

# Calculate Accuracy, Precision, Recall, and F1 score
nonmis_lm_accuracy = accuracy_score(ynonmis_test, nonmis_lm_pred)
nonmis_lm_precision = precision_score(ynonmis_test, nonmis_lm_pred)
nonmis_lm_recall = recall_score(ynonmis_test, nonmis_lm_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
nonmis_lm_f1 = f1_score(ynonmis_test, nonmis_lm_pred)

# Calculate AUC score from the predicted probability of the positive class
nonmis_lm_probs = nonmis_lm.predict_proba(Xnonmis_test)[:, 1]
nonmis_lm_auc = roc_auc_score(ynonmis_test, nonmis_lm_probs)

# Display the metrics
print("Logistic Regression: Rows without missing values")
print(" - Accuracy : ",'{:.3f}'.format(nonmis_lm_accuracy))
print(" - Recall   : ",'{:.3f}'.format(nonmis_lm_recall))
print(" - F1 score : ",'{:.3f}'.format(nonmis_lm_f1))
print(" - AUC score: ",'{:.3f}'.format(nonmis_lm_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(ynonmis_test,nonmis_lm_pred))

- The model was trained on only part of the available data, so much information was lost.
- So, next, let's impute missing variables.

# 2. Prediction with Mode Imputation

- Let's impute missing values in a traditional way: since the variables in question are categorical, let's use the mode for imputation.
- As we saw above, "gender","enrolled_university","major_discipline","experience","company_size","company_type","last_new_job" contain missing values. So let's check the distribution of these variables.

## Mode Imputation

In [None]:
# Count plots of the variables that contain missing values
missing_value_cols = ['gender', 'enrolled_university', 'major_discipline', 'experience',
                      'company_size', 'company_type', 'last_new_job']

fig, ax = plt.subplots(4, 2, figsize=(20, 16))
for axis, col in zip(ax.flat, missing_value_cols):
    # Pass the Series as x=: newer seaborn releases treat the first
    # positional argument as `data`, not as `x`
    sns.countplot(x=df_train[col], ax=axis)

# The 4x2 grid has one more cell than there are variables; hide the unused one
for axis in ax.flat[len(missing_value_cols):]:
    axis.set_visible(False)

plt.show()

In [None]:
# Impute missing values with mode
# Work on a real copy: a plain `df_imp = df_train` only aliases the frame, so
# filling in place would also fill df_train and the later sections that start
# from df_train would no longer see any missing values
df_imp = df_train.copy()
# mode() can return several rows when there are ties; row 0 holds the first mode of each column
df_imp = df_imp.fillna(df_imp.mode().iloc[0])
df_imp.info()

Missing values were imputed successfully.

In [None]:
# Replace each nominal variable by its dummy (indicator) columns: the dummies
# are appended on the right and the original column is dropped
for nominal in ("gender", "enrolled_university", "major_discipline", "company_type"):
    indicator_cols = pd.get_dummies(df_imp[nominal], prefix=nominal)
    df_imp = pd.concat([df_imp, indicator_cols], axis=1).drop(columns=[nominal])

In [None]:
# Target is the 'target' column; features are everything except the
# identifier and the target itself
y_imp = df_imp['target']
X_imp = df_imp.drop(columns=['enrollee_id', 'target'])

# The provided test file has no target, so hold out 30% of the training data
# (stratified on the target) to evaluate the model performance
X_imp_train, X_imp_test, y_imp_train, y_imp_test = train_test_split(
    X_imp,
    y_imp,
    test_size=.30,
    stratify=y_imp,
    random_state=101,
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both splits
scaler = StandardScaler()
scaler.fit(X_imp_train)

X_imp_train, X_imp_test = scaler.transform(X_imp_train), scaler.transform(X_imp_test)

## 2.1. Prediction with Imbalanced Data: Mode Imputation

### 2.1.1. Logistic Regression: Mode Imputation - Imbalanced Data

In [None]:
# Initiate and fit a logistic regression on the mode-imputed data
imp_lm = LogisticRegression()
imp_lm.fit(X_imp_train, y_imp_train.ravel())

# Predict the class of the held-out samples
imp_lm_pred = imp_lm.predict(X_imp_test)

# Calculate Accuracy, Precision, Recall, and F1 score
imp_lm_accuracy = accuracy_score(y_imp_test, imp_lm_pred)
imp_lm_precision = precision_score(y_imp_test, imp_lm_pred)
imp_lm_recall = recall_score(y_imp_test, imp_lm_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
imp_lm_f1 = f1_score(y_imp_test, imp_lm_pred)

# Calculate AUC score from the predicted probability of the positive class
imp_lm_probs = imp_lm.predict_proba(X_imp_test)[:, 1]
imp_lm_auc = roc_auc_score(y_imp_test, imp_lm_probs)

# Display the metrics
print("Logistic Regression: Imputation with Mode")
print(" - Accuracy : ",'{:.3f}'.format(imp_lm_accuracy))
print(" - Recall   : ",'{:.3f}'.format(imp_lm_recall))
print(" - F1 score : ",'{:.3f}'.format(imp_lm_f1))
print(" - AUC score: ",'{:.3f}'.format(imp_lm_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_imp_test,imp_lm_pred))

### 2.1.2. Random Forest: Mode Imputation - Imbalanced Data

In [None]:
# Initiate and fit a random forest on the mode-imputed data
imp_rf = RandomForestClassifier()
imp_rf.fit(X_imp_train, y_imp_train.ravel())

# Predict the class of the held-out samples
imp_rf_pred = imp_rf.predict(X_imp_test)

# Calculate Accuracy, Precision, Recall, and F1 score
imp_rf_accuracy = accuracy_score(y_imp_test, imp_rf_pred)
imp_rf_precision = precision_score(y_imp_test, imp_rf_pred)
imp_rf_recall = recall_score(y_imp_test, imp_rf_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
imp_rf_f1 = f1_score(y_imp_test, imp_rf_pred)

# Calculate AUC score from the predicted probability of the positive class
imp_rf_probs = imp_rf.predict_proba(X_imp_test)[:, 1]
imp_rf_auc = roc_auc_score(y_imp_test, imp_rf_probs)

# Display the metrics
print("Random Forest: Imputation with Mode")
print(" - Accuracy : ",'{:.3f}'.format(imp_rf_accuracy))
print(" - Recall   : ",'{:.3f}'.format(imp_rf_recall))
print(" - F1 score : ",'{:.3f}'.format(imp_rf_f1))
print(" - AUC score: ",'{:.3f}'.format(imp_rf_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_imp_test,imp_rf_pred))

### 2.1.3. XGBoost: Mode Imputation - Imbalanced Data

In [None]:
# Initiate and fit an XGBoost classifier on the mode-imputed data
imp_xgb = XGBClassifier()
imp_xgb.fit(X_imp_train, y_imp_train.ravel())

# Predict the class of the held-out samples
imp_xgb_pred = imp_xgb.predict(X_imp_test)

# Calculate Accuracy, Precision, Recall, and F1 score
imp_xgb_accuracy = accuracy_score(y_imp_test, imp_xgb_pred)
imp_xgb_precision = precision_score(y_imp_test, imp_xgb_pred)
imp_xgb_recall = recall_score(y_imp_test, imp_xgb_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
imp_xgb_f1 = f1_score(y_imp_test, imp_xgb_pred)

# Calculate AUC score from the predicted probability of the positive class
imp_xgb_probs = imp_xgb.predict_proba(X_imp_test)[:, 1]
imp_xgb_auc = roc_auc_score(y_imp_test, imp_xgb_probs)

# Display the metrics
print("XGBoost: Imputation with Mode")
print(" - Accuracy : ",'{:.3f}'.format(imp_xgb_accuracy))
print(" - Recall   : ",'{:.3f}'.format(imp_xgb_recall))
print(" - F1 score : ",'{:.3f}'.format(imp_xgb_f1))
print(" - AUC score: ",'{:.3f}'.format(imp_xgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_imp_test,imp_xgb_pred))

## 2.2. Prediction with Mode Imputation & SMOTE

### Apply SMOTE

In [None]:
# The target is imbalanced, so oversample the minority class of the training
# split with SMOTE (the test split is left untouched)
from imblearn.over_sampling import SMOTE
smote = SMOTE()
# fit_resample is the supported method name; the older fit_sample alias is
# no longer available in current imbalanced-learn releases
X_smote_imp_train, y_smote_imp_train = smote.fit_resample(X_imp_train, y_imp_train)

# Check that SMOTE was applied: show the class counts after resampling
y_smote_imp_train.value_counts()

### 2.2.1. Logistic Regression: Mode Imputation - SMOTE

In [None]:
# Initiate and fit a logistic regression on the SMOTE-resampled, mode-imputed training data
smote_imp_lm = LogisticRegression()
smote_imp_lm.fit(X_smote_imp_train, y_smote_imp_train.ravel())

# Predict the class of the (not resampled) held-out samples
smote_imp_lm_pred = smote_imp_lm.predict(X_imp_test)

# Calculate Accuracy, Precision, Recall, and F1 score
smote_imp_lm_accuracy = accuracy_score(y_imp_test, smote_imp_lm_pred)
smote_imp_lm_precision = precision_score(y_imp_test, smote_imp_lm_pred)
smote_imp_lm_recall = recall_score(y_imp_test, smote_imp_lm_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
smote_imp_lm_f1 = f1_score(y_imp_test, smote_imp_lm_pred)

# Calculate AUC score from the predicted probability of the positive class
smote_imp_lm_probs = smote_imp_lm.predict_proba(X_imp_test)[:, 1]
smote_imp_lm_auc = roc_auc_score(y_imp_test, smote_imp_lm_probs)

# Display the metrics
print("Logistic Regression: Imputation with Mode: SMOTE")
print(" - Accuracy : ",'{:.3f}'.format(smote_imp_lm_accuracy))
print(" - Recall   : ",'{:.3f}'.format(smote_imp_lm_recall))
print(" - F1 score : ",'{:.3f}'.format(smote_imp_lm_f1))
print(" - AUC score: ",'{:.3f}'.format(smote_imp_lm_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_imp_test,smote_imp_lm_pred))

### 2.2.2. Random Forest: Mode Imputation - SMOTE

In [None]:
# Initiate and fit a random forest on the SMOTE-resampled, mode-imputed training data
smote_imp_rf = RandomForestClassifier()
smote_imp_rf.fit(X_smote_imp_train, y_smote_imp_train.ravel())

# Predict the class of the (not resampled) held-out samples
smote_imp_rf_pred = smote_imp_rf.predict(X_imp_test)

# Calculate Accuracy, Precision, Recall, and F1 score
smote_imp_rf_accuracy = accuracy_score(y_imp_test, smote_imp_rf_pred)
smote_imp_rf_precision = precision_score(y_imp_test, smote_imp_rf_pred)
smote_imp_rf_recall = recall_score(y_imp_test, smote_imp_rf_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
smote_imp_rf_f1 = f1_score(y_imp_test, smote_imp_rf_pred)

# Calculate AUC score from the predicted probability of the positive class
smote_imp_rf_probs = smote_imp_rf.predict_proba(X_imp_test)[:, 1]
smote_imp_rf_auc = roc_auc_score(y_imp_test, smote_imp_rf_probs)

# Display the metrics
print("Random Forest: Imputation with Mode: SMOTE")
print(" - Accuracy : ",'{:.3f}'.format(smote_imp_rf_accuracy))
print(" - Recall   : ",'{:.3f}'.format(smote_imp_rf_recall))
print(" - F1 score : ",'{:.3f}'.format(smote_imp_rf_f1))
print(" - AUC score: ",'{:.3f}'.format(smote_imp_rf_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_imp_test,smote_imp_rf_pred))

### 2.2.3. XGBoost: Mode Imputation - SMOTE

In [None]:
# Initiate and fit an XGBoost classifier on the SMOTE-resampled, mode-imputed training data
smote_imp_xgb = XGBClassifier()
smote_imp_xgb.fit(X_smote_imp_train, y_smote_imp_train.ravel())

# Predict the class of the (not resampled) held-out samples
smote_imp_xgb_pred = smote_imp_xgb.predict(X_imp_test)

# Calculate Accuracy, Precision, Recall, and F1 score
smote_imp_xgb_accuracy = accuracy_score(y_imp_test, smote_imp_xgb_pred)
smote_imp_xgb_precision = precision_score(y_imp_test, smote_imp_xgb_pred)
smote_imp_xgb_recall = recall_score(y_imp_test, smote_imp_xgb_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
smote_imp_xgb_f1 = f1_score(y_imp_test, smote_imp_xgb_pred)

# Calculate AUC score from the predicted probability of the positive class
smote_imp_xgb_probs = smote_imp_xgb.predict_proba(X_imp_test)[:, 1]
smote_imp_xgb_auc = roc_auc_score(y_imp_test, smote_imp_xgb_probs)

# Display the metrics
print("XGBoost: Imputation with Mode: SMOTE")
print(" - Accuracy : ",'{:.3f}'.format(smote_imp_xgb_accuracy))
print(" - Recall   : ",'{:.3f}'.format(smote_imp_xgb_recall))
print(" - F1 score : ",'{:.3f}'.format(smote_imp_xgb_f1))
print(" - AUC score: ",'{:.3f}'.format(smote_imp_xgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_imp_test,smote_imp_xgb_pred))

# 3. Prediction with KNN Imputation

## KNN Imputation

In [None]:
# Impute missing values with KNN
from sklearn.impute import KNNImputer
from numpy import isnan

# Copy the dataset for KNN imputation (a real copy, so df_train itself is never modified)
df_knn = df_train.copy()

# Separate the dataset into features and target
X_knn = df_knn.drop(['enrollee_id','city','target'],axis=1)
y_knn = df_knn['target']

# Define imputer
imputer = KNNImputer()

# Fit on the dataset and fill in the missing values
X_knn_imputed = imputer.fit_transform(X_knn)

# The features hold the integer codes produced by the ordinal encoding above,
# but KNN fills a gap with an average of the neighbours' codes; round so that
# imputed entries are valid codes again (otherwise get_dummies creates one
# indicator column per fractional value)
X_knn_imputed = np.round(X_knn_imputed)

# Convert back to a dataframe, reusing the original column names and index
# so that the rows stay aligned with y_knn
X_knn = pd.DataFrame(X_knn_imputed, columns=X_knn.columns, index=X_knn.index)
X_knn.info()

In [None]:
# Replace each nominal variable by its dummy (indicator) columns: the dummies
# are appended on the right and the original column is dropped
for nominal in ("gender", "enrolled_university", "major_discipline", "company_type"):
    indicator_cols = pd.get_dummies(X_knn[nominal], prefix=nominal)
    X_knn = pd.concat([X_knn, indicator_cols], axis=1).drop(columns=[nominal])

In [None]:
# Hold out 30% of the KNN-imputed data, stratified on the target, for evaluation
X_knn_train, X_knn_test, y_knn_train, y_knn_test = train_test_split(
    X_knn,
    y_knn,
    test_size=.30,
    stratify=y_knn,
    random_state=101,
)

## 3.1. Prediction with KNN Imputation - Imbalanced Data

### 3.1.1. Logistic Regression: KNN Imputation - Imbalanced Data

In [None]:
# Initiate and fit a logistic regression on the KNN-imputed data
knn_lm = LogisticRegression()
knn_lm.fit(X_knn_train, y_knn_train.ravel())

# Predict the class of the held-out samples
knn_lm_pred = knn_lm.predict(X_knn_test)

# Calculate Accuracy, Precision, Recall, and F1 score
knn_lm_accuracy = accuracy_score(y_knn_test, knn_lm_pred)
knn_lm_precision = precision_score(y_knn_test, knn_lm_pred)
knn_lm_recall = recall_score(y_knn_test, knn_lm_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
knn_lm_f1 = f1_score(y_knn_test, knn_lm_pred)

# Calculate AUC score from the predicted probability of the positive class
knn_lm_probs = knn_lm.predict_proba(X_knn_test)[:, 1]
knn_lm_auc = roc_auc_score(y_knn_test, knn_lm_probs)

# Display the metrics
print("Logistic Regression: KNN Imputation")
print(" - Accuracy : ",'{:.3f}'.format(knn_lm_accuracy))
print(" - Recall   : ",'{:.3f}'.format(knn_lm_recall))
print(" - F1 score : ",'{:.3f}'.format(knn_lm_f1))
print(" - AUC score: ",'{:.3f}'.format(knn_lm_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_knn_test,knn_lm_pred))

### 3.1.2. Random Forest: KNN Imputation - Imbalanced Data

In [None]:
# Initiate and fit a random forest on the KNN-imputed data
knn_rf = RandomForestClassifier()
knn_rf.fit(X_knn_train, y_knn_train.ravel())

# Predict the class of the held-out samples
knn_rf_pred = knn_rf.predict(X_knn_test)

# Calculate Accuracy, Precision, Recall, and F1 score
knn_rf_accuracy = accuracy_score(y_knn_test, knn_rf_pred)
knn_rf_precision = precision_score(y_knn_test, knn_rf_pred)
knn_rf_recall = recall_score(y_knn_test, knn_rf_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
knn_rf_f1 = f1_score(y_knn_test, knn_rf_pred)

# Calculate AUC score from the predicted probability of the positive class
knn_rf_probs = knn_rf.predict_proba(X_knn_test)[:, 1]
knn_rf_auc = roc_auc_score(y_knn_test, knn_rf_probs)

# Display the metrics
print("Random Forest: KNN Imputation")
print(" - Accuracy : ",'{:.3f}'.format(knn_rf_accuracy))
print(" - Recall   : ",'{:.3f}'.format(knn_rf_recall))
print(" - F1 score : ",'{:.3f}'.format(knn_rf_f1))
print(" - AUC score: ",'{:.3f}'.format(knn_rf_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_knn_test,knn_rf_pred))

### 3.1.3. XGBoost: KNN Imputation - Imbalanced Data

In [None]:
# Initiate and fit an XGBoost classifier on the KNN-imputed data
knn_xgb = XGBClassifier()
knn_xgb.fit(X_knn_train, y_knn_train.ravel())

# Predict the class of the held-out samples
knn_xgb_pred = knn_xgb.predict(X_knn_test)

# Calculate Accuracy, Precision, Recall, and F1 score
knn_xgb_accuracy = accuracy_score(y_knn_test, knn_xgb_pred)
knn_xgb_precision = precision_score(y_knn_test, knn_xgb_pred)
knn_xgb_recall = recall_score(y_knn_test, knn_xgb_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
knn_xgb_f1 = f1_score(y_knn_test, knn_xgb_pred)

# Calculate AUC score from the predicted probability of the positive class
knn_xgb_probs = knn_xgb.predict_proba(X_knn_test)[:, 1]
knn_xgb_auc = roc_auc_score(y_knn_test, knn_xgb_probs)

# Display the metrics (this model uses the KNN-imputed data, so the header says KNN)
print("XGBoost: KNN Imputation")
print(" - Accuracy : ",'{:.3f}'.format(knn_xgb_accuracy))
print(" - Recall   : ",'{:.3f}'.format(knn_xgb_recall))
print(" - F1 score : ",'{:.3f}'.format(knn_xgb_f1))
print(" - AUC score: ",'{:.3f}'.format(knn_xgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_knn_test,knn_xgb_pred))

## 3.2. Prediction with KNN Imputation - SMOTE

### Apply SMOTE

In [None]:
# The target is imbalanced, so oversample the minority class of the training
# split with SMOTE (the test split is left untouched)
smote = SMOTE()
# fit_resample is the supported method name; the older fit_sample alias is
# no longer available in current imbalanced-learn releases
X_smote_knn_train, y_smote_knn_train = smote.fit_resample(X_knn_train, y_knn_train)

# Check that SMOTE was applied: show the class counts after resampling
y_smote_knn_train.value_counts()

### 3.2.1. Logistic Regression: KNN Imputation - SMOTE

In [None]:
# Initiate and fit a logistic regression on the SMOTE-resampled, KNN-imputed training data
smote_knn_lm = LogisticRegression()
smote_knn_lm.fit(X_smote_knn_train, y_smote_knn_train.ravel())

# Predict the class of the (not resampled) held-out samples
smote_knn_lm_pred = smote_knn_lm.predict(X_knn_test)

# Calculate Accuracy, Precision, Recall, and F1 score
smote_knn_lm_accuracy = accuracy_score(y_knn_test, smote_knn_lm_pred)
smote_knn_lm_precision = precision_score(y_knn_test, smote_knn_lm_pred)
smote_knn_lm_recall = recall_score(y_knn_test, smote_knn_lm_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
smote_knn_lm_f1 = f1_score(y_knn_test, smote_knn_lm_pred)

# Calculate AUC score from the predicted probability of the positive class
smote_knn_lm_probs = smote_knn_lm.predict_proba(X_knn_test)[:, 1]
smote_knn_lm_auc = roc_auc_score(y_knn_test, smote_knn_lm_probs)

# Display the metrics
print("Logistic Regression: Imputation with KNN: SMOTE")
print(" - Accuracy : ",'{:.3f}'.format(smote_knn_lm_accuracy))
print(" - Recall   : ",'{:.3f}'.format(smote_knn_lm_recall))
print(" - F1 score : ",'{:.3f}'.format(smote_knn_lm_f1))
print(" - AUC score: ",'{:.3f}'.format(smote_knn_lm_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_knn_test,smote_knn_lm_pred))

### 3.2.2. Random Forest: KNN Imputation - SMOTE

In [None]:
# Initiate and fit a random forest on the SMOTE-resampled, KNN-imputed training data
smote_knn_rf = RandomForestClassifier()
smote_knn_rf.fit(X_smote_knn_train, y_smote_knn_train.ravel())

# Predict the class of the (not resampled) held-out samples
smote_knn_rf_pred = smote_knn_rf.predict(X_knn_test)

# Calculate Accuracy, Precision, Recall, and F1 score
smote_knn_rf_accuracy = accuracy_score(y_knn_test, smote_knn_rf_pred)
smote_knn_rf_precision = precision_score(y_knn_test, smote_knn_rf_pred)
smote_knn_rf_recall = recall_score(y_knn_test, smote_knn_rf_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
smote_knn_rf_f1 = f1_score(y_knn_test, smote_knn_rf_pred)

# Calculate AUC score from the predicted probability of the positive class
smote_knn_rf_probs = smote_knn_rf.predict_proba(X_knn_test)[:, 1]
smote_knn_rf_auc = roc_auc_score(y_knn_test, smote_knn_rf_probs)

# Display the metrics (this model uses the KNN-imputed data, so the header says KNN)
print("Random Forest: Imputation with KNN: SMOTE")
print(" - Accuracy : ",'{:.3f}'.format(smote_knn_rf_accuracy))
print(" - Recall   : ",'{:.3f}'.format(smote_knn_rf_recall))
print(" - F1 score : ",'{:.3f}'.format(smote_knn_rf_f1))
print(" - AUC score: ",'{:.3f}'.format(smote_knn_rf_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_knn_test,smote_knn_rf_pred))

### 3.2.3. XGBoost: KNN Imputation - SMOTE

In [None]:
# Initiate and fit an XGBoost classifier on the SMOTE-resampled, KNN-imputed training data
smote_knn_xgb = XGBClassifier()
smote_knn_xgb.fit(X_smote_knn_train, y_smote_knn_train.ravel())

# Predict the class of the (not resampled) held-out samples
smote_knn_xgb_pred = smote_knn_xgb.predict(X_knn_test)

# Calculate Accuracy, Precision, Recall, and F1 score
smote_knn_xgb_accuracy = accuracy_score(y_knn_test, smote_knn_xgb_pred)
smote_knn_xgb_precision = precision_score(y_knn_test, smote_knn_xgb_pred)
smote_knn_xgb_recall = recall_score(y_knn_test, smote_knn_xgb_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
smote_knn_xgb_f1 = f1_score(y_knn_test, smote_knn_xgb_pred)

# Calculate AUC score from the predicted probability of the positive class
smote_knn_xgb_probs = smote_knn_xgb.predict_proba(X_knn_test)[:, 1]
smote_knn_xgb_auc = roc_auc_score(y_knn_test, smote_knn_xgb_probs)

# Display the metrics (this model uses the KNN-imputed data, so the header says KNN)
print("XGBoost: Imputation with KNN: SMOTE")
print(" - Accuracy : ",'{:.3f}'.format(smote_knn_xgb_accuracy))
print(" - Recall   : ",'{:.3f}'.format(smote_knn_xgb_recall))
print(" - F1 score : ",'{:.3f}'.format(smote_knn_xgb_f1))
print(" - AUC score: ",'{:.3f}'.format(smote_knn_xgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_knn_test,smote_knn_xgb_pred))

# 4. Data without Dropping and Imputation

In [None]:
# Target is the 'target' column; features are everything except the
# identifier, the city code and the target itself
y = df_train['target']
X = df_train.drop(columns=['enrollee_id', 'city', 'target'])

In [None]:
# Replace each nominal variable by its dummy (indicator) columns; dummy_na=True
# adds an extra indicator column for NaN
for nominal in ("gender", "enrolled_university", "major_discipline", "company_type"):
    indicator_cols = pd.get_dummies(X[nominal], prefix=nominal, dummy_na=True)
    X = pd.concat([X, indicator_cols], axis=1).drop(columns=[nominal])

In [None]:
# Hold out 30% of the data, stratified on the target, for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    stratify=y,
    random_state=101,
)

## 4.1. Prediction with Data without Imputation  - Imbalanced Data

### 4.1.1. XGBoost Classifier with Data without Imputation: Imbalanced Data

In [None]:
# Initiate the model
base_xgb = XGBClassifier()
# Fit the model (fit returns the fitted estimator, kept under a second name)
base_xgb_model = base_xgb.fit(X_train, y_train.ravel())
# Predict the class of the held-out samples
base_xgb_pred = base_xgb_model.predict(X_test)

# Calculate Accuracy, Precision, Recall, and F1 score
base_xgb_accuracy = accuracy_score(y_test, base_xgb_pred)
base_xgb_precision = precision_score(y_test, base_xgb_pred)
base_xgb_recall = recall_score(y_test, base_xgb_pred)
# f1_score handles the precision + recall == 0 case instead of dividing by zero
base_xgb_f1 = f1_score(y_test, base_xgb_pred)

# Calculate AUC score from the predicted probability of the positive class
base_xgb_probs = base_xgb.predict_proba(X_test)[:, 1]
base_xgb_auc = roc_auc_score(y_test, base_xgb_probs)

# Display the metrics
print("XGBClassifier: Imbalanced Data")
print(" - Accuracy : ",'{:.3f}'.format(base_xgb_accuracy))
print(" - Recall   : ",'{:.3f}'.format(base_xgb_recall))
print(" - F1 score : ",'{:.3f}'.format(base_xgb_f1))
print(" - AUC score: ",'{:.3f}'.format(base_xgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_test,base_xgb_pred))

### 4.1.2. LightGBM Classifier with Data without Imputation: Imbalanced Data

In [None]:
# import LightGBM
import lightgbm as lgb

In [None]:
# Baseline LightGBM on the non-imputed features, with no class rebalancing.
# Initiate the model
base_lgb = lgb.LGBMClassifier()
# Fit the model. np.ravel() yields the 1-D label array without calling
# Series.ravel(), which pandas has deprecated.
base_lgb_model = base_lgb.fit(X_train, np.ravel(y_train))
# Make Predictions
base_lgb_pred = base_lgb_model.predict(X_test)

# Calculate Accuracy, Precision, Recall, and F1 score
base_lgb_accuracy = accuracy_score(y_test, base_lgb_pred)
base_lgb_precision = precision_score(y_test, base_lgb_pred)
base_lgb_recall = recall_score(y_test, base_lgb_pred)
# F1 is the harmonic mean of precision and recall; report 0 instead of
# dividing by zero when the model predicts no true positives at all.
base_lgb_f1 = (
    2 * (base_lgb_precision * base_lgb_recall) / (base_lgb_precision + base_lgb_recall)
    if (base_lgb_precision + base_lgb_recall) > 0
    else 0.0
)

# Calculate AUC score from the predicted probability of the positive class (column 1)
base_lgb_probs = base_lgb_model.predict_proba(X_test)[:, 1]
base_lgb_auc = roc_auc_score(y_test, base_lgb_probs)

# Display the metrics
print("LightGBM Classifier: Imbalanced Data")
print(" - Accuracy : ",'{:.3f}'.format(base_lgb_accuracy))
print(" - Precision: ",'{:.3f}'.format(base_lgb_precision))
print(" - Recall   : ",'{:.3f}'.format(base_lgb_recall))
print(" - F1 score : ",'{:.3f}'.format(base_lgb_f1))
print(" - AUC score: ",'{:.3f}'.format(base_lgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_test, base_lgb_pred))

## 4.2. Prediction with Data without Imputation - Class Weighted

The dataset is imbalanced, so compensate by up-weighting the positive class with `scale_pos_weight`.

### Rebalance Data by Weighting Classes

In [None]:
# Count how many samples fall into each target class; the cell output shows
# the class sizes that the class weighting below is based on.
y.value_counts()

In [None]:
# T - no. of total samples
# P - no. of positive samples (target == 1)
# scale_pos_weight = percent of negative / percent of positive
# which translates to:
# scale_pos_weight = (100*(T-P)/T) / (100*P/T)
# which further simplifies to:
# scale_pos_weight = T/P - 1
#
# The counts are read from y instead of being typed in by hand, so the weight
# stays correct if the dataset or the preprocessing changes.
T = len(y)
P = int((y == 1).sum())
scale_pos_weight = T/P - 1
scale_pos_weight

### 4.2.1. XGB Classifier with Data without Imputation: Weighted

In [None]:
# XGBoost on the non-imputed features, with the positive class up-weighted
# by scale_pos_weight to counter the class imbalance.
# Initiate the model
weighted_xgb = XGBClassifier(scale_pos_weight=scale_pos_weight)
# Fit the model. np.ravel() yields the 1-D label array without calling
# Series.ravel(), which pandas has deprecated.
weighted_xgb_model = weighted_xgb.fit(X_train, np.ravel(y_train))
# Make Predictions
weighted_xgb_pred = weighted_xgb_model.predict(X_test)

# Calculate Accuracy, Precision, Recall, and F1 score
weighted_xgb_accuracy = accuracy_score(y_test, weighted_xgb_pred)
weighted_xgb_precision = precision_score(y_test, weighted_xgb_pred)
weighted_xgb_recall = recall_score(y_test, weighted_xgb_pred)
# F1 is the harmonic mean of precision and recall; report 0 instead of
# dividing by zero when the model predicts no true positives at all.
weighted_xgb_f1 = (
    2 * (weighted_xgb_precision * weighted_xgb_recall) / (weighted_xgb_precision + weighted_xgb_recall)
    if (weighted_xgb_precision + weighted_xgb_recall) > 0
    else 0.0
)

# Calculate AUC score from the predicted probability of the positive class (column 1)
weighted_xgb_probs = weighted_xgb_model.predict_proba(X_test)[:, 1]
weighted_xgb_auc = roc_auc_score(y_test, weighted_xgb_probs)

# Display the metrics
print("XGBClassifier: Weighted")
print(" - Accuracy : ",'{:.3f}'.format(weighted_xgb_accuracy))
print(" - Precision: ",'{:.3f}'.format(weighted_xgb_precision))
print(" - Recall   : ",'{:.3f}'.format(weighted_xgb_recall))
print(" - F1 score : ",'{:.3f}'.format(weighted_xgb_f1))
print(" - AUC score: ",'{:.3f}'.format(weighted_xgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_test, weighted_xgb_pred))

### 4.2.2. LightGBM Classifier with Data without Imputation: Weighted

In [None]:
# LightGBM on the non-imputed features, with the positive class up-weighted
# by scale_pos_weight to counter the class imbalance.
# Initiate the model
weighted_lgb = lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight)
# Fit the model. np.ravel() yields the 1-D label array without calling
# Series.ravel(), which pandas has deprecated.
weighted_lgb_model = weighted_lgb.fit(X_train, np.ravel(y_train))
# Make Predictions
weighted_lgb_pred = weighted_lgb_model.predict(X_test)

# Calculate Accuracy, Precision, Recall, and F1 score
weighted_lgb_accuracy = accuracy_score(y_test, weighted_lgb_pred)
weighted_lgb_precision = precision_score(y_test, weighted_lgb_pred)
weighted_lgb_recall = recall_score(y_test, weighted_lgb_pred)
# F1 is the harmonic mean of precision and recall; report 0 instead of
# dividing by zero when the model predicts no true positives at all.
weighted_lgb_f1 = (
    2 * (weighted_lgb_precision * weighted_lgb_recall) / (weighted_lgb_precision + weighted_lgb_recall)
    if (weighted_lgb_precision + weighted_lgb_recall) > 0
    else 0.0
)

# Calculate AUC score from the predicted probability of the positive class (column 1)
weighted_lgb_probs = weighted_lgb_model.predict_proba(X_test)[:, 1]
weighted_lgb_auc = roc_auc_score(y_test, weighted_lgb_probs)

# Display the metrics
print("LightGBM Classifier: Weighted")
print(" - Accuracy : ",'{:.3f}'.format(weighted_lgb_accuracy))
print(" - Precision: ",'{:.3f}'.format(weighted_lgb_precision))
print(" - Recall   : ",'{:.3f}'.format(weighted_lgb_recall))
print(" - F1 score : ",'{:.3f}'.format(weighted_lgb_f1))
print(" - AUC score: ",'{:.3f}'.format(weighted_lgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_test, weighted_lgb_pred))

- The LightGBM classifier with `scale_pos_weight` achieved the highest performance.
- So let's tune the parameters of this model.

## 5. Summary of Results

Let's summarize the results:
  - For data: 1) Mode imputation, 2) KNN imputation, 3) Without imputation
  - For P/N balance: SMOTE rebalanced or Positive weighted
  - For algorithm: Logistic regression, Random forest, XGBoost, LightGBM

In [None]:
# Summarize results with Tables
# Show every float in the notebook with three decimals.
pd.options.display.float_format = '{:.3f}'.format

# One row per model/data combination: [recall, F1, AUC] measured on the test set.
pf_rows = {
    'Logistic: Mode Imputation': [smote_imp_lm_recall, smote_imp_lm_f1, smote_imp_lm_auc],
    'Random Forest: Mode Imputation': [smote_imp_rf_recall, smote_imp_rf_f1, smote_imp_rf_auc],
    'XGBoost: Mode Imputation': [smote_imp_xgb_recall, smote_imp_xgb_f1, smote_imp_xgb_auc],
    'Logistic: KNN Imputation': [smote_knn_lm_recall, smote_knn_lm_f1, smote_knn_lm_auc],
    'Random Forest: KNN Imputation': [smote_knn_rf_recall, smote_knn_rf_f1, smote_knn_rf_auc],
    'XGBoost: KNN Imputation': [smote_knn_xgb_recall, smote_knn_xgb_f1, smote_knn_xgb_auc],
    'XGBoost: No Imputation': [weighted_xgb_recall, weighted_xgb_f1, weighted_xgb_auc],
    'LightGBM: No Imputation': [weighted_lgb_recall, weighted_lgb_f1, weighted_lgb_auc],
}

# Keep the plain list of metric rows available under its original name.
pf_list = list(pf_rows.values())

pf_df = pd.DataFrame(
    pf_list,
    index=list(pf_rows.keys()),
    columns=['Recall', 'F1', 'AUC'],
)

pf_df

- Weighted LightGBM without imputation achieves the best performance.
- So let's tune its hyperparameters to maximize AUC.

## 6. Gridsearch for LightGBM

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Set parameters
parameters = [{
    'learning_rate':[0.01, 0.1,0.2],
    'n_estimators':[10, 20,30,40],
    'max_depth':[3,5,7,9,10],
    'verbose':[-1],
    'min_data_in_leaf':[30,40,50,60,70],
    'num_leaves':[10, 20, 30]
}]

# Grid Search: Maximize AUC score
classifier = GridSearchCV(lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight), parameters, scoring='roc_auc', cv=3, n_jobs=-1)
classifier.fit(X_train, y_train)
print("Accuracy score (train): ", classifier.score(X_train, y_train))
print("Accuracy score (test): ", classifier.score(X_test, y_test))
print(classifier.best_estimator_) # Best parameter

In [None]:
# Refit LightGBM with tuned hyperparameters and evaluate it on the test set.
# NOTE(review): the values below look like they were copied from the grid-search
# output (classifier.best_estimator_); re-check them if the grid search is re-run.
# Initiate the model
best_lgb = lgb.LGBMClassifier(learning_rate=0.2, max_depth=7, min_data_in_leaf=30, n_estimators=40, num_leaves=10,verbose=-1,
                              scale_pos_weight=scale_pos_weight)
# Fit the model. np.ravel() yields the 1-D label array without calling
# Series.ravel(), which pandas has deprecated.
best_lgb_model = best_lgb.fit(X_train, np.ravel(y_train))
# Make Predictions
best_lgb_pred = best_lgb_model.predict(X_test)

# Calculate Accuracy, Precision, Recall, and F1 score
best_lgb_accuracy = accuracy_score(y_test, best_lgb_pred)
best_lgb_precision = precision_score(y_test, best_lgb_pred)
best_lgb_recall = recall_score(y_test, best_lgb_pred)
# F1 is the harmonic mean of precision and recall; report 0 instead of
# dividing by zero when the model predicts no true positives at all.
best_lgb_f1 = (
    2 * (best_lgb_precision * best_lgb_recall) / (best_lgb_precision + best_lgb_recall)
    if (best_lgb_precision + best_lgb_recall) > 0
    else 0.0
)

# Calculate AUC score from the predicted probability of the positive class (column 1)
best_lgb_probs = best_lgb_model.predict_proba(X_test)[:, 1]
best_lgb_auc = roc_auc_score(y_test, best_lgb_probs)

# Display the metrics
# This is the tuned, class-weighted model, so the banner no longer says "Imbalanced Data".
print("LightGBM Classifier: Tuned (Weighted)")
print(" - Accuracy : ",'{:.3f}'.format(best_lgb_accuracy))
print(" - Precision: ",'{:.3f}'.format(best_lgb_precision))
print(" - Recall   : ",'{:.3f}'.format(best_lgb_recall))
print(" - F1 score : ",'{:.3f}'.format(best_lgb_f1))
print(" - AUC score: ",'{:.3f}'.format(best_lgb_auc))

# Display the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_test, best_lgb_pred))

#### The AUC score slightly improved.