In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing all libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
import six
import sys
sys.modules['sklearn.externals.six'] = six
from imblearn.over_sampling import SVMSMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.inspection import permutation_importance
import eli5
from eli5.sklearn import PermutationImportance

Importing train data

In [None]:
# Extra tokens that read_csv should treat as missing values.
missing_values = ["n/a", "na", "--", "NONE", "None", "none", "NA", "N/A",'inf','-inf', '?', 'Null', 'NULL']
train_data = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv', na_values = missing_values)
# Remove the two columns that are not used as features. `columns=` is used because the
# positional axis argument of DataFrame.drop is no longer accepted by pandas 2.0+.
train_data = train_data.drop(columns=['enrollee_id', 'city'])
train_data.head()

In [None]:
# (rows, columns) of the training frame after enrollee_id and city were dropped.
train_data.shape

In [None]:
# Per-column dtype and non-null count summary.
train_data.info()

In the dataset there is a human error in the `company_size` column: one value appears as Oct-49, which pandas reads as 10/49, so we convert it to np.nan (NaN).

In [None]:
# Print the company_size distribution, blank out the '10/49' entries, then print the
# distribution again so the effect of the replacement is visible.
# NOTE(review): '10/49' may be the 10-49 employee bucket mangled by a spreadsheet export;
# confirm whether it should be recoded instead of being turned into a missing value.
company_size = train_data['company_size']
print(company_size.value_counts())
train_data['company_size'] = company_size.replace({'10/49': np.nan})
print("==============================")
print(train_data['company_size'].value_counts())

Just checking total unique values in every column

In [None]:
# Report how many distinct values (a missing value counts as one) each int64, float64
# or object column holds.
for col_name, column in train_data.items():
    is_reported_dtype = (column.dtypes == 'int64' or column.dtypes == 'float64' or column.dtypes == 'object')
    if not is_reported_dtype:
        continue
    unique_cat = column.nunique(dropna=False)
    print("Feature '{col_name}' has '{unique_cat}' unique categories".format(col_name = col_name, unique_cat = unique_cat))

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Categorical columns that are turned into integer codes.
categorical_columns = [
    'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline', 'experience',
    'company_size', 'company_type', 'last_new_job',
]
to_LabelEncode = train_data[categorical_columns]

# Encode every column on its string form (missing values become the string 'nan'),
# then put the original missing values back so they can be imputed later.
le = LabelEncoder()
train_temp = to_LabelEncode.astype(str).apply(le.fit_transform)
train_final = train_temp.mask(to_LabelEncode.isna(), to_LabelEncode)

In [None]:
# Remove the raw categorical columns; their encoded versions live in train_final.
# `columns=` replaces the positional axis argument, which pandas 2.0+ no longer accepts.
train_data = train_data.drop(columns=['gender', 'relevent_experience','enrolled_university', 'education_level', 'major_discipline','experience', 'company_size', 'company_type', 'last_new_job'])

In [None]:
# Put the encoded categorical columns in front of the columns left in train_data.
train_data = train_final.join(train_data)
train_data

MICE (Multiple Imputation by Chained Equations) imputation. It is a multiple-imputation method and is generally better than single-imputation methods such as mean imputation.

In [None]:
# MICE imputation of the encoded features.
# The label is kept out of the imputer: feeding `target` into the chained regressions
# leaks the label into the imputed feature values, and the unlabeled test data cannot
# be imputed with the same inputs.
train_features = train_data.drop(columns='target')

lr = LinearRegression()
mice_imputer = IterativeImputer(random_state=42, estimator=lr, max_iter=10, n_nearest_features=2, imputation_order = 'roman')

# Column names are taken from the frame itself instead of a hand-maintained list.
train_final_df = pd.DataFrame(mice_imputer.fit_transform(train_features), columns=train_features.columns)
# Re-attach the untouched label as the last column (same layout as before).
train_final_df['target'] = train_data['target'].to_numpy()

train_final_df

Now we don't have any null values.

In [None]:
# Continue on a copy of the imputed frame and count the missing values left per column.
final_train = train_final_df.copy()
final_train.isnull().sum()

Heavy class imbalance is present in the data

In [None]:
# Number of rows in each target class.
final_train.target.value_counts()

Splitting into X and y, and then standardizing the features using StandardScaler

In [None]:
# Separate the features from the label. `columns=` replaces the positional axis
# argument, which pandas 2.0+ no longer accepts.
X = final_train.drop(columns='target')
y = final_train.target

# Hold out 25% of the rows.
X_train,X_test,y_train,y_test = tts(X,y,test_size=0.25, random_state=42)

# Fit the scaler on the training part only, then apply it to the held-out part.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

I applied SVMSMOTE. I also tried other SMOTE variants: SMOTE, SMOTE-NC, KMeansSMOTE, ADASYN and BorderlineSMOTE. KMeansSMOTE also gave very good results, but there was a compatibility issue: KMeansSMOTE only worked with scikit-learn 0.20, while MICE imputation requires a newer scikit-learn version, so I switched from KMeansSMOTE to SVMSMOTE.

In [None]:
# Split FIRST, then oversample only the training part. Resampling before the split puts
# synthetic rows (interpolated from training rows) into the evaluation set, which leaks
# information and inflates every score computed on X_test_svm / y_test_svm.
X_train_raw, X_test_raw, y_train_raw, y_test_svm = tts(X, y, test_size=0.25, random_state=42)

svm_smote = SVMSMOTE(sampling_strategy='minority', random_state=42, k_neighbors=5)
# X_svm_smote / y_svm_smote now hold the oversampled TRAINING data only.
X_svm_smote, y_svm_smote = svm_smote.fit_resample(X_train_raw, y_train_raw)
y_train_svm = y_svm_smote

# The scaler is fitted on the resampled training data and reused for the untouched
# held-out rows (and later for the competition test set).
sc = StandardScaler()
X_train_svm = sc.fit_transform(X_svm_smote)
X_test_svm = sc.transform(X_test_raw)

In [None]:
def evaluate(model, X_test, y_test):
    """Print classification diagnostics for a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix to predict on.
    y_test : true labels, aligned row by row with ``X_test``.

    Prints the misclassification rate, the classification report, the confusion
    matrix and the recall, precision and F1 scores.

    Returns
    -------
    None. The previous version returned the function object itself, which only
    made the notebook display ``<function evaluate ...>`` after the report.
    """
    y_pred = np.asarray(model.predict(X_test))
    # Compare positionally on plain arrays so a pandas index on y_test cannot
    # re-align the labels against the predictions.
    y_true = np.asarray(y_test)

    # For class labels the mean mismatch is the misclassification rate; the old
    # "Average Error ... degrees" wording came from a regression example.
    print('Misclassification rate: {:0.4f}'.format(np.mean(y_pred != y_true)))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print('Recall Score = ', recall_score(y_true, y_pred))
    print('Precision Score = ', precision_score(y_true, y_pred))
    print('F1 score = ', f1_score(y_true, y_pred))

In [None]:
def train_auc_roc_curve(model, X_test, y_test, X_train, y_train):
    """Plot the ROC curve of ``model`` on the training data and print its AUC.

    Parameters
    ----------
    model : fitted binary classifier.
    X_test, y_test : accepted for call compatibility; not used.
    X_train, y_train : training features and labels the curve is computed on.

    The curve is built from continuous scores (positive-class probability, or the
    decision function) when the model provides them. Hard ``predict`` labels are
    only a fallback, because they collapse the ROC curve to a single operating
    point. Assumes the positive class is the second column of ``predict_proba``.

    Returns
    -------
    None. The previous version returned the function object itself.
    """
    if hasattr(model, 'predict_proba'):
        train_scores = model.predict_proba(X_train)[:, 1]
    elif hasattr(model, 'decision_function'):
        train_scores = model.decision_function(X_train)
    else:
        train_scores = model.predict(X_train)

    base_fpr, base_tpr, _ = roc_curve(y_train, train_scores)

    # Chance diagonal for reference, then the model's curve.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot(base_fpr, base_tpr)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve (training data)')
    print("auc score :", auc(base_fpr, base_tpr))

Applied EasyEnsembleClassifier of imblearn

In [None]:
# EasyEnsembleClassifier wrapping a default LGBMClassifier (seeded for reproducibility).
# NOTE(review): newer imbalanced-learn releases may expect `estimator` instead of
# `base_estimator` — confirm against the installed version.
ensemble_settings = dict(
    base_estimator=LGBMClassifier(random_state=42),
    n_estimators=250,
    n_jobs=1,
    random_state=42,
    replacement=True,
    sampling_strategy='auto',
    verbose=0,
    warm_start=True,
)
easy_lgbm = EasyEnsembleClassifier(**ensemble_settings)

# Train on the resampled training split and report on the held-out split.
easy_lgbm.fit(X_train_svm, y_train_svm)
evaluate(easy_lgbm, X_test_svm, y_test_svm)

In [None]:
# Training-set diagnostics. The ensemble's predictions are computed once and reused
# instead of calling predict() on the same data for every metric.
train_pred_svm = easy_lgbm.predict(X_train_svm)
print(classification_report(y_train_svm, train_pred_svm))
print(confusion_matrix(y_train_svm, train_pred_svm))
print('Recall Score = ',recall_score(y_train_svm, train_pred_svm))
print('Precision Score = ',precision_score(y_train_svm, train_pred_svm))

Computing the F1 score on both the train and test (validation) data, and printing the predicted probabilities

In [None]:
# F1 on the training split and on the held-out split, printed in that order.
f1_on_train = f1_score(y_train_svm, easy_lgbm.predict(X_train_svm))
f1_on_test = f1_score(y_test_svm, easy_lgbm.predict(X_test_svm))
print(f1_on_train)
print(f1_on_test)

# Class probabilities for the held-out split, one column per class.
predict_proba_easy_lgbm = pd.DataFrame(easy_lgbm.predict_proba(X_test_svm))
predict_proba_easy_lgbm

I used eli5 library to find out feature importance.

In [None]:
# Permutation importance of the already-fitted ensemble, scored with F1 on the
# held-out split (n_iter=5 repetitions, seeded).
eli5_permutation = PermutationImportance(estimator = easy_lgbm, scoring = 'f1', random_state=42, n_iter = 5)
eli5_permutation.fit(X_test_svm, y_test_svm)

In [None]:
# Importances reshaped into a single column (one row per feature).
eli5_permutation.feature_importances_.T.reshape(-1,1)

This shows the lowest and highest importance of every feature

In [None]:
# Render the eli5 weights table, labelled with the column names of X.
eli5.show_weights(eli5_permutation, feature_names = X.columns.to_list())

In [None]:
# One row per feature: its name (in the column order of X) and its permutation importance.
feature_importance_with_eli5 = pd.DataFrame({
    'feature': X.columns.to_list(),
    'importance': np.asarray(eli5_permutation.feature_importances_, dtype=float).ravel(),
})
# Display the table with the most important feature first; the stored frame keeps
# the original column order.
feature_importance_with_eli5.sort_values(by='importance', ascending=False)

Gender is the most important factor in predicting whether a candidate will change jobs, followed by City Development Index and Company Type.

In [None]:
# Horizontal bar chart of the permutation importances, largest bar on top.
plt.figure(figsize=(15, 8))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
features_by_importance = feature_importance_with_eli5.sort_values('importance', ascending=False)['feature']
sns.barplot(data=feature_importance_with_eli5, x='importance', y='feature',
            order=features_by_importance)

This is a very good score (note that it is computed on the training data, not on the held-out data).

In [None]:
# train_auc_roc_curve only reads its X_train / y_train arguments, so this plots the
# ROC curve and prints the AUC for the training split.
train_auc_roc_curve(easy_lgbm, X_test_svm, y_test_svm, X_train_svm, y_train_svm)

Test Data

In [None]:
# Extra tokens that read_csv should treat as missing values (same list as for the train file).
missing_values = ["n/a", "na", "--", "NONE", "None", "none", "NA", "N/A",'inf','-inf', '?', 'Null', 'NULL']
test_data = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv', na_values= missing_values)
# Remove the two columns that are not used as features. `columns=` is used because the
# positional axis argument of DataFrame.drop is no longer accepted by pandas 2.0+.
test_data = test_data.drop(columns=['enrollee_id', 'city'])
test_data.head()

In [None]:
# Number of missing values in each test column.
test_data.isnull().sum()

In [None]:
# Blank out the '10/49' entries of company_size, as was done for the training data,
# then show the remaining distribution.
# NOTE(review): '10/49' may be the 10-49 employee bucket mangled by a spreadsheet export;
# confirm whether it should be recoded instead of being turned into a missing value.
test_data['company_size'] = test_data['company_size'].replace('10/49', np.nan)
test_data['company_size'].value_counts()

In [None]:
to_LabelEncode_test = test_data[['gender', 'relevent_experience',
       'enrolled_university', 'education_level', 'major_discipline',
       'experience', 'company_size', 'company_type', 'last_new_job']]

# Encode the test categories with the codes the TRAINING data received. Re-fitting the
# encoder on the test data assigns codes from the test set's own sorted categories, so
# a category that is absent (or extra) in the test file would shift the codes and the
# model would see different numbers for the same category.
test_temp = pd.DataFrame(index=to_LabelEncode_test.index)
for column_name in to_LabelEncode_test.columns:
    # Same ordering rule the encoder applied to the training column: sorted unique
    # string values (missing values appear as the string 'nan').
    train_classes = sorted(set(to_LabelEncode[column_name].astype("str")))
    code_of = {label: code for code, label in enumerate(train_classes)}
    # Categories never seen in training map to NaN and are imputed later.
    test_temp[column_name] = to_LabelEncode_test[column_name].astype("str").map(code_of)

# Put the original missing values back so they can be imputed.
test_final = test_temp.where(~to_LabelEncode_test.isna(), to_LabelEncode_test)

In [None]:
# Remove the raw categorical columns; their encoded versions live in test_final.
# `columns=` replaces the positional axis argument, which pandas 2.0+ no longer accepts.
test_data = test_data.drop(columns=['gender', 'relevent_experience','enrolled_university', 'education_level', 'major_discipline','experience', 'company_size', 'company_type', 'last_new_job'])

In [None]:
# Put the encoded categorical columns in front of the columns left in test_data.
test_data = test_final.join(test_data)

In [None]:
# Impute the test features with an imputer fitted on the TRAINING features, so the test
# rows are filled by the same relationships the model's training data was filled with.
# Re-fitting on the test set would learn a different imputation from the test data.
# train_data still holds the encoded, not-yet-imputed training frame; the label is
# excluded because the test file has no target column.
train_features = train_data.drop(columns='target')
mice_imputer.fit(train_features)

# Select the columns in training order and reuse their names instead of a hand-written list.
test_final_df = pd.DataFrame(
    mice_imputer.transform(test_data[train_features.columns]),
    columns=train_features.columns,
)

test_final_df

In [None]:
# Scale the test features with sc, the scaler last fitted on X_train_svm.
# NOTE: the result overwrites test_final_df, so re-running this cell scales twice.
test_final_df = sc.transform(test_final_df)

In [None]:
# Hard class predictions for the test set, and how many rows fall in each class.
prediction = pd.DataFrame(easy_lgbm.predict(test_final_df))
prediction.value_counts()

In [None]:
# Turn the predicted class into a readable decision: values above the threshold are
# labelled as joining, everything else as not joining.
threshold = 0.5
decision_labels = np.where(prediction > threshold, 'Will join the company', 'Will not join the company')

# One decision per row, in a single 'Decision' column.
my_pred = pd.DataFrame(decision_labels.T.reshape(-1, 1), columns=['Decision'])
my_pred

In [None]:
# Append the two predicted class probabilities next to each decision and give the
# probability columns readable names.
test_probabilities = pd.DataFrame(easy_lgbm.predict_proba(test_final_df))
probability_names = {0 : 'Probablity of not joining', 1 : 'Probablity of joining'}
my_pred = (
    my_pred
    .join(test_probabilities, lsuffix='_right', rsuffix='_left')
    .rename(probability_names, axis=1)
)
my_pred

That's it, the project is completed.

What I have done:
1. Loaded Libraries and train data
2. Deleted the unwanted columns.
3. Cleaned some Human Error
4. Label Encoded the data
5. Missing value Imputation via MICE technique
6. Checked for Class Imbalance
7. Split the data into X and y and standardized it.
8. Applied SVMSmote and solved class imbalance issue.
9. Applied the Easy Ensemble Classifier model from the imblearn package, with a default LGBMClassifier as the base estimator
10. Checked the feature importance according to the model using eli5 library
11. Finalized the Easy Ensemble Classifier model with base estimator as Default LGBMClassifier.
12. Predicted on Test Data. 