Importing all libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
import six
import sys
sys.modules['sklearn.externals.six'] = six
from imblearn.over_sampling import SVMSMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.inspection import permutation_importance
import eli5
from eli5.sklearn import PermutationImportance

Importing train data

In [2]:
missing_values = ["n/a", "na", "--", "NONE", "None", "none", "NA", "N/A",'inf','-inf', '?', 'Null', 'NULL']
# train_data = pd.read_csv('aug_train.csv', na_values = missing_values)
train_data = pd.read_csv('data/aug_train.csv')
train_data.drop(['enrollee_id', 'city'], 1, inplace=True)
train_data.head(10)

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
5,0.764,,Has relevent experience,Part time course,Graduate,STEM,11,,,1,24,1.0
6,0.92,Male,Has relevent experience,no_enrollment,High School,,5,50-99,Funded Startup,1,24,0.0
7,0.762,Male,Has relevent experience,no_enrollment,Graduate,STEM,13,<10,Pvt Ltd,>4,18,1.0
8,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,50-99,Pvt Ltd,1,46,1.0
9,0.92,,Has relevent experience,no_enrollment,Graduate,STEM,17,10000+,Pvt Ltd,>4,123,0.0


In [3]:
train_data.shape

(19158, 12)

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_development_index  19158 non-null  float64
 1   gender                  14650 non-null  object 
 2   relevent_experience     19158 non-null  object 
 3   enrolled_university     18772 non-null  object 
 4   education_level         18698 non-null  object 
 5   major_discipline        16345 non-null  object 
 6   experience              19093 non-null  object 
 7   company_size            13220 non-null  object 
 8   company_type            13018 non-null  object 
 9   last_new_job            18735 non-null  object 
 10  training_hours          19158 non-null  int64  
 11  target                  19158 non-null  float64
dtypes: float64(2), int64(1), object(9)
memory usage: 1.8+ MB


In the dataset, there is some Human error in column company size i.e. Oct-49 and in pandas it was printed as 10/49, so we need to convert into np.nan(NaN) 

In [5]:
# print(train_data.company_size.value_counts())
# train_data['company_size'] = train_data['company_size'].replace('10/49', np.nan)
# print("==============================")
print(train_data.company_size.value_counts())

50-99        3083
100-500      2571
10000+       2019
10/49        1471
1000-4999    1328
<10          1308
500-999       877
5000-9999     563
Name: company_size, dtype: int64


Just checking total unique values in every column

In [6]:
for col_name in train_data.columns:
  if (train_data[col_name].dtypes == 'int64' or train_data[col_name].dtypes == 'float64' or train_data[col_name].dtypes == 'object'):
    unique_cat = len(train_data[col_name].unique())
    print("Feature '{col_name}' has '{unique_cat}' unique categories".format(col_name = col_name, unique_cat = unique_cat))

Feature 'city_development_index' has '93' unique categories
Feature 'gender' has '4' unique categories
Feature 'relevent_experience' has '2' unique categories
Feature 'enrolled_university' has '4' unique categories
Feature 'education_level' has '6' unique categories
Feature 'major_discipline' has '7' unique categories
Feature 'experience' has '23' unique categories
Feature 'company_size' has '9' unique categories
Feature 'company_type' has '7' unique categories
Feature 'last_new_job' has '7' unique categories
Feature 'training_hours' has '241' unique categories
Feature 'target' has '2' unique categories


In [7]:
train_data.isnull().sum()

city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [8]:
to_LabelEncode = train_data[['gender', 'relevent_experience',
       'enrolled_university', 'education_level', 'major_discipline',
       'experience', 'company_size', 'company_type', 'last_new_job']]

le = LabelEncoder()
train_temp = to_LabelEncode.astype("str").apply(le.fit_transform)
train_final = train_temp.where(~to_LabelEncode.isna(), to_LabelEncode)

In [9]:
train_final

Unnamed: 0,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,1,0,3,0,5,21,,,0
1,1,1,3,0,5,6,4,5,4
2,,1,0,0,5,15,,,6
3,,1,,0,1,20,,5,6
4,1,0,3,2,5,21,4,1,3
...,...,...,...,...,...,...,...,...,...
19153,1,1,3,0,2,5,,,0
19154,1,0,3,0,5,5,,,3
19155,1,0,3,0,5,21,4,5,3
19156,1,0,3,1,,20,5,5,1


In [10]:
train_data.drop(['gender', 'relevent_experience','enrolled_university', 'education_level', 'major_discipline','experience', 'company_size', 'company_type', 'last_new_job'],1,inplace=True)

In [11]:
train_data = train_final.join(train_data)

MICE (Multiple Imputation by Chained Equations) Imputation. Its a multiple imputation method, it is generally better than  single imputation method like mean imputation.

In [12]:
lr = LinearRegression()
mice_imputer = IterativeImputer(random_state=42, estimator=lr, max_iter=10, n_nearest_features=2, imputation_order = 'roman')
train_final_df = mice_imputer.fit_transform(train_data)

train_final_df = pd.DataFrame(train_final_df)
train_final_df.columns = ['gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline',
                                                         'experience', 'company_size', 'company_type', 'last_new_job', 'city_development_index', 'training_hours', 'target']
                                                        
train_final_df



Unnamed: 0,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,city_development_index,training_hours,target
0,1.000000,0.0,3.000000,0.0,5.000000,21.0,3.008503,4.070495,0.0,0.920,36.0,1.0
1,1.000000,1.0,3.000000,0.0,5.000000,6.0,4.000000,5.000000,4.0,0.776,47.0,0.0
2,0.953038,1.0,0.000000,0.0,5.000000,15.0,3.036662,4.713792,6.0,0.624,83.0,0.0
3,0.940145,1.0,1.907067,0.0,1.000000,20.0,2.923227,5.000000,6.0,0.789,52.0,1.0
4,1.000000,0.0,3.000000,2.0,5.000000,21.0,4.000000,1.000000,3.0,0.767,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19153,1.000000,1.0,3.000000,0.0,2.000000,5.0,3.019819,4.069892,0.0,0.878,42.0,1.0
19154,1.000000,0.0,3.000000,0.0,5.000000,5.0,2.978869,4.394476,3.0,0.920,52.0,1.0
19155,1.000000,0.0,3.000000,0.0,5.000000,21.0,4.000000,5.000000,3.0,0.920,44.0,0.0
19156,1.000000,0.0,3.000000,1.0,4.655379,20.0,5.000000,5.000000,1.0,0.802,97.0,0.0


Now we dont have any null values

In [13]:
final_train = train_final_df.copy()
# final_train.isnull().sum()

Heavy class imbalance is present in the data

In [14]:
final_train.target.value_counts()

0.0    14381
1.0     4777
Name: target, dtype: int64

Splitting into X and y and than standardizing it using Standard Scaler

In [15]:
X = final_train.drop('target',1)
y = final_train.target

X_train,X_test,y_train,y_test = tts(X,y,test_size=0.25, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Applied SVMSmote, I also applied different variants of Smote like SMOTE, SMOTE-NC, KmeansSMOTE, AdasysMOTE, BorderlineSMOTE. KmeansSMOTE also gave me very good result but due some compatibility issue of kmeansSmote which uses sklearn version 0.20 only and MICE imputation required newer version of sklearn version, so I switched kmeans smote to SVMSmote

In [16]:
svm_smote = SVMSMOTE(sampling_strategy='minority', random_state=42, k_neighbors=5)
X_svm_smote, y_svm_smote = svm_smote.fit_resample(X,y)

X_train_svm, X_test_svm, y_train_svm, y_test_svm = tts(X_svm_smote,y_svm_smote, test_size=0.25, random_state=42)

sc = StandardScaler()
X_train_svm = sc.fit_transform(X_train_svm)
X_test_svm = sc.transform(X_test_svm)

In [17]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    errors = abs(y_pred - y_test)
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print(classification_report(y_test,y_pred))
    print(confusion_matrix(y_test,y_pred))
    print('Recall Score = ',recall_score(y_test, y_pred))
    print('Precision Score = ',precision_score(y_test, y_pred))
    print('F1 score = ', f1_score(y_test,y_pred))

    return evaluate

In [18]:
def train_auc_roc_curve(model, X_test, y_test, X_train, y_train):
  base_fpr,base_tpr,base_threshold = roc_curve(y_train, model.predict(X_train))
  plt.plot([0,1])
  plt.plot(base_fpr,base_tpr)
  print("auc score :",auc(base_fpr,base_tpr))
  

  return train_auc_roc_curve

Applied EasyEnsembleClassifier of imblearn

In [19]:
easy_lgbm = EasyEnsembleClassifier(base_estimator= LGBMClassifier(random_state=42), n_estimators=250, n_jobs=1,
                       random_state=42, replacement=True,
                       sampling_strategy='auto', verbose=0,
                       warm_start=True)
easy_lgbm.fit(X_train_svm, y_train_svm)
evaluate(easy_lgbm, X_train_svm, y_train_svm)

Average Error: 0.0443 degrees.
              precision    recall  f1-score   support

         0.0       0.94      0.97      0.96     10778
         1.0       0.97      0.94      0.95     10793

    accuracy                           0.96     21571
   macro avg       0.96      0.96      0.96     21571
weighted avg       0.96      0.96      0.96     21571

[[10499   279]
 [  677 10116]]
Recall Score =  0.9372741591772445
Precision Score =  0.9731601731601731
F1 score =  0.9548801208231075


<function __main__.evaluate(model, X_test, y_test)>

In [20]:
evaluate(easy_lgbm, X_test_svm, y_test_svm)

Average Error: 0.0560 degrees.
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.95      3603
         1.0       0.96      0.92      0.94      3588

    accuracy                           0.94      7191
   macro avg       0.94      0.94      0.94      7191
weighted avg       0.94      0.94      0.94      7191

[[3472  131]
 [ 272 3316]]
Recall Score =  0.9241917502787068
Precision Score =  0.961995938497244
F1 score =  0.9427149964463397


<function __main__.evaluate(model, X_test, y_test)>

# TEST ON REAL DATA SET

In [21]:
evaluate(easy_lgbm, X_train, y_train)

Average Error: 0.7485 degrees.
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.01     10797
         1.0       0.25      1.00      0.40      3571

    accuracy                           0.25     14368
   macro avg       0.62      0.50      0.20     14368
weighted avg       0.81      0.25      0.11     14368

[[   43 10754]
 [    0  3571]]
Recall Score =  1.0
Precision Score =  0.2492844677137871
F1 score =  0.39908359409924005


<function __main__.evaluate(model, X_test, y_test)>

In [22]:
evaluate(easy_lgbm, X_test, y_test)

Average Error: 0.7443 degrees.
              precision    recall  f1-score   support

         0.0       1.00      0.01      0.01      3584
         1.0       0.25      1.00      0.40      1206

    accuracy                           0.26      4790
   macro avg       0.63      0.50      0.21      4790
weighted avg       0.81      0.26      0.11      4790

[[  19 3565]
 [   0 1206]]
Recall Score =  1.0
Precision Score =  0.2527771955564871
F1 score =  0.4035469298979421


<function __main__.evaluate(model, X_test, y_test)>

Predicting the f1 score of both train and test(validation) and printing the probablity of prediction

In [23]:
# print(f1_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
# print(f1_score(y_test_svm, easy_lgbm.predict(X_test_svm)))

# predict_proba_easy_lgbm = pd.DataFrame(easy_lgbm.predict_proba(X_test_svm))
# predict_proba_easy_lgbm

print(f1_score(y_train, easy_lgbm.predict(X_train)))
print(f1_score(y_test, easy_lgbm.predict(X_test)))

0.39908359409924005
0.4035469298979421


I used eli5 library to find out feature importance.

In [24]:
# eli5_permutation = PermutationImportance(estimator = easy_lgbm, scoring = 'f1', random_state=42, n_iter = 5)
# eli5_permutation.fit(X_test_svm, y_test_svm)

In [25]:
# eli5_permutation.feature_importances_.T.reshape(-1,1)

This shows the lowest and highest importance of every feature

In [26]:
# eli5.show_weights(eli5_permutation, feature_names = X.columns.to_list())

In [27]:
# feature_importance_with_eli5=pd.DataFrame(np.hstack((np.array([X.columns[0:]]).T, eli5_permutation.feature_importances_.T.reshape(-1,1))), columns=['feature', 'importance'])
# feature_importance_with_eli5['importance']=pd.to_numeric(feature_importance_with_eli5['importance'])
# feature_importance_with_eli5.sort_values(by='importance', ascending=False)

Gender is most important factor to understand whether he or she will change the job or not, followed by City Development Index and Company Type

In [28]:
# plt.figure(figsize = (15,8))
# plt.xticks(fontsize=15)
# plt.yticks(fontsize=15)
# # We sort by importance and get the features
# sns.barplot(x = 'importance', y = 'feature', data = feature_importance_with_eli5, 
#             order = feature_importance_with_eli5.sort_values('importance', ascending=False).feature) 

This is a very good score

In [29]:
# train_auc_roc_curve(easy_lgbm, X_test_svm, y_test_svm, X_train_svm, y_train_svm)

Test Data

In [30]:
# missing_values = ["n/a", "na", "--", "NONE", "None", "none", "NA", "N/A",'inf','-inf', '?', 'Null', 'NULL']
# test_data = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv', na_values= missing_values)
# test_data.drop(['enrollee_id', 'city'], 1, inplace=True)
# test_data.head()

In [31]:
# test_data.isnull().sum()

In [32]:
# test_data['company_size'] = test_data['company_size'].replace('10/49', np.nan)
# test_data['company_size'].value_counts()

In [33]:
# to_LabelEncode_test = test_data[['gender', 'relevent_experience',
#        'enrolled_university', 'education_level', 'major_discipline',
#        'experience', 'company_size', 'company_type', 'last_new_job']]

# test_temp = to_LabelEncode_test.astype("str").apply(le.fit_transform)
# test_final = test_temp.where(~to_LabelEncode_test.isna(), to_LabelEncode_test)

In [34]:
# test_data.drop(['gender', 'relevent_experience','enrolled_university', 'education_level', 'major_discipline','experience', 'company_size', 'company_type', 'last_new_job'],1,inplace=True)

In [35]:
# test_data = test_final.join(test_data)

In [36]:
# test_final_df = mice_imputer.fit_transform(test_data)

# test_final_df = pd.DataFrame(test_final_df)
# test_final_df.columns = ['gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline',
#                                                          'experience', 'company_size', 'company_type', 'last_new_job', 'city_development_index', 'training_hours']
                                                        
# test_final_df

In [37]:
# test_final_df = sc.transform(test_final_df)

In [38]:
# prediction = pd.DataFrame(easy_lgbm.predict(test_final_df))
# prediction.value_counts()

In [39]:
# threshold = 0.5
# my_pred = np.where(prediction>threshold,'Will join the company','Will not join the company')

# my_pred = my_pred.T.reshape(-1,1)
# my_pred = pd.DataFrame(my_pred, columns=['Decision'])
# my_pred

In [40]:
# my_pred = my_pred.join(pd.DataFrame(easy_lgbm.predict_proba(test_final_df)), lsuffix='_right', rsuffix='_left')
# my_pred = my_pred.rename({0 : 'Probablity of not joining', 1 : 'Probablity of joining'}, axis=1)
# my_pred

That's it, the project is completed.

What I have done:
1. Loaded Libraries and train data
2. Deleted the unwanted columns.
3. Cleaned some Human Error
4. Label Encoded the data
5. Missing value Imputation via MICE technique
6. Checked for Class Imbalance
7. Splitted data into X and y, Standardized it.
8. Applied SVMSmote and solved class imbalance issue.
9. Applied Easy Ensemble Classifier Model with base estimator as Default LGBMClassifier of Imblearn package
10. Checked the feature importance according to the model using eli5 library
11. Finalized the Easy Ensemble Classifier model with base estimator as Default LGBMClassifier.
12. Predicted on Test Data. 