In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into one stream of full paths, then print them one per line
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

from sklearn.metrics import roc_auc_score

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import imblearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OrdinalEncoder

from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold

In [None]:
#load the training file (df_data) and the file to score for submission (df_submit)
df_data, df_submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv',
        '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv',
    )
)


#  **Exploratory Data Analysis**

In [None]:
#checking data content
# Preview the first rows to see the raw column layout and value formats
df_data.head()

In [None]:
#Inspect data column type and null values
# verbose=True asks for the full per-column summary
df_data.info(verbose=True)

In [None]:
#statistics for numerical columns
# Transposed so that each feature is one row of the summary table
df_data.describe().T

In [None]:
#display none numeric columns
obj_col= df_data.select_dtypes(include='object').columns
obj_col

In [None]:
#view none numeric columns unique elements and count
for obj in obj_col:
    lent= len(df_data[obj].unique())
    print('The unique values in the {} column are below and count is {}'.format(obj,lent))
    print(df_data[obj].unique())
    print()

In [None]:
#check count null values per column
# One number per column: how many rows are missing a value there
df_data.isnull().sum()

**Visualization**

In [None]:
#plot missing data: one heatmap row per record, one column per feature
fig, ax = plt.subplots(figsize=(8,10))
sns.heatmap(df_data.isnull(), cbar=False, ax=ax)
ax.set_title('Missing data per column heatmap');

In [None]:
#Target and Gender distribution visualization
print(df_data.groupby(['gender','target'])['target'].count())
df_data.gender.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Gender distribution');

In [None]:
#Target and Major Discipline distribution visualization
print(df_data.groupby(['major_discipline','target'])['target'].count())
df_data.major_discipline.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Major Discipline Distribution');

In [None]:
#Company size distribution visualization
df_data.company_size.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Company Size Distribution');

In [None]:
#Company size  and Target distribution visualization
plt.figure(figsize=(10,6))
sns.countplot(data=df_data, x="company_size", hue='target')
plt.title('Company Size Distribution with Target Label');

In [None]:
#Company type distribution visualization
df_data.company_type.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Company Type Distribution');

In [None]:
#Company type with Target distribution visualization
print(df_data.groupby(['company_type','target'])['target'].count())
plt.figure(figsize=(8,6))
sns.countplot(data=df_data, x="company_type", hue='target');

In [None]:
#enrolled university distribution visualization
df_data.enrolled_university.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Enrolled University Distribution');

In [None]:
#education_level distribution visualization
df_data.education_level.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Education_Level Distribution');

In [None]:
#last_new_job distribution visualization
df_data.last_new_job.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Last New Job Distribution');

In [None]:
#experience distribution visualization
df_data.experience.value_counts(normalize=False,dropna=False).plot(kind='bar')
plt.title('Experience Distribution');

In [None]:
#city distribution visualization
plt.figure(figsize=(16,6))
df_data.city.value_counts(normalize=False,dropna=False).plot(kind='bar')
plt.title('city Distribution');

In [None]:
#Target distribution visualization
df_data.target.value_counts(normalize=True,dropna=False).plot(kind='bar')
plt.title('Target Distribution');

This is an imbalanced dataset: the two target classes are far from equally represented.

In [None]:
#separate the label from the features, then hold out a stratified 25% test split
y = df_data['target']
X = df_data.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y
)

**Data Cleaning And Imputation**

In [None]:
# Work on copies so the original split objects are never modified by the cleaning steps
df_X_train, df_y_train = X_train.copy(), y_train.copy()

In [None]:
#function to clean dataset 
## map some none numerical columns to numeric 

def dataclean(dataframe):
    """Return a cleaned copy of ``dataframe`` with categorical columns encoded as numbers.

    The input frame is left untouched.

    * ``enrollee_id`` is dropped.
    * ``experience``, ``last_new_job`` and ``company_size`` are converted to
      float64 after their open-ended / range labels are replaced by numbers.
    * ``gender``, ``enrolled_university``, ``relevent_experience``,
      ``education_level``, ``major_discipline`` and ``company_type`` are mapped
      to integer codes; a label without a code becomes NaN.
    * Missing values stay NaN; all other columns (e.g. ``city``) pass through.

    Parameters
    ----------
    dataframe : DataFrame with the raw columns listed above.

    Returns
    -------
    A new DataFrame with the same rows and the encoded columns.
    """
    # drop() returns a new frame, so the caller's data is never modified
    data = dataframe.drop(columns='enrollee_id')

    # Labels rewritten as digit strings so that each column parses cleanly as a number
    numeric_labels = {
        'experience': {'>20': '21', '<1': '0'},
        'last_new_job': {'>4': '5', 'never': '0'},
        'company_size': {'50-99': '3', '<10': '1', '10000+': '8', '5000-9999': '7',
                         '1000-4999': '6', '10/49': '2', '100-500': '4', '500-999': '5'},
    }
    for column, labels in numeric_labels.items():
        # Assign the result back instead of Series.replace(..., inplace=True) on
        # data.<column>: the in-place form acts on a temporary Series and is not
        # guaranteed to update the frame (pandas warns, and ignores it under copy-on-write)
        data[column] = data[column].replace(labels).astype('float64')

    # Integer codes for the remaining categorical columns
    category_codes = {
        'gender': {'Male': 0, 'Female': 1, 'Other': 2},
        'enrolled_university': {'no_enrollment': 0, 'Full time course': 1, 'Part time course': 2},
        'relevent_experience': {'Has relevent experience': 0, 'No relevent experience': 1},
        'education_level': {'Primary School': 1, 'High School': 2, 'Graduate': 3,
                            'Masters': 4, 'Phd': 5},
        'major_discipline': {'STEM': 0, 'Business Degree': 1, 'Arts': 2, 'Humanities': 3,
                             'No Major': 4, 'Other': 5},
        'company_type': {'Pvt Ltd': 0, 'Funded Startup': 1, 'Early Stage Startup': 2,
                         'Other': 3, 'Public Sector': 4, 'NGO': 5},
    }
    for column, codes in category_codes.items():
        data[column] = data[column].map(codes)

    return data

In [None]:
# Clean/encode the training features; df_X_train itself is not modified
df_X_train1 = dataclean(df_X_train)

In [None]:
# Check the resulting dtypes and non-null counts after cleaning
df_X_train1.info()

**The only non-numeric column is the 'city' column.**

In [None]:
#visualization correlation between features
#plot correlation of features
plt.figure(figsize=(10,8))
#put the target next to the cleaned features (aligned on the shared row index)
df_cor= pd.concat([df_X_train1,df_y_train],axis=1)

#'city' is still text here; DataFrame.corr() in pandas >= 2.0 raises on non-numeric
#columns instead of skipping them, so keep only the numeric columns explicitly
sns.heatmap(df_cor.select_dtypes(include='number').corr(),annot=True);

In [None]:
#columns whose gaps are filled with the column's most frequent value
col_simple = ['enrolled_university', 'education_level', 'major_discipline',
              'experience', 'last_new_job']
imputer_simple = SimpleImputer(strategy='most_frequent')

In [None]:
#columns whose gaps are estimated from the 3 nearest neighbours
imputer_knn = KNNImputer(n_neighbors=3)
col_knn = ['gender', 'company_size', 'company_type']

In [None]:
#column to ordinal-encode: 'city' is the only feature still stored as text
enc = OrdinalEncoder()
col_enc = ['city']

In [None]:
#create a transformer
# Each (transformer, columns) pair is applied only to the columns it lists.
# remainder='passthrough' keeps every column that is not listed, unchanged.
# The column-name list rebuilt later (col_train) relies on the output order:
# transformer outputs in the order given here, then the passthrough columns.
imputer_trans = make_column_transformer(
                (imputer_simple,col_simple),
                (imputer_knn,col_knn),
                (enc,col_enc),
                remainder = 'passthrough'
)

In [None]:
#fit transformer
# Fit on the cleaned training features and transform them in one step
impute_train =imputer_trans.fit_transform(df_X_train1)

In [None]:
# Peek at the first two transformed rows
impute_train[:2]

In [None]:
# Show the fitted transformers together with the columns each one handled
imputer_trans.transformers_

In [None]:
#pass through feature in transformer
col_passthrough = ['city_development_index','relevent_experience','training_hours']

In [None]:
#recreate column names
col_train = col_simple + col_knn + col_enc + col_passthrough
col_train

In [None]:
#create dataframe of imputed features
# Re-attach the column names (col_train) to the transformed training array
df_impute = pd.DataFrame(impute_train, columns=col_train)

In [None]:
# Check the dtypes and non-null counts of the imputed frame
df_impute.info()

In [None]:
#KNN imputation averages neighbours, so round those columns to the nearest whole category code
df_impute[col_knn] = df_impute[col_knn].round(0)

In [None]:
# Summary statistics of the imputed training features, one feature per row
df_impute.describe().T

In [None]:
# Copy of the imputed training features; this is the frame used in the modelling section
df_train_2 = df_impute.copy()

**Prepare testing dataset**

In [None]:
# Work on copies so the original test split objects stay untouched
df_X_test, df_y_test = X_test.copy(), y_test.copy()

In [None]:
#clean and impute test dataset
df_X_test_1 = dataclean(df_X_test)
#a city that never occurs in the training split has no ordinal code, so substitute
#the most frequent training city before encoding
train_cities = df_X_train1['city']
df_X_test_1['city'] = df_X_test_1['city'].where(df_X_test_1['city'].isin(train_cities),
                                                train_cities.mode()[0])
#learn imputation values and city codes from the training split only, then apply them;
#re-fitting on the test split would give the same city a different code than the model saw
impute_test = imputer_trans.fit(df_X_train1).transform(df_X_test_1)
df_impute_test = pd.DataFrame(impute_test, columns=col_train)

In [None]:
#round up float imputation to integers
df_impute_test[['gender','company_size','company_type']] =df_impute_test[['gender',
                                                                          'company_size', 
                                                                          'company_type']].round(0)

In [None]:
df_impute_test.info()

# Modeling

In [None]:
#initialize oversampling for minority class
#random_state fixes which minority rows get duplicated, so reruns give the same data
oversample = imblearn.over_sampling.RandomOverSampler(sampling_strategy='minority',
                                                      random_state=0)

In [None]:
#fit oversampling
X_train_over, y_train_over= oversample.fit_resample(df_train_2, y_train)

In [None]:
y_train_over.value_counts()

In [None]:
#create train and validation dataset
#split the original (not yet oversampled) training rows first: oversampling duplicates
#minority rows, and splitting afterwards would put copies of the same row in both
#sets, which makes the validation scores look better than they really are
X_train_part, X_val, y_train_part, y_val = train_test_split(df_train_2, df_y_train,
                                                            test_size=0.25, random_state=0,
                                                            stratify=df_y_train)
#balance only the part used for fitting; the validation set keeps the real class ratio
X_train, y_train = oversample.fit_resample(X_train_part, y_train_part)

In [None]:
#function to measures models
def evalmode(model,features, target):
    """Print accuracy, ROC AUC, a classification report and the confusion matrix.

    ``model`` must be a fitted classifier exposing ``score``, ``predict`` and
    ``predict_proba``; ``features`` and ``target`` are the data to evaluate on.
    Nothing is returned, the metrics are only printed.
    """
    predicted_labels = model.predict(features)
    # Column 1 of predict_proba is used as the score for the AUC
    positive_scores = model.predict_proba(features)[:, 1]
    accuracy = model.score(features, target)
    auc_value = roc_auc_score(target, positive_scores)

    print(" The model Accuracy is {:.4f}\n".format(accuracy))
    print(" The model AUC score is {:.4f}\n".format(auc_value))
    print(classification_report(target, predicted_labels))
    print ("\nConfusion Matrix")
    print (confusion_matrix(target, predicted_labels))

In [None]:
#initialize model
#seeded so the baseline forest and its scores are the same on every run
model_rf = RandomForestClassifier(random_state=0)

In [None]:
#fit model
# X_train / y_train here are the names re-bound by the train/validation split cell above,
# not the original split made before cleaning
model_rf.fit(X_train,y_train)

In [None]:
#evaluate model with validation data
evalmode(model_rf,X_val,y_val)

In [None]:
#evaluate model with test data
# df_impute_test is the cleaned and imputed hold-out split; df_y_test holds its labels
evalmode(model_rf,df_impute_test,df_y_test)

Results on the test dataset are noticeably worse than on the validation dataset, which suggests overfitting.
The metrics for target class 1 are particularly poor.
Hence hyperparameter tuning is needed.

Results on the test dataset are noticeably worse than on the validation dataset, which suggests overfitting.
The metrics for target class 1 are particularly poor.
Hence hyperparameter tuning is needed.

**Hyperparameter tuning**

In [None]:
# Base estimator handed to the grid search; it carries a fixed random_state
model_rf2 = RandomForestClassifier( random_state=1)

In [None]:
#candidate values for each tuned hyperparameter (every combination is tried)
param_grid = dict(
    max_depth=[12, 14, 10],
    max_leaf_nodes=[40, 30, 60],
    n_estimators=[400, 200],
    min_samples_split=[5, 7],
)
#candidates are ranked by ROC AUC over 10 shuffled, stratified folds
scoring = 'roc_auc'
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [None]:
#initialize grid search
#pass the splitter itself rather than skf.split(...): that call returns a one-shot
#generator which is exhausted after the first fit, so re-running the fit cell alone
#would find no folds. With the splitter, the same seeded folds are produced on every fit.
#refit=False: only the best parameters are wanted; the final model is trained separately
search_rf2 = GridSearchCV(estimator=model_rf2, cv=skf,
                                param_grid=param_grid,scoring=scoring,refit=False)

In [None]:
# Run the cross-validated grid search on the training data
search_rf2.fit(X_train,y_train)

In [None]:
#display best hyperparameters
search_rf2.best_params_

In [None]:
#load best hyperparameters and train
#keep random_state=1, the seed of the estimator used during the search, so the final
#model is reproducible and matches the configuration that was actually scored
model_rf_best = RandomForestClassifier(random_state=1, **search_rf2.best_params_)
model_rf_best.fit(X_train,y_train)

In [None]:
#evaluate model with validation data
# Same report as for the baseline, now for the tuned forest
evalmode(model_rf_best,X_val,y_val)

In [None]:
#evaluate model with test data
evalmode(model_rf_best,df_impute_test,df_y_test)

In [None]:
#plot feature importances of the tuned forest, smallest first so the largest bar ends up on top
importances = model_rf_best.feature_importances_
ascending_order = np.argsort(importances)
plt.figure(figsize=(8,10))
plt.barh(np.array(col_train)[ascending_order], importances[ascending_order]);


**Submit test dataset**

In [None]:
# Copy of the submission data; its enrollee_id column is reused when the submission file is built
df_submit_1 = df_submit.copy()

In [None]:
#clean and impute submission dataset
df_submit_2 = dataclean(df_submit_1)
#a city that never occurs in the training split has no ordinal code, so substitute
#the most frequent training city before encoding
train_cities = df_X_train1['city']
df_submit_2['city'] = df_submit_2['city'].where(df_submit_2['city'].isin(train_cities),
                                                train_cities.mode()[0])
#learn imputation values and city codes from the training split only, then apply them;
#re-fitting on the submission data would give the same city a different code than the model saw
impute_df_submit_2 = imputer_trans.fit(df_X_train1).transform(df_submit_2)
df_clean_submit_2 = pd.DataFrame(impute_df_submit_2, columns=col_train)

In [None]:
#KNN imputation averages neighbours, so round those columns to the nearest whole category code
df_clean_submit_2[col_knn] = df_clean_submit_2[col_knn].round(0)

In [None]:
pred_sub_logpro =model_rf.predict_proba(df_clean_submit_2)

In [None]:
pred_sub_logpro[:4]

In [None]:
# Pair every enrollee_id with the predicted probability of class 1
submit = pd.DataFrame({
    'enrollee_id': df_submit_1['enrollee_id'],
    'target': pred_sub_logpro[:, 1],
})
submit.head()

In [None]:
#index=False: without it the row index is written as an extra unnamed first column
submit.to_csv('submission.csv', index=False)