# HR Analytics

## Import library and dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.feature_selection import RFECV

In [None]:


# Read the training split, the test split and the submission template from the
# dataset folder under ../input (paths are relative to the notebook's working directory).
train=pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
test=pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
sample_submission=pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv')

In [None]:
# (rows, columns) of the training frame and of the test frame.
train.shape,test.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

The training set has 14 columns; `target` is our y variable.

In [None]:
# Share of each label value (normalize=True returns proportions instead of counts).
train['target'].value_counts(normalize=True)

The target column, which is the response variable we want to predict, is imbalanced: about 75% of the rows are 0 and 25% are 1.

In [None]:
# Storage dtype of every column in the training frame.
train.dtypes

## Category columns

In [None]:
# Columns stored with dtype `object` are treated as the categorical columns.
obj_cols=train.select_dtypes('object').columns
# Every remaining column except the label counts as numeric. The label is excluded
# with an exact comparison: ('target') is just the string 'target', so `c not in ('target')`
# was a substring test that would also have dropped a column named e.g. 'get'.
numeric_cols=[c for c in train.columns if c not in obj_cols and c!='target']

Let's check whether the test set contains any categories that are not present in the training set.

In [None]:
# Report, per categorical column, the values that occur in test but never in train.
# Missing values are dropped first so that NaN is not reported as a "new category".
found_new=False
for c in obj_cols:
    # Difference is computed for the column being inspected (previously it was always 'city').
    only_in_test=list(set(test[c].dropna())-set(train[c].dropna()))
    if only_in_test:
        found_new=True
        print(f"For column {c} Available only in test are {only_in_test}")
# Printed once, and only when no column had unseen categories.
if not found_new:
    print("No instances found")

There are no new categories in the test set. Let's combine train and test for the next section of our analysis.

In [None]:
# Give the test rows a sentinel label of -1 so they can be told apart from the
# training rows later, then stack train on top of test under a fresh 0..n-1 index.
test['target']=-1
frames=[train,test]
data=pd.concat(frames,axis=0,ignore_index=True)

In [None]:
#https://www.kaggle.com/artgor/is-this-malware-eda-fe-and-lgb-updated
# One summary row per categorical column:
#   (name, number of distinct values, % of rows that are null, % of rows taken by the
#    most frequent value, with NaN counted as a value).
n_rows=len(data)
stat_cols=[
    (c,
     data[c].nunique(),
     data[c].isnull().sum()*100/n_rows,
     data[c].value_counts(normalize=True,dropna=False).values[0]*100)
    for c in obj_cols
]
# Build and sort the table once, after all rows are collected (it used to be rebuilt
# on every loop iteration and was undefined when there were no columns).
stat_df=(pd.DataFrame(stat_cols,columns=['column_name','unique_values','null_value_perc','perc_of_max_value'])
         .sort_values('unique_values',ascending=False))
stat_df

The table above summarises the categorical columns. `city` and `relevent_experience` have no null values, whereas `company_type` has 32% null values. `city` and `experience` have the highest cardinality. About 75% of the rows in `major_discipline` share a single category, and `enrolled_university` and `relevent_experience` are similarly dominated by one value. Let's first handle the columns with null values.

## Handling null values

Lets consider both training and test sets for our analysis.

In [None]:
# Proportion of each company_type value; dropna=False keeps NaN as its own row.
data['company_type'].value_counts(normalize=True,dropna=False)

We use a technique described in Abhishek Thakur's book for imputing missing values: NaNs are treated as a separate category of their own.

In [None]:
# Impute every categorical column except the two listed here (the summary table
# above reports them as having no nulls).
complete_cols=('city','relevent_experience')
null_cols=[c for c in obj_cols if c not in complete_cols]
for n in null_cols:
    print(f'Imputing null values in column {n}')
    # Missing entries become their own column-specific category, e.g. 'NONE_gender'.
    filled=data[n].fillna(f'NONE_{n}')
    data.loc[:,n]=filled.astype('str')

Lets check company type column again,

In [None]:
# Same breakdown as before imputation; dropna=False would still show NaN if any remained.
data['company_type'].value_counts(normalize=True,dropna=False)

In [None]:
# Recompute the categorical summary after imputation. One row per column:
#   (name, number of distinct values, % of rows that are null, % of rows taken by the
#    most frequent value, with NaN counted as a value).
n_rows=len(data)
stat_cols=[
    (c,
     data[c].nunique(),
     data[c].isnull().sum()*100/n_rows,
     data[c].value_counts(normalize=True,dropna=False).values[0]*100)
    for c in obj_cols
]
# Build and sort the table once, after all rows are collected (it used to be rebuilt
# on every loop iteration and was undefined when there were no columns).
stat_df=(pd.DataFrame(stat_cols,columns=['column_name','unique_values','null_value_perc','perc_of_max_value'])
         .sort_values('unique_values',ascending=False))
stat_df

## Preparing categorical data for Model Input

For categorical columns with low cardinality (fewer than 5 categories) we use one-hot encoding, while for columns with higher cardinality we use frequency encoding. In order to avoid data leakage, we do the frequency encoding inside our cross-validation setup.

In [None]:
# One-hot encode the low-cardinality columns: the indicator columns (all prefixed
# 'OHE_') are appended on the right and the original column is then removed.
ohe_cols=['relevent_experience','enrolled_university','gender']
for c in ohe_cols:
    indicators=pd.get_dummies(data[c],prefix='OHE')
    data=pd.concat([data,indicators],axis=1)
    print(f'OHE {c}.Now removing original column {c} from df')
    data=data.drop(columns=c)

Thus all null values in the categorical columns are taken care of. Let's now check the numerical columns.

## Numeric columns

In [None]:
# Number of null entries in each numeric column of the combined frame.
data[numeric_cols].isnull().sum()

There are no null values in the numeric columns. From its name, `enrollee_id` is a nominal identifier rather than a continuous variable, so let's change its dtype.

In [None]:
# enrollee_id is an identifier, so store it as `object` rather than as a number.
# Plain column assignment replaces the column; `data.loc[:,col]=...` instead writes the
# values into the existing column and keeps its numeric dtype on recent pandas
# versions, which silently turned this cast into a no-op.
data['enrollee_id']=data['enrollee_id'].astype('object')

Now,we split the data again into train and test.

In [None]:
# Rows labelled -1 are the test rows tagged before concatenation; everything else
# is training data. Both parts get a fresh 0..n-1 index.
is_test=data['target']==-1
train=data.loc[~is_test].reset_index(drop=True)
test=data.loc[is_test].reset_index(drop=True)

In [None]:
# Labels as a NumPy array, features as the training frame without the label column.
y=train['target'].to_numpy()
X=train.drop(columns=['target'])

In [None]:
#Distribution of numeric columns:
# One panel per (column, dataset) pair, laid out as a 2x2 grid: train on the left,
# test on the right. Each tuple is (column, frame, colour, title, x-axis label).
panels=[
    ('city_development_index',train,'darkblue',"Distribution of city development index-Train",'City development index'),
    ('city_development_index',test,'violet',"Distribution of city development index-Test",'City development index'),
    ('training_hours',train,'darkblue',"Distribution of training hours-Train",'Training Hours'),
    ('training_hours',test,'violet',"Distribution of training hours-Test",'Training Hours'),
]
# Create the four axes up front and draw into them explicitly. Mixing
# plt.subplots() with plt.subplot(2,2,k) left the full-figure axes created by
# plt.subplots() underneath the panels on recent matplotlib versions.
fig,axes=plt.subplots(2,2,figsize=(12,10))
for ax,(col,frame,color,title,xlabel) in zip(axes.ravel(),panels):
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept here so
    # the figure looks the same. Consider sns.histplot(..., kde=True) when upgrading.
    sns.distplot(frame[col],color=color,ax=ax)
    ax.set_title(title,fontsize=15)
    ax.set_xlabel(xlabel,fontsize=10)
    ax.set_ylabel('frequency')
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics of training_hours for train and for test, shown as a pair.
train['training_hours'].describe(),test['training_hours'].describe()

* The distribution of the city development index looks similar in the training and test sets. It peaks around 0.9 and 0.6, and ranges from roughly 0.4 to 1.0 in both.
* Training hours is right-skewed with a peak between 0 and 50. Its range is also similar in both sets.

## Building a baseline model

Since the target is imbalanced, we use `StratifiedKFold` for cross-validation.

In [None]:
# Categorical columns that are frequency-encoded.
freq_cols=[
    'city',
    'experience',
    'company_size',
    'major_discipline',
    'company_type',
    'last_new_job',
    'education_level',
]
# Continuous numeric features.
num_cols=[
    'city_development_index',
    'training_hours',
]

In [None]:
# Model features: every column of X except the identifier. An exact comparison is used
# because ('enrollee_id') is just a string, so `c not in ('enrollee_id')` was a substring
# test that would also have dropped a column named e.g. 'id'.
required_cols=[c for c in X.columns if c!='enrollee_id']

In [None]:
# 5-fold stratified splitter; rows are shuffled with a fixed seed so the folds are reproducible.
folds=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
def freq_encode(trn_df,val_df,cols):
    """Frequency-encode `cols` using counts pooled over both frames.

    For each column, every value is replaced by the number of times it occurs in
    `trn_df` and `val_df` combined. Missing values are not counted and stay missing.

    Parameters
    ----------
    trn_df, val_df : pandas.DataFrame
        Frames that both contain every column in `cols`. They are NOT modified
        (earlier versions overwrote the columns of the frames passed in).
    cols : list of str
        Columns to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of `trn_df[cols]` and `val_df[cols]`, in that order, each
        keeping the index of its source frame.
    """
    # Work on copies so the caller's frames keep their raw categories.
    trn_enc=trn_df[cols].copy()
    val_enc=val_df[cols].copy()
    for c in cols:
        counts=pd.concat([trn_df[c],val_df[c]]).value_counts().to_dict()
        trn_enc[c]=trn_df[c].map(counts)
        val_enc[c]=val_df[c].map(counts)
    return trn_enc,val_enc

In [None]:
# Frequency-encode the test rows once, before the fold loop. The raw categories are
# re-read from `data` (test rows carry target == -1), so re-running this cell never
# encodes values that were already encoded. freq_encode returns (first frame, second
# frame); the test encoding is the SECOND one (the first one used to be assigned here).
# NOTE(review): these counts are pooled over train+test, while the fold models below
# see counts pooled over the training rows only - confirm this scale difference is acceptable.
test_raw=data.loc[data['target']==-1,required_cols].reset_index(drop=True)
_,test_enc=freq_encode(X[required_cols],test_raw,freq_cols)
test[freq_cols]=test_enc

n_folds=folds.get_n_splits()
pred_df=np.zeros(len(test))
scores=[]
roc=[]
for i,(trn_idx,val_idx) in enumerate(folds.split(X,y)):
    print(f'***Starting fold {i+1}***')
    # .copy() gives each fold independent frames to write the encoded columns into.
    trn_x,trn_y=X[required_cols].iloc[trn_idx].copy(),y[trn_idx]
    val_x,val_y=X[required_cols].iloc[val_idx].copy(),y[val_idx]
    trn_x[freq_cols],val_x[freq_cols]=freq_encode(trn_x,val_x,freq_cols)
    clf=RandomForestClassifier(n_estimators=1000,oob_score=True,n_jobs=-1,random_state=40,max_features='sqrt')
    clf.fit(trn_x,trn_y)
    preds=clf.predict(val_x)
    score=f1_score(val_y,preds)
    # ROC AUC measures how well the scores rank the rows, so it is computed from the
    # positive-class probability rather than from the thresholded 0/1 predictions.
    roc_score=roc_auc_score(val_y,clf.predict_proba(val_x)[:,1])
    scores.append(score)
    roc.append(roc_score)
    print(f'F1 score for fold {i+1} is {score} ROC score {roc_score}')

    # Accumulate this fold's hard predictions on the (already encoded) test rows.
    test_preds=clf.predict(test[required_cols])
    pred_df+=test_preds
print(f'Average f1 score for {n_folds} folds {np.mean(scores)} .Avg roc score for {n_folds} folds {np.mean(roc)}')
# Fraction of fold models that predicted the positive class for each test row.
pred_df/=n_folds
    

The baseline random forest model is not the best. Let's try to improve on this baseline score.

In [None]:
# Design matrix for feature selection: the model features with the high-cardinality
# columns frequency-encoded.
train_df=X[required_cols].copy()
# Encode from the raw categories: the training ones come from X, the test ones are
# re-read from `data` (test rows carry target == -1). Both results are kept, so
# train_df and test share one pooled mapping and re-running the cell gives the same values.
test_raw=data.loc[data['target']==-1,required_cols].reset_index(drop=True)
train_df[freq_cols],test[freq_cols]=freq_encode(X[required_cols].copy(),test_raw,freq_cols)

In [None]:
##recursive feature elimination with cross validation:
# A random forest is refit on progressively smaller feature sets (5 features dropped
# per step) and each set is scored by ROC AUC on stratified 5-fold splits.
rfe_features=train_df[required_cols]
rfe_splits=StratifiedKFold(n_splits=5,shuffle=True,random_state=42).split(rfe_features,y)
model=RandomForestClassifier(
    n_estimators=1000,
    oob_score=True,
    n_jobs=-1,
    random_state=40,
    max_features='sqrt',
)
rfecv=RFECV(estimator=model,cv=rfe_splits,step=5,scoring='roc_auc',verbose=2)
rfecv.fit(rfe_features,y)


In [None]:
# Number of features kept by the fitted selector.
print(f"Optimal number of features {rfecv.n_features_}")


In [None]:
# Mean cross-validation score of each candidate feature subset. Newer scikit-learn
# exposes it as cv_results_['mean_test_score']; grid_scores_ is only read as a fallback
# for releases that predate cv_results_.
cv_results=getattr(rfecv,'cv_results_',None)
if cv_results is not None:
    mean_scores=np.asarray(cv_results['mean_test_score'])
else:
    mean_scores=np.asarray(rfecv.grid_scores_)

# Subset sizes for the x-axis. Several features are removed per step, so the i-th
# score does NOT correspond to i features (which is what range(1,len+1) assumed).
if cv_results is not None and 'n_features' in cv_results:
    n_feats=np.asarray(cv_results['n_features'])
else:
    # Rebuild the sizes from the selector's settings: start at all features, drop
    # `step` at a time, and always finish at the minimum. Assumes an integer step.
    min_feats=getattr(rfecv,'min_features_to_select',1)
    n_feats=sorted({min_feats}|set(range(rfecv.support_.size,min_feats,-int(rfecv.step))))

plt.figure(figsize=(8,8))
plt.plot(n_feats,mean_scores)
plt.xlabel('Number of features selected')
plt.ylabel('Cross validation score')
plt.show()

In [None]:
# Pair every feature with its elimination rank from the selector; the displayed
# table is ordered from the largest rank down (the stored frame stays unsorted).
ranking=pd.DataFrame({
    'features':required_cols,
    'Rank':np.asarray(rfecv.ranking_),
})
ranking.sort_values(by='Rank',ascending=False)

In [None]:
# Best mean cross-validation score over all candidate feature subsets. Newer
# scikit-learn exposes the means as cv_results_['mean_test_score']; grid_scores_ is
# only read as a fallback for releases that predate cv_results_.
cv_results=getattr(rfecv,'cv_results_',None)
mean_scores=cv_results['mean_test_score'] if cv_results is not None else rfecv.grid_scores_
score=np.max(mean_scores)
score

In [None]:
# Predict a label for every test row with the fitted selector and store it in the
# submission frame. Assumes test[freq_cols] was already frequency-encoded by an earlier cell.
sample_submission['target']=rfecv.predict(test[required_cols])

In [None]:
# Preview the first rows of the filled-in submission frame.
sample_submission.head()

In [None]:
# How many test rows were assigned to each predicted label.
sample_submission['target'].value_counts()

## Reference

1.Artgor's [Malware prediction kernel](https://www.kaggle.com/artgor/is-this-malware-eda-fe-and-lgb-updated)

2.Abhishek Thakur's [Approaching Almost any Machine Learning Problem](https://www.amazon.in/Approaching-Almost-Machine-Learning-Problem-ebook/dp/B089P13QHT/ref=sr_1_1?crid=3VJFCEROX0U8&dchild=1&keywords=approaching+almost+any+machine+learning+problem&qid=1609395728&sprefix=approachin%2Caps%2C317&sr=8-1)

3.[Recursive feature elimination with CV](https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py)

4.[eliminate features recursively](https://www.kaggle.com/tilii7/eliminate-features-recursively-cv)