# HR analytics: job change of data scientists

Exploring which features are good predictors of people changing jobs, and building some baseline models to predict it.

# import and data loading

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Read the training CSV into `df` and preview its first rows.
df = pd.read_csv(
    '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
)
df.head()

# Exploring the data

Describing the data so we can see what we have on our hands. Observations:

- 25% of users are looking to change jobs. This is low, and we will need to consider it when training models: a model could predict 0 for everything and still reach 75% accuracy.
- 19,158 samples
- Training hours and city development index appear to be the only numeric features.

In [None]:
# Summary statistics (count, mean, std, quartiles) for the frame's numeric columns.
df.describe()

## Investigating numeric features

- Training hours appear to be slightly higher for target == 0.
- We also see that the mean and median city development index is slightly higher for target == 0.

In the city development index there appear to be two distributions layered on top of one another. Clustering the two distributions could possibly create a useful feature.


### training hours

In [None]:
# Histogram of training hours over all rows.
f, ax = plt.subplots()
df['training_hours'].hist(ax=ax)

# Box plot of training hours with one box per target value, on a separate figure.
f, ax = plt.subplots()
df.loc[:, ['training_hours', 'target']].boxplot(by='target', ax=ax)

In [None]:
# Mean and median ('50%') of training hours per target value, drawn as horizontal bars.
training_hours_summary = df.groupby(['target'])['training_hours'].describe()
training_hours_summary[['mean', '50%']].T.plot(kind='barh')

### City Development Index

In [None]:
# Histogram of the city development index over all rows.
f, ax = plt.subplots()
df['city_development_index'].hist(ax=ax)

# Box plot of the index with one box per target value, on a separate figure.
f, ax = plt.subplots()
df.loc[:, ['city_development_index', 'target']].boxplot(by='target', ax=ax)

In [None]:
# Mean and median ('50%') of the city development index per target value, as horizontal bars.
city_index_summary = df.groupby(['target'])['city_development_index'].describe()
city_index_summary[['mean', '50%']].T.plot(kind='barh')

In [None]:
# One histogram of the city development index per target value.
# Grouping by the scalar key (rather than a one-element list) yields scalar group
# keys, which are used to title each figure so the plots can be told apart.
for target_value, group in df.groupby('target'):
    f, ax = plt.subplots()
    group['city_development_index'].hist(ax=ax)
    ax.set_title('city_development_index (target == {})'.format(target_value))
    ax.set_xlabel('city_development_index')

# Understanding the categorical data

In [None]:
# Names of the columns treated as categorical features in the cells that follow.
# The spelling 'relevent_experience' is intentional: it is used as a column key.
categorical_columns = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]

In [None]:
# First, let's see how many distinct categories each categorical feature has
# (nunique leaves missing values out of the count by default).
df.loc[:, categorical_columns].nunique()

## Exploring Experience

There appears to be some correlation between experience level and the target: people with more experience are more likely to have target == 0, while people with less experience are more likely to have target == 1.

In [None]:
# Cast the `experience` labels to integers.
#   - labels containing '<1'  -> 0
#   - labels containing '>20' -> 21 (takes precedence, applied last)
#   - every other non-missing label is parsed as a number (unparsable text raises)
#   - missing values get an explicit sentinel instead of being silently folded
#     into the 21 bucket, which is what happens when the NaN returned by
#     str.contains is used directly as an np.where condition.
MISSING_EXPERIENCE = -1

experience_is_under_1 = df.experience.str.contains('<1', na=False)
experience_is_over_20 = df.experience.str.contains('>20', na=False)

# Blank out the two special labels before parsing so only plain numbers are parsed.
experience_years = pd.to_numeric(
    df.experience.where(~(experience_is_under_1 | experience_is_over_20))
)
experience_years = (
    experience_years
    .mask(experience_is_under_1, 0)
    .mask(experience_is_over_20, 21)
)

df['experience_int'] = experience_years.fillna(MISSING_EXPERIENCE).astype('int')

In [None]:
def plot_cat_comp(category, data=None):
    """Plot how each value of ``category`` is represented within each target class.

    For every (target, category value) pair, the number of unique ``enrollee_id``
    values is divided by the number of unique ``enrollee_id`` values in that target
    class. The resulting shares are drawn as horizontal bars, one bar colour per
    target value.

    Parameters
    ----------
    category : str
        Name of the column to break down. Missing values in this column are
        shown as the literal category ``'nan'``.
    data : pandas.DataFrame, optional
        Frame containing ``target``, ``enrollee_id`` and ``category``. Defaults to
        the notebook-level ``df`` when omitted.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the bars were drawn on.
    """
    frame = df if data is None else data

    # Only the grouping column needs its missing values labelled; filling the whole
    # frame with a string would needlessly touch unrelated columns.
    labelled = frame.assign(**{category: frame[category].fillna('nan')})

    per_value = (
        labelled
        .groupby(['target', category])
        .agg({'enrollee_id': 'nunique'})
    )
    per_target = frame.groupby('target').agg(total=('enrollee_id', 'nunique'))

    shares = (
        per_value
        .join(per_target)  # aligns on the shared 'target' index level
        .assign(percentage=lambda x: x['enrollee_id'] / x['total'])
        .reset_index()
        .pivot(index=category, columns='target', values='percentage')
    )

    f, ax = plt.subplots()
    ax = shares.plot(kind='barh', ax=ax)
    ax.set_xlabel('share of enrollees within target class')

    return ax

In [None]:
# Compare the spread of the integer-coded experience column between target classes.
plot_cat_comp('experience_int')

## Exploring other categoricals

In [None]:
# Draw the per-target comparison chart for each categorical column and put the
# column name as a left-aligned title on it.
for column_name in categorical_columns:
    ax = plot_cat_comp(column_name)
    ax.set_title(column_name, x=0, ha='left')

# Prepare the data for ML

In [None]:
# Work on a copy so the exploration frame `df` is not modified.
df_process = df.copy()

# Label missing categorical values with the literal string 'nan'.
df_process[categorical_columns] = df_process[categorical_columns].fillna('nan')

# One-hot encode from the filled copy. Encoding `df` here instead would throw away
# the fill above. Missing values now surface as explicit `<column>_nan` indicator
# columns next to the regular category columns.
df_process = pd.get_dummies(
    df_process,
    columns=['company_size', 'relevent_experience', 'company_type', 'education_level'],
)

In [None]:
# Columns fed to the models: the numeric columns followed by the one-hot indicator
# columns, grouped by the column they were derived from. Order is significant.
features = [
    # numeric
    'city_development_index',
    'training_hours',
    'experience_int',
    # company_size indicators
    'company_size_10/49',
    'company_size_100-500',
    'company_size_1000-4999',
    'company_size_10000+',
    'company_size_50-99',
    'company_size_500-999',
    'company_size_5000-9999',
    'company_size_<10',
    # relevent_experience indicators
    'relevent_experience_Has relevent experience',
    'relevent_experience_No relevent experience',
    # company_type indicators
    'company_type_Early Stage Startup',
    'company_type_Funded Startup',
    'company_type_NGO',
    'company_type_Other',
    'company_type_Public Sector',
    'company_type_Pvt Ltd',
    # education_level indicators
    'education_level_Graduate',
    'education_level_High School',
    'education_level_Masters',
    'education_level_Phd',
    'education_level_Primary School',
]


# Model Selection

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (the selected feature columns) and label vector (the target column).
X, y = df_process[features], df_process['target']

In [None]:
# Hold out 33% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import roc_auc_score

In [None]:
# Baseline classifiers, each wrapped in a single-step Pipeline.
# NOTE: every step is labelled "RF" regardless of the estimator it holds. The label
# is kept unchanged so that `named_steps['RF']` / `RF__<param>` lookups keep working.
rf_pipe = Pipeline(steps=[("RF", RandomForestClassifier(random_state=42))])
ada_pipe = Pipeline(steps=[("RF", AdaBoostClassifier(random_state=42, learning_rate=0.7))])
svm_pipe = Pipeline(steps=[("RF", SVC(random_state=42, kernel='rbf'))])
# The decision tree is seeded like the other models so repeated runs give the same tree.
dt_pipe = Pipeline(steps=[('RF', DecisionTreeClassifier(max_depth=10, random_state=42))])

In [None]:
# 5-fold cross-validated F1 scores on the training split for three of the pipelines.
cv_settings = {'cv': 5, 'scoring': 'f1'}
rf_cross_val_scores = cross_val_score(rf_pipe, X_train, y_train, **cv_settings)
ada_f1_cross_val_scores = cross_val_score(ada_pipe, X_train, y_train, **cv_settings)
dt_f1_cross_val_scores = cross_val_score(dt_pipe, X_train, y_train, **cv_settings)


In [None]:
# Fit each baseline on the training split and keep its hard test-set predictions.
rf_pipe.fit(X_train, y_train)
rf_prediction = rf_pipe.predict(X_test)

ada_pipe.fit(X_train, y_train)
ada_prediction = ada_pipe.predict(X_test)

dt_pipe.fit(X_train, y_train)
dt_prediction = dt_pipe.predict(X_test)

# roc_auc_score takes (y_true, y_score): the true labels go first. The score passed
# is the predicted probability of the second class (column 1 of predict_proba), so
# the metric reflects the ranking of samples rather than one fixed threshold.
for model_name, fitted_pipe in (
    ('Random Forest', rf_pipe),
    ('AdaBoost', ada_pipe),
    ('Decision Tree', dt_pipe),
):
    positive_proba = fitted_pipe.predict_proba(X_test)[:, 1]
    print('Area under ROC Score of {} Model On Test Set - {:,.2%}'.format(
        model_name, roc_auc_score(y_test, positive_proba)
    ))


That's an OK baseline. I will start a new notebook to try to improve upon these scores.