In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found there.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the dataset

In [None]:
# Read the training CSV into the working DataFrame and preview its first rows.
train_csv_path = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
df = pd.read_csv(train_csv_path)
df.head()

## Checking null values

In [None]:
# Number of missing values in each column (isna is the pandas alias of isnull).
df.isna().sum()

Many columns contain null values. One option would be to drop columns such as gender, company_size and company_type, but that would not reflect a real-world problem, because real data always contains some null values. Instead, we will fill them with "None".

## Exploring the target

In [None]:
# How many rows fall into each target class.
df['target'].value_counts()

Since the target classes are imbalanced, we will use the ROC AUC score as the metric rather than plain accuracy.

## Data type of each column

In [None]:
# Show the pandas dtype of every column in the DataFrame.
df.dtypes

## Let us make some graphs to understand things better

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Per-city count of the rows with target == 1, drawn as vertical bars.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='city', data=positive_rows, orient='v', ax=ax);

We can see that the count for one city is much higher than for the other cities.

In [None]:
# Gender distribution over the whole dataset.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(x='gender', data=df, ax=ax);

In [None]:
# Gender distribution restricted to the rows with target == 1.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='gender', data=positive_rows, ax=ax);

We can see that males outnumber every other gender category, and they also make up a high proportion of those leaving their job.

In [None]:
# Count of target == 1 rows for each value of relevent_experience.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='relevent_experience', data=positive_rows, ax=ax);

The number of people searching for a job with relevant experience is much larger than the number searching without it.

In [None]:
# Count of target == 1 rows for each value of enrolled_university.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='enrolled_university', data=positive_rows, ax=ax);

People with no university enrollment form a much larger group among those who are now searching for a job change.

In [None]:
# Count of target == 1 rows for each value of education_level.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='education_level', data=positive_rows, ax=ax);

The two features education_level and enrolled_university are closely interrelated, so we are going to drop one of them. Since education_level has more NA values than enrolled_university, we will drop education_level.

In [None]:
# Remove the education_level feature from the working DataFrame.
df = df.drop(columns=['education_level'])

In [None]:
# Count of target == 1 rows for each value of major_discipline.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='major_discipline', data=positive_rows, ax=ax);

We find that people with a professional degree are more likely to search for a new job.

In [None]:
# Count of target == 1 rows for each value of experience.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='experience', data=positive_rows, ax=ax);

We will drop the experience column, as we already have the relevant experience column (relevent_experience) and will use that instead.

In [None]:
# Remove the experience feature from the working DataFrame.
df = df.drop(columns=['experience'])

In [None]:
# Count of target == 1 rows for each raw company_size band.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='company_size', data=positive_rows, ax=ax);

People working in startups are much more likely to change jobs than people working in corporates.

Let us create a new column that divides this into four categories: small (startups), medium, big, and "None" for companies whose size is unknown.

In [None]:
# Collapse the raw company-size bands into three coarse buckets.
# Keys are the raw `company_size` strings, values the new bucket label.
# NOTE(review): '10/49' is kept verbatim from the original mapping; presumably
# that is how the dataset spells the 10-49 band. Verify against the data.
company_size_buckets = {
    '<10': 'small',
    '10/49': 'small',
    '50-99': 'small',
    '100-500': 'medium',
    '500-999': 'medium',
    '1000-4999': 'big',
    '5000-9999': 'big',
    '10000+': 'big',
}

# One vectorised lookup replaces the per-row loop with chained-indexing
# assignment (`df[col][mask] = ...`), which triggers SettingWithCopyWarning and
# silently does nothing under pandas Copy-on-Write. Values missing from the
# mapping (including NaN) map to NaN and then receive the "None" placeholder.
df['company_size_new'] = df['company_size'].map(company_size_buckets).fillna("None")

# The raw band column is superseded by the bucketed one.
df = df.drop(['company_size'], axis=1)

In [None]:
# Count of target == 1 rows for each bucketed company size.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='company_size_new', data=positive_rows, ax=ax);

Now this gives a much clearer picture.

In [None]:
# Count of target == 1 rows for each company_type.
fig, ax = plt.subplots(figsize=(10, 10))
positive_rows = df.loc[df['target'] == 1]
sns.countplot(x='company_type', data=positive_rows, ax=ax);

People are more likely to leave their job in Pvt Ltd companies.

In [None]:
# Distribution of last_new_job over the whole dataset.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(x='last_new_job', data=df, ax=ax);

## Building the model

In [None]:
# Every column except the identifier and the label is used as a model feature.
non_feature_columns = ('enrollee_id', 'target')
columns = [name for name in df.columns if name not in non_feature_columns]

# Features that are used as-is; all other features are label-encoded later.
numerical_columns = ['city_development_index', 'training_hours']

In [None]:
from sklearn.preprocessing import LabelEncoder

We will use label encoding instead of one-hot encoding.

In [None]:
# Label-encode every non-numerical feature column in place.
for col in columns:
    if col in numerical_columns:
        continue
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal string 'nan', after which fillna("None") has nothing left to fill.
    df[col] = df[col].fillna("None").astype(str)
    # A fresh encoder per column, so each column gets its own integer codes.
    lbl = LabelEncoder()
    df[col] = lbl.fit_transform(df[col])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Feature matrix and label vector as plain NumPy arrays.
X = df[columns].values
y = df['target'].values

# Hold out 20% of the rows for evaluation, stratified on the label so both
# splits keep the same class ratio. random_state is fixed (42 is an arbitrary
# seed) so the split, and the score reported below, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Random forest of 400 trees with depth capped at 7. random_state is fixed
# (42 is an arbitrary seed) so refitting on the same data gives the same model.
rf = RandomForestClassifier(
    criterion='gini',
    max_depth=7,
    n_estimators=400,
    random_state=42,
)
rf.fit(X_train, y_train)

In [None]:
# Keep only the second probability column returned by predict_proba
# (presumably the target == 1 class; confirm against rf.classes_).
class_probabilities = rf.predict_proba(X_test)
y_pred = class_probabilities[:, 1]

In [None]:
# Report the ROC AUC on the held-out split, rounded to three decimals.
auc = roc_auc_score(y_test, y_pred)
print(f'ROC AUC SCORE: {auc:.3f}')

# This is a good score, as we have not overfitted and have not excluded too much training data. We could add more and more complexity to the model, but it would hardly improve the result.