In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly as py
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Both CSV files live in the same Kaggle dataset folder; read each into a DataFrame.
DATA_DIR = '../input/hr-analytics-job-change-of-data-scientists'
test = pd.read_csv(DATA_DIR + '/aug_test.csv')
train = pd.read_csv(DATA_DIR + '/aug_train.csv')

First we are going to check our training and testing datasets:

In [None]:
train.head()

In [None]:
test.head()

Features:

* enrollee_id : Unique ID for candidate
* city: City code
* city_development_index: Development index of the city (scaled)
* gender: Gender of candidate
* relevent_experience: Relevant experience of candidate
* enrolled_university: Type of University course enrolled if any
* education_level: Education level of candidate
* major_discipline :Education major discipline of candidate
* experience: Candidate total experience in years
* company_size: No of employees in current employer's company
* company_type : Type of current employer
* last_new_job: Difference in years between previous job and current job
* training_hours: training hours completed
* target: 0 – Not looking for job change, 1 – Looking for a job change

In [None]:
# Report the dimensions of each data set.
for label, frame in (("training", train), ("testing", test)):
    n_rows, n_cols = frame.shape
    print(f"The {label} data set has {n_rows} rows and {n_cols} columns")

# Data Cleaning ==================================================

Let's check how many NaN values we have and how to deal with them:

In [None]:
(train.isna().sum()/train.shape[0])*100

In [None]:
(test.isna().sum()/test.shape[0])*100

We have many NaN values in both the training and test datasets. We are going to check the columns with NaN values one by one to see how we should deal with them.
I am going to start with the columns that have the fewest NaN values:

In [None]:
print ("we have {} number of 'NAN' values in experinece column".format (train.experience.isna().sum()))

In [None]:
# get the 'nan' values from experience column
train[train.experience.isna()]

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between `experience` and every column of `train`.
for column in train.columns:
    contingency = pd.crosstab(train['experience'], train[column])
    result = chi2_contingency(contingency)
    statistic, p_value = round(result[0], 3), round(result[1], 4)
    print(f'{column} ===> Chi2 Statistic: {statistic}, p-value: {p_value}')

Based on the p-values, every feature except enrollee_id and training_hours appears to be associated with experience. Hence, the chi-square test does not point to a single feature we could use to impute it. We are going to replace the 'nan' values with '<1', as the experience is most probably zero.

In [None]:
# Fill missing `experience` with '<1' in both frames. The result is assigned back to the
# column instead of calling fillna(inplace=True) on an attribute-accessed Series, which
# is not guaranteed to update the parent DataFrame.
train['experience'] = train['experience'].fillna('<1')
test['experience'] = test['experience'].fillna('<1')
# Report the remaining nulls in the order the message names them: test first, then train.
print("Null values of test and train data in experience column are {} and {}, respectively".format(test.experience.isna().sum(), train.experience.isna().sum()))

In [None]:
train.experience.unique()

One last thing before moving to another column: the experience column has the 'object' data type, but we need it as a number, so let's change the data type to 'int'. The only problem is the '>20' and '<1' values. We are going to replace them with '21' and '0', respectively, and then change the data type to 'int'.

In [None]:
test.experience.unique()

In [None]:
# Map the open-ended labels to numbers ('>20' -> 21, '<1' -> 0) and store the numeric
# column back in each frame. A column-level replace avoids chained assignment, and the
# pd.to_numeric result is assigned instead of being discarded.
EXPERIENCE_LABELS = {'>20': '21', '<1': '0'}
train['experience'] = pd.to_numeric(train['experience'].replace(EXPERIENCE_LABELS))
test['experience'] = pd.to_numeric(test['experience'].replace(EXPERIENCE_LABELS))
# Show the resulting dtypes to confirm the conversion.
train['experience'].dtype, test['experience'].dtype

Now it is time to fill the 'NAN' values of 'enrolled_university' column:

In [None]:
# Count the missing values in `enrolled_university`; the message names the column being counted.
print ("we have {} number of 'NAN' values in enrolled_university column".format (train.enrolled_university.isna().sum()))

In [None]:
train.education_level.unique()

In [None]:
train[train.education_level.isna()]

We know that if "enrolled_university" is "no_enrollment", then the education can be 'Primary School' or 'High School'. Now lets see which one is most dominant:

In [None]:
# Count the not-enrolled candidates at each of the two school-level education values.
a=train[(train.enrolled_university=='no_enrollment') & (train.education_level=='High School')].shape [0]
b=train[(train.enrolled_university=='no_enrollment') & (train.education_level=='Primary School')].shape [0]
print("number of no_enrollment with High School: {}".format(a))
# `b` is the Primary School count, so it is labelled accordingly.
print("number of no_enrollment with Primary School: {}".format(b))

So, it is more likely to have "High School" than "Primary School" education for no_enrollment.

In [None]:
# Impute education_level = "High School" where the candidate is not enrolled and the level
# is missing. A single .loc assignment replaces the chained `frame.col[mask] = value` form,
# which triggers SettingWithCopyWarning and does not write through under pandas
# Copy-on-Write. The rule is applied to `test` as well so both frames are cleaned the same way.
for frame in (train, test):
    missing_level = (frame.enrolled_university == 'no_enrollment') & frame.education_level.isna()
    frame.loc[missing_level, 'education_level'] = "High School"

In [None]:
train[train.education_level.isna()]

In [None]:
train.enrolled_university.unique()

If we have "Full time course" or "Part time course" in "enrolled_university" column, it means we may have "education_level" of ['Phd', 'Graduate', or 'Masters' ]

In [None]:
# Count the candidates in a full-time course at each degree level.
full_time = train.enrolled_university == 'Full time course'
a, b, c = (
    train[full_time & (train.education_level == level)].shape[0]
    for level in ('Masters', 'Phd', 'Graduate')
)
print(f"number of 'Full time course' with Masters degree: {a}")
print(f"number of 'Full time course' with PhD degree: {b}")
print(f"number of 'Full time course' with Graduate degree: {c}")

In [None]:
# Count the candidates in a part-time course at each degree level.
part_time = train.enrolled_university == 'Part time course'
a, b, c = (
    train[part_time & (train.education_level == level)].shape[0]
    for level in ('Masters', 'Phd', 'Graduate')
)
print(f"number of 'Part time course' with Masters degree: {a}")
print(f"number of 'Part time course' with PhD degree: {b}")
print(f"number of 'Part time course' with Graduate degree: {c}")

In [None]:
# Impute education_level = "Graduate" for enrolled candidates whose level is missing.
# Both course types are covered; previously the 'Full time course' rule was written twice
# and 'Part time course' was never handled. .loc replaces the chained assignment, which
# does not reliably write back to the DataFrame.
ENROLLED_COURSES = ['Full time course', 'Part time course']
for frame in (train, test):
    missing_level = frame.enrolled_university.isin(ENROLLED_COURSES) & frame.education_level.isna()
    frame.loc[missing_level, 'education_level'] = "Graduate"

We are going to drop the rest of the 'NAN' values for "education_level":

In [None]:
# Discard, in place, every row whose education_level is still missing.
train.dropna(subset=['education_level'], inplace=True)
test.dropna(subset=['education_level'], inplace=True)

Now, lets check "enrolled_university" column:

In [None]:
train.enrolled_university.unique()

In [None]:
sns.countplot(x='enrolled_university', hue='education_level', data=train)

As can be seen from the countplot, the most common term in the enrolled_university column is "no_enrollment" regardless of education level. Hence, we are going to replace 'NAN' values with "no_enrollment".

In [None]:
# Replace missing enrolled_university values with "no_enrollment" in both frames.
# Assigning the filled column back avoids chained assignment (`frame.col[mask] = value`),
# which does not reliably write to the DataFrame.
train['enrolled_university'] = train['enrolled_university'].fillna("no_enrollment")
test['enrolled_university'] = test['enrolled_university'].fillna("no_enrollment")

Now trying to replace "NAN" values of "last_new_job" column:

In [None]:
train.last_new_job.unique()

There is a good chance that, for someone with no relevant experience, the difference in years between the previous job and the current job is 'never'.

In [None]:
# Set last_new_job = 'never' where it is missing and the candidate has no relevant
# experience. .loc performs the write in one step; the chained `frame.col[mask] = value`
# form does not reliably update the DataFrame.
for frame in (train, test):
    never_changed = frame.last_new_job.isna() & (frame.relevent_experience == 'No relevent experience')
    frame.loc[never_changed, 'last_new_job'] = 'never'

I am going to drop the remaining 'nan' values.

In [None]:
# Discard, in place, every row whose last_new_job is still missing.
train.dropna(subset=['last_new_job'], inplace=True)
test.dropna(subset=['last_new_job'], inplace=True)

In [None]:
# Map the non-numeric labels to numbers ('>4' -> 5, 'never' -> 0) and store the numeric
# column back in each frame. A column-level replace avoids chained assignment, and the
# pd.to_numeric result is assigned instead of being discarded.
LAST_NEW_JOB_LABELS = {'>4': '5', 'never': '0'}
train['last_new_job'] = pd.to_numeric(train['last_new_job'].replace(LAST_NEW_JOB_LABELS))
test['last_new_job'] = pd.to_numeric(test['last_new_job'].replace(LAST_NEW_JOB_LABELS))
# Show the resulting dtypes to confirm the conversion.
train['last_new_job'].dtype, test['last_new_job'].dtype

Now, we have 4 columns with a high number of 'nan' values: "gender", "major_discipline", "company_size", and "company_type"

In [None]:
(train.isna().sum()/train.shape[0])*100

In [None]:
# Pie chart of gender, with missing values shown as their own 'nan' slice.
train2=train.fillna('nan')
# Take the labels from the value_counts index so each slice is named after the category
# it actually counts, instead of relying on a hard-coded label order.
gender_counts = train2.gender.value_counts()
fig=go.Figure(data=[go.Pie(labels=gender_counts.index, values=gender_counts.values)])
fig.update_traces( hole= 0.3, hoverinfo='label+percent', textinfo='value', textfont_size=20)
fig.show()

As can be seen from the pie plot, we are dealing with extremely unbalanced data. The majority of the candidates are 'Male', while a small percentage are "Female" or "Other". Now, let's check whether this column has a significant effect on the "target" column:

In [None]:
def _target_ratio(mask):
    """Return (# rows with target 1) / (# rows with target 0) among the rows of `train` selected by `mask`."""
    looking = train[mask & (train.target == 1.0)].shape[0]
    not_looking = train[mask & (train.target == 0.0)].shape[0]
    return looking / not_looking


is_male = train.gender == 'Male'
is_missing = train.gender.isna()

M = _target_ratio(is_male)
F = _target_ratio(train.gender == 'Female')
O = _target_ratio(train.gender == 'Other')
N = _target_ratio(is_missing)
# Ratio obtained if every missing gender were counted as 'Male'.
M_and_NAN = _target_ratio(is_male | is_missing)

In [None]:
# Left panel: row counts per gender, split by target.
# Right panel: the target-1 / target-0 ratios (M, F, O, N, M_and_NAN) computed in the previous cell.
fig, axs = plt.subplots(1,2, figsize=(12,5))
sns.countplot(x='gender', hue='target', data=train, ax=axs[0])
sns.barplot(x=['Male', 'Female', 'Other', 'NAN', 'Male+NAN'], y=[M, F, O, N, M_and_NAN], ax=axs[1])

As can be seen from the figures, we have relatively the same portion of target 0 and target 1 for all genders if we add all NAN's to Male. Hence, it is better to replace null values with 'Male'. In general, as can be seen from the barplot, the gender does not have a significant effect on the target and we may want to drop the whole column. 

In [None]:
# Replace missing gender with "Male" in both frames. Assigning the filled column back
# avoids chained assignment, which does not reliably write to the DataFrame.
train['gender'] = train['gender'].fillna("Male")
test['gender'] = test['gender'].fillna("Male")

Now, trying to fill the null values of major_discipline  column:

In [None]:
train.major_discipline.value_counts()

In [None]:
train[train.major_discipline.isna()]

It seems the education level has a significant effect on major_discipline. Obviously, there is no discipline for "High School" and "Primary School". Therefore, null values in this column are not missing values but mean "not applicable". So we are going to replace the null values with "Not Applicable" when the education level is "High School" or "Primary School".

In [None]:
# Mark major_discipline as 'Not Applicable' where it is missing and the education level
# is school-level. .loc performs the write in one step; the chained
# `frame.col[mask] = value` form does not reliably update the DataFrame.
SCHOOL_LEVELS = ['High School', 'Primary School']
for frame in (train, test):
    no_major = frame.major_discipline.isna() & frame.education_level.isin(SCHOOL_LEVELS)
    frame.loc[no_major, 'major_discipline'] = 'Not Applicable'

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='education_level', hue='major_discipline', data=train)

As can be seen from the above graph, we can replace the remaining Null values with STEM as it is most likely.

In [None]:
# Replace the remaining missing major_discipline values with 'STEM' in both frames.
# Assigning the filled column back avoids chained assignment, which does not reliably
# write to the DataFrame.
train['major_discipline'] = train['major_discipline'].fillna('STEM')
test['major_discipline'] = test['major_discipline'].fillna('STEM')

The null values of company type and company size are very challenging to fill. On the other hand, we want to keep these columns for our prediction.

in the almost 28% of the data, we do not have any information about company type and company size. 

In [None]:
train[(train.company_type.isna()) & (train.company_size.isna())].shape[0]/train.shape[0]

In [None]:
train.company_type.value_counts()

In [None]:
# Six stacked panels describing company_type: overall counts, then its breakdown by major,
# education level and relevant experience, its training-hours distribution, and its
# breakdown by company size.
fig, ax =plt.subplots(6,1, figsize=(15,20))
# Pass the column by keyword: a Series given positionally is taken as `data` (not `x`)
# by recent seaborn releases and triggers a warning in older ones.
sns.countplot(x='company_type', data=train, ax=ax[0])
sns.countplot(x='company_type', hue='major_discipline', data=train, ax=ax[1])
sns.countplot(x='company_type', hue='education_level', data=train, ax=ax[2])
sns.countplot(x='company_type', hue='relevent_experience', data=train, ax=ax[3])
sns.boxplot(x='company_type', y='training_hours', data=train, ax=ax[4])
sns.countplot(x='company_type', hue='company_size', data=train, ax=ax[5])

Conclusion: The majority of company_types are "Pvt Ltd". If one's major is not "STEM", he/she most probably is hired in the "Pvt Ltd" companies. "no relevant experience" got a job mainly in either "Pvt Ltd" or "Public Sector".

In [None]:
# Rule-based imputation of missing company_type. The rules are applied in order and each
# one only fills rows that are still missing. The separate "Not Applicable" rule was
# removed because `major_discipline != "STEM"` already covers those rows.
# .loc performs the write in one step; the chained `frame.col[mask] = value` form does
# not reliably update the DataFrame.
COMPANY_TYPE_RULES = [
    (lambda df: df.major_discipline != "STEM", "Pvt Ltd"),
    (lambda df: df.education_level == "Masters", "Pvt Ltd"),
    (lambda df: df.relevent_experience == "No relevent experience", "Public Sector"),
]
for frame in (train, test):
    for condition, company_type in COMPANY_TYPE_RULES:
        frame.loc[condition(frame) & frame.company_type.isna(), 'company_type'] = company_type
# the percentages of null values is reduced from 30% to 8% by these methods
# the percentages of null values is reduced from 30% to 8% by these methods

It is very difficult to guess the remaining null values in company_type; hence, we are going to use the mode to replace the remaining null values. However, we are going to edit them in a new dataset to use in the modeling after visualization. I am also going to drop company_size for now. If you find a better way to fill the null values in these two columns, please let me know in the comments :)

In [None]:
# Copies without company_size and without any row that still contains a missing value.
train2 = train.drop(columns=['company_size']).dropna()
test2 = test.drop(columns=['company_size']).dropna()

In [None]:
train2.shape

In [None]:
test2.shape

# Data Visualization =============================================

We are going to discover the relation btw different features and "target" feature. Lets start from the target data:

In [None]:
# Pie chart of the target distribution. Each slice label is looked up from the target
# value it counts, so the labels stay correct whatever order value_counts() returns.
TARGET_LABELS = {0.0: 'Not looking for job change', 1.0: 'Looking for a job change'}
target_counts = train.target.value_counts()
fig=go.Figure(data=[go.Pie(labels=[TARGET_LABELS[value] for value in target_counts.index], values=target_counts.values)])
fig.update_traces( hole= 0.3, hoverinfo='label+percent', textinfo='value+percent', textfont_size=20)
fig.show()

Almost 1/4 of the employees are looking for a job change. Lets see which groups are mainly trying to change their jobs :)

In [None]:
sns.countplot(x='target', hue='education_level', data=train)

In [None]:
# For each education level: 100 * (# rows with target 1) / (# rows with target 0).
# NOTE(review): `P` is assigned twice below (first for 'Phd', then for 'Primary School');
# the second assignment overwrites the first, so the 'Phd' value is lost after this cell.
P=100*train[(train.education_level=='Phd') & (train.target==1.0)].shape[0]/train[(train.education_level=='Phd') & (train.target==0.0)].shape[0]
G=100*train[(train.education_level=='Graduate') & (train.target==1.0)].shape[0]/train[(train.education_level=='Graduate') & (train.target==0.0)].shape[0]
M=100*train[(train.education_level=='Masters') & (train.target==1.0)].shape[0]/train[(train.education_level=='Masters') & (train.target==0.0)].shape[0]
H=100*train[(train.education_level=='High School') & (train.target==1.0)].shape[0]/train[(train.education_level=='High School') & (train.target==0.0)].shape[0]
P=100*train[(train.education_level=='Primary School') & (train.target==1.0)].shape[0]/train[(train.education_level=='Primary School') & (train.target==0.0)].shape[0]

In [None]:
from plotly.offline import iplot, init_notebook_mode

# Ratio (in percent) of candidates looking for a job change to those not looking, per
# education level. The values are computed here, one per level, rather than read from
# single-letter globals: `P` stood for both 'Phd' and 'Primary School', so the chart
# showed the same bar height for both levels.
education_levels = ['Phd', 'Graduate','Masters','High School','Primary School' ]
education_ratio = []
for level in education_levels:
    in_level = train.education_level == level
    looking = (in_level & (train.target == 1.0)).sum()
    not_looking = (in_level & (train.target == 0.0)).sum()
    education_ratio.append(100 * looking / not_looking)

figu=go.Figure(data=go.Bar( x = education_levels,
                y = education_ratio,
                marker = dict(color = 'rgba(255, 174, 255, 0.5)',
                             line=dict(color='rgb(0,0,0)',width=1.5))),
              # The title states the ratio in the direction it is computed (target 1 over target 0).
              layout=dict(title = "The Percentage of ['looking for a job' / 'not looking for a job']",
              yaxis= dict(title= 'Percentage',ticklen= 5,zeroline= False)
             ))
iplot(figu)

It seems "Graduate"s are more likely to look for a new job compared to other education levels. "Phd"s are less likely to change their job (look for new job), maybe they are already making a good money :)

In [None]:
# Notched box plot of the city development index for each target class.
cdi_box = go.Box(
    x=train['target'],
    y=train['city_development_index'],
    notched=True,
    fillcolor='rgba(0,255,0,0.5)',
)
cdi_layout = dict(
    title="City development index vs. Target",
    yaxis=dict(title='City development index', ticklen=5, zeroline=False),
    xaxis=dict(title='Target', ticklen=5, zeroline=False),
)
figu = go.Figure(data=cdi_box, layout=cdi_layout)

iplot(figu)

Employees in cities with a higher development index tend not to change their jobs.

In [None]:
def _gender_ratio(gender):
    """Return (# rows with target 1) / (# rows with target 0) among the rows of `train` with this gender."""
    of_gender = train.gender == gender
    looking = train[of_gender & (train.target == 1.0)].shape[0]
    not_looking = train[of_gender & (train.target == 0.0)].shape[0]
    return looking / not_looking


M, F, O = (_gender_ratio(gender) for gender in ('Male', 'Female', 'Other'))

# Left: gender counts within each target class; right: the ratio for each gender.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
sns.countplot(x='target', hue='gender', data=train, ax=ax[0])
sns.barplot(x=['Male', 'Female', 'Other'], y=[M, F, O], ax=ax[1])

As previously talked, gender does not have a significant effect on target. All genders have btw 30-35% tendency to change their jobs.

In [None]:
train.major_discipline.unique()

In [None]:
values=['STEM', 'Business Degree', 'Not Applicable', 'Arts', 'Humanities','No Major', 'Other']
# For each discipline: (# rows with target 1) / (# rows with target 0).
M = [
    train[(train.major_discipline == discipline) & (train.target == 1.0)].shape[0]
    / train[(train.major_discipline == discipline) & (train.target == 0.0)].shape[0]
    for discipline in values
]
discipline_bar = go.Bar(
    x=values,
    y=M,
    marker=dict(color='rgba(355, 50, 55, 1.0)', line=dict(color='rgb(0,0,0)', width=1.5)),
)
discipline_layout = dict(
    title='Likelihood of seeking new job vs. major discipline',
    yaxis=dict(title='Likelihood of seeking a new job', ticklen=5, zeroline=False),
)
figu = go.Figure(data=discipline_bar, layout=discipline_layout)


iplot(figu)

Those who do not have any discipline, Arts, and Humanities are less likely to seek for a new job compared to other disciplines.

In [None]:
train.enrolled_university.unique()

In [None]:
values=['no_enrollment', 'Full time course', 'Part time course']
# For each enrollment status: (# rows with target 1) / (# rows with target 0).
E = [
    train[(train.enrolled_university == status) & (train.target == 1.0)].shape[0]
    / train[(train.enrolled_university == status) & (train.target == 0.0)].shape[0]
    for status in values
]
enrollment_bar = go.Bar(
    x=values,
    y=E,
    marker=dict(color='rgba(55, 50, 100, 0.5)', line=dict(color='rgb(0,0,0)', width=1.5)),
)
enrollment_layout = dict(
    title='Likelihood of seeking new job vs. enrolled_university',
    yaxis=dict(title='Likelihood of seeking a new job', ticklen=5, zeroline=False),
)
figu = go.Figure(data=enrollment_bar, layout=enrollment_layout)


iplot(figu)

Those who are enrolled in a "Full time course" at university are more likely to seek a new job. Why?!

In [None]:
train.relevent_experience.unique()

In [None]:
# For each relevant-experience value: (# rows with target 1) / (# rows with target 0).
R=[]
values=['Has relevent experience', 'No relevent experience']
for items in values:
    R.append(train[(train.relevent_experience==items) & (train.target==1.0)].shape[0]/train[(train.relevent_experience==items) & (train.target==0.0)].shape[0])
# Plot R, the ratios computed above. The chart previously plotted `E`, the
# enrolled_university ratios left over from an earlier cell.
figu=go.Figure(data=(go.Bar(x=values, y=R,
                           marker=dict(color = 'rgba(55, 50, 200, 0.5)',
                             line=dict(color='rgb(0,0,0)',width=1.5)))),
              layout=dict(title= 'Likelihood of seeking new job vs. relevent_experience',yaxis= dict(title= 'Likelihood of seeking a new job',ticklen= 5,zeroline= False)))


iplot(figu)

Those who do not have any relevant experience look for a new job more often. Maybe they did not know much about the job beforehand and are not very interested in it.

In [None]:
# Notched box plot of years of experience for each target class.
# The column is converted with pd.to_numeric for plotting so the y axis is a numeric
# scale even when `experience` is still stored as strings.
figu = go.Figure(data=go.Box(x=train['target'], y=pd.to_numeric(train['experience']),
                         notched=True,
                         fillcolor='rgba(0,255,0,0.5)'),
                 layout=dict(title = "experience vs. Target",
                 yaxis= dict(title= 'experience',ticklen= 5,zeroline= False),
                 xaxis= dict(title= 'Target',ticklen= 5,zeroline= False)
                            ))

iplot(figu)

# Feature Engineering

In [None]:
train.head()

First we are going to edit city column. We are going to remove "city_"

In [None]:
# Remove the literal "city_" prefix from the city codes. str.strip('city_') treats its
# argument as a set of characters to trim from both ends, not as a prefix, so it only
# works by coincidence when the remainder contains none of those characters.
train.city=train.city.str.replace('city_', '', regex=False)
test.city=test.city.str.replace('city_', '', regex=False)

Now, going to convert categorical data:

In [None]:
# Replace relevent_experience with its dummy encoding (first category dropped) in both frames.
train_experience_dummies = pd.get_dummies(train['relevent_experience'], drop_first=True)
train = pd.concat([train.drop(columns=['relevent_experience']), train_experience_dummies], axis=1)

test_experience_dummies = pd.get_dummies(test['relevent_experience'], drop_first=True)
test = pd.concat([test.drop(columns=['relevent_experience']), test_experience_dummies], axis=1)

In [None]:
train.shape

In [None]:
train.isna().sum()

Since we have 'Other' in gender column, we are going to change "Other" in major_discipline to "Another"

In [None]:
# Rename the 'Other' major to 'Another' so it cannot collide with the 'Other' gender once
# both columns are one-hot encoded. The rename is applied to `test` as well so both frames
# end up with the same column names, and the column is assigned back instead of using
# chained assignment.
train['major_discipline'] = train['major_discipline'].replace('Other', 'Another')
test['major_discipline'] = test['major_discipline'].replace('Other', 'Another')

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode each categorical column of `train` and append the indicator columns.
Cat_c=['gender', 'enrolled_university','major_discipline', 'education_level']
for items in Cat_c:
    le=OneHotEncoder()
    t=le.fit_transform(train[[items]]).toarray()
    # Name the columns from the encoder's own category list and reuse train's index.
    # `train` has had rows dropped, so its index has gaps; a frame built with a fresh
    # 0..n-1 index would be matched to the wrong rows and the inner join would discard
    # every row without a matching label.
    new=pd.DataFrame(t, columns=le.categories_[0], index=train.index)
    train=pd.concat([train, new], axis=1, join='inner')

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode each categorical column of `test` and append the indicator columns.
Cat_c=['gender', 'enrolled_university','major_discipline', 'education_level']
for items in Cat_c:
    le=OneHotEncoder()
    t=le.fit_transform(test[[items]]).toarray()
    # Name the columns from the encoder's own category list and reuse test's index.
    # `test` has had rows dropped, so its index has gaps; a frame built with a fresh
    # 0..n-1 index would be matched to the wrong rows and the inner join would discard
    # every row without a matching label.
    new=pd.DataFrame(t, columns=le.categories_[0], index=test.index)
    test=pd.concat([test, new], axis=1, join='inner')

In [None]:
train.columns

In [None]:
# Remove, in place, the raw categorical columns and the two company columns from both frames.
COLUMNS_TO_DROP = ['gender', 'enrolled_university','major_discipline','company_size','company_type', 'education_level']
for frame in (train, test):
    frame.drop(columns=COLUMNS_TO_DROP, inplace=True)

In [None]:
train.head()

In [None]:
test.head()

Now it is time to model :)

In [None]:
train.columns

In [None]:
# Separate the label column from the feature matrix.
y = train['target']
X = train.drop(columns=['target'])

In [None]:
test.last_new_job.value_counts()

# Model Section

In [None]:
from sklearn.model_selection import train_test_split

# Cast the three label-derived columns to numbers, then hold out 30% of the rows for evaluation.
for numeric_column in ('city', 'experience', 'last_new_job'):
    X[numeric_column] = pd.to_numeric(X[numeric_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with default settings, scored by accuracy on the hold-out split.
model = LogisticRegression().fit(X_train, y_train)
prediction = model.predict(X_test)
accuracy_score(y_test, prediction)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Decision tree scored by accuracy on the hold-out split. random_state is fixed (the same
# seed as the train/test split) so the reported accuracy is reproducible across runs.
model=DecisionTreeClassifier(random_state=42)
model.fit(X_train,y_train)
prediction=model.predict(X_test)
# Arguments follow scikit-learn's (y_true, y_pred) order.
accuracy_score(y_test, prediction)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Compare hold-out accuracy for several forest sizes. The seed is fixed so that differences
# between rows come from n_estimators rather than from run-to-run randomness.
n_est=[50, 100, 150, 200, 250]
for items in n_est:
    model=RandomForestClassifier(n_estimators=items, random_state=42)
    model.fit(X_train,y_train)
    prediction=model.predict(X_test)
    # Arguments follow scikit-learn's (y_true, y_pred) order.
    print('{} : {}'.format(items, accuracy_score(y_test, prediction)))

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with default settings, scored by accuracy on the hold-out split.
model = XGBClassifier()
model.fit(X_train, y_train)
prediction = model.predict(X_test)
accuracy_score(y_test, prediction)

In [None]:
from sklearn.metrics import confusion_matrix
# confusion_matrix expects (y_true, y_pred): rows are the true classes and columns the
# predicted classes. Passing the arguments the other way round transposes the matrix.
confusion_matrix(y_test, prediction)

1. Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit on all labelled rows, then predict the test set.
model=LogisticRegression()
model.fit(X,y)
# Give the test frame exactly the columns (names and order) the model was fitted on:
# indicator columns missing from `test` are added as 0 and extra ones are left out.
# The label-derived columns are cast to numbers, as was done for X.
test_features = test.reindex(columns=X.columns, fill_value=0)
for numeric_column in ('city', 'experience', 'last_new_job'):
    test_features[numeric_column] = pd.to_numeric(test_features[numeric_column])
prediction=model.predict(test_features)
output=pd.DataFrame({'enrollee_id':test.enrollee_id, 'target':prediction})
output
# To save the predictions: output.to_csv('submission.csv', index=False)

2. Decision Tree:

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit on all labelled rows, then predict the test set. The seed is fixed so the
# predictions are reproducible.
model=DecisionTreeClassifier(random_state=42)
model.fit(X,y)
# Give the test frame exactly the columns (names and order) the model was fitted on:
# indicator columns missing from `test` are added as 0 and extra ones are left out.
# The label-derived columns are cast to numbers, as was done for X.
test_features = test.reindex(columns=X.columns, fill_value=0)
for numeric_column in ('city', 'experience', 'last_new_job'):
    test_features[numeric_column] = pd.to_numeric(test_features[numeric_column])
prediction=model.predict(test_features)
output=pd.DataFrame({'enrollee_id':test.enrollee_id, 'target':prediction})
output

3. RandomForest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit on all labelled rows, then predict the test set. The seed is fixed so the
# predictions are reproducible.
model=RandomForestClassifier(n_estimators=250, random_state=42)
model.fit(X,y)
# Give the test frame exactly the columns (names and order) the model was fitted on:
# indicator columns missing from `test` are added as 0 and extra ones are left out.
# The label-derived columns are cast to numbers, as was done for X.
test_features = test.reindex(columns=X.columns, fill_value=0)
for numeric_column in ('city', 'experience', 'last_new_job'):
    test_features[numeric_column] = pd.to_numeric(test_features[numeric_column])
prediction=model.predict(test_features)
output=pd.DataFrame({'enrollee_id':test.enrollee_id, 'target':prediction})
output