# HR Analytics

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn import preprocessing
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report,accuracy_score
from matplotlib import pyplot
from xgboost import plot_importance
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV from the Kaggle input directory into the raw frame `df`
# and display its first rows.
df = pd.read_csv("/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
df.head()

# Data Profiling

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Replace null values with 'Unknown' for analysis purposes, so that missing entries show up as their own category in the plots below

In [None]:
# Plotting copy of the data: every missing entry becomes the literal "Unknown".
# The raw frame `df` is left untouched.
df1 = df.fillna(value="Unknown")
# Per-column count of remaining nulls in the copy.
df1.isna().sum()

# Data Analysis

In [None]:
# Target counts split by gender; each bar is labelled with its share of ALL rows in df.
ax = sns.countplot(data=df1, x="target", hue="gender")
total = float(len(df))

ax.set_title("looking for Job change or not ?")
ax.set_xlabel("looking for job change")

for bar in ax.patches:
    bar_height = bar.get_height()
    right_edge = bar.get_x() + bar.get_width()
    # Anchor the label at the bar's top-right corner.
    ax.annotate(f"{100 * bar_height / total:.1f}%", (right_edge, bar_height), ha='right')
plt.show()


**Observation:**
* Around 25% of people are looking for a job change
* Males who are looking for a job change make up 15.7% of all candidates
* Around 75% of people are currently not looking for a job change

In [None]:
# Target counts split by relevant experience; labels are shares of ALL rows in df.
ax = sns.countplot(data=df1, x="target", hue="relevent_experience")
total = float(len(df))

ax.set_title("looking for Job change or not ?")
ax.set_xlabel("looking for job change")

for bar in ax.patches:
    bar_height = bar.get_height()
    right_edge = bar.get_x() + bar.get_width()
    share_label = f"{100 * bar_height / total:.1f}%"
    ax.annotate(share_label, (right_edge, bar_height), ha='right')
plt.show()


Observation:
* Candidates who are looking for a job change and have relevant experience make up 15.5% of all candidates.

In [None]:
# Education level of the candidates with target == 1; labels are shares of that subgroup.
job_seekers = df1[df1['target'] == 1]
ax = sns.countplot(data=job_seekers, x="education_level")
total_1 = float(len(df[df['target'] == 1]))
ax.set_title("Educational level of persons looking for job change")
ax.set_xlabel("Educational level")

for bar in ax.patches:
    bar_height = bar.get_height()
    right_edge = bar.get_x() + bar.get_width()
    ax.annotate(f"{100 * bar_height / total_1:.1f}%", (right_edge, bar_height), ha='right')
plt.show()


Observation:
* Graduates form the largest group among the people looking for a job change.

In [None]:
# Training-hours distribution for each target class.
box_ax = sns.boxplot(data=df1, x="target", y="training_hours", palette="Set1")
box_ax.set_title("Distribution of training hours")
box_ax.set_xlabel("looking for job change")

**Observation:**
*  The median training hours completed is almost the same for people who are looking for a job change and those who are not. This suggests that training hours have little impact here

In [None]:
# City-development-index distribution for each target class.
box_ax = sns.boxplot(data=df1, x="target", y="city_development_index", palette="Set1")
box_ax.set_title("Distribution of city development index")
box_ax.set_xlabel("looking for job change")

**Observation**
* The median city development index is lower for people who are looking for a job change. So, people from cities with a higher development index are less likely to look for a job change

In [None]:
# Share of each major discipline among the candidates with target == 1.
major_discipline = df1.loc[df1['target'] == 1, 'major_discipline']
values = major_discipline.value_counts()
labels = values.index
bar, ax = plt.subplots(figsize=(8, 8))
ax.pie(values, labels=labels, autopct="%.2f%%", pctdistance=0.8)
ax.set_title('looking for job change (major discipline wise)', fontsize=20)

**Observation**:
* Around 80% of the people looking for a job change hold a STEM degree. So, it might be possible that STEM degree holders are more likely to look for a job change

In [None]:
# University enrollment of the candidates with target == 1; labels are shares of that subgroup.
job_seekers = df1[df1['target'] == 1]
ax = sns.countplot(data=job_seekers, x="enrolled_university")
total_1 = float(len(df[df['target'] == 1]))
ax.set_title("University enrollment status of persons looking for job change")
ax.set_xlabel("University enrollment status")

for bar in ax.patches:
    bar_height = bar.get_height()
    right_edge = bar.get_x() + bar.get_width()
    ax.annotate(f"{100 * bar_height / total_1:.1f}%", (right_edge, bar_height), ha='right')
plt.show()

**Observation:**
* Around 37% of the people looking for a job change are enrolled in a university course, so higher studies may be their reason for leaving. Around 61% of them haven't enrolled in any educational course, so they might be leaving their job for other reasons such as salary issues, job satisfaction, etc.

In [None]:
# `last_new_job` of the candidates with target == 1; labels are shares of that subgroup.
job_seekers = df1[df1['target'] == 1]
ax = sns.countplot(data=job_seekers, x="last_new_job")
total_1 = float(len(df[df['target'] == 1]))
ax.set_title("Difference in years between current job and previous job")
ax.set_xlabel("Difference in years")

for bar in ax.patches:
    bar_height = bar.get_height()
    right_edge = bar.get_x() + bar.get_width()
    ax.annotate(f"{100 * bar_height / total_1:.1f}%", (right_edge, bar_height), ha='right')
plt.show()

**Observation:**
* Around 44.5% of the people who are looking for a job change have only one year between their current job and their previous job. One possible reason could be that they do not like the work culture of the company

In [None]:
# Work experience of the candidates with target == 1; labels are shares of that subgroup.
job_seekers = df1[df1['target'] == 1]
ax = sns.countplot(data=job_seekers, x="experience")
total_1 = float(len(df[df['target'] == 1]))
ax.set_title("Work experience")
ax.set_xlabel("Work experience")

for bar in ax.patches:
    bar_height = bar.get_height()
    right_edge = bar.get_x() + bar.get_width()
    # Labels in this plot are centred on the bar's right edge.
    ax.annotate(f"{100 * bar_height / total_1:.1f}%", (right_edge, bar_height), ha='center')
plt.show()

**Observation**
* People with 2, 3, 4, 5 and >20 years of experience are more likely to look for a job change

# Data preparation and encoding

In [None]:
# Percentage of missing values per column (the mean of the null mask is the null fraction).
df.isna().mean() * 100

* Replace the missing values with the mode for columns that have more than 5% missing values

In [None]:
# Mode imputation: fill each listed column's nulls with that column's most frequent value.
for column in ('gender', 'major_discipline', 'company_size', 'company_type'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
# Drop every row that still contains a null in any column.
# inplace=True mutates `df` itself, so the raw frame loaded earlier is no longer available.
df.dropna(inplace=True)

In [None]:
# Missing-value count per column after imputation and row removal.
df.isnull().sum()

In [None]:
# Frame dimensions (rows, columns) after rows with missing values were dropped.
df.shape

In [None]:
# Encoding experience column
# Ordinal mapping of the string labels to integers:
# '<1' -> 0, '1'..'20' -> 1..20, '>20' -> 21.
experience_mapper = {
    '<1': 0,
    **{str(years): years for years in range(1, 21)},
    '>20': 21,
}
# Assign the column directly instead of through `df.loc[:, 'experience'] = ...`.
# On pandas >= 2.0 the `.loc[:, col]` form writes into the existing object-dtype
# column, so the mapped integers would stay dtype `object` and the downstream
# model would not see a numeric feature. Direct assignment replaces the column
# with the numeric result of `map`.
# Note: this cell expects the original string labels; re-running it on the
# already-encoded column maps every value to NaN.
df['experience'] = df['experience'].map(experience_mapper)

In [None]:
# Encoding rest of the columns
# `last_new_job` is ordinal, so encode it with an explicit order.
# LabelEncoder sorts labels lexicographically (which would rank 'never' above
# '>4') and is intended for target labels rather than input features.
# NOTE(review): the label set below is what this dataset is expected to contain;
# the check underneath fails loudly if any other label shows up.
last_new_job_mapper = {'never': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5}
if not pd.api.types.is_numeric_dtype(df['last_new_job']):
    # Only encode raw string labels, so re-running the cell is harmless.
    last_new_job_encoded = df['last_new_job'].map(last_new_job_mapper)
    unmapped = df.loc[last_new_job_encoded.isna(), 'last_new_job'].unique()
    if len(unmapped):
        raise ValueError(f"Unexpected last_new_job values: {list(unmapped)}")
    df['last_new_job'] = last_new_job_encoded.astype(int)

# One-hot encode the nominal columns; drop_first removes one level per column
# so the dummy columns are not perfectly collinear.
df2 = pd.get_dummies(
    df,
    columns=['gender', 'relevent_experience', 'enrolled_university', 'education_level',
             'major_discipline', 'company_size', 'company_type'],
    drop_first=True,
)
df2.head()

In [None]:
# Remove the identifier and city columns, which are not used as model features.
df2 = df2.drop(columns=['enrollee_id', 'city'])
df2.head()

In [None]:
# Check for class imbalance: number of rows for each value of `target`.
df2.groupby('target')['target'].count()

**Observation:**
* The dataset is imbalanced. Let's use the SMOTE technique to resolve this issue


In [None]:
# Separate the feature matrix from the label column.
X = df2.drop(['target'], axis=1)
y = df2['target']

# Oversample the minority class with SMOTE.
# `fit_resample` is the supported method; the older `fit_sample` alias has been
# removed from imbalanced-learn and raises AttributeError on current releases.
# NOTE(review): resampling happens before the later train/test split, so
# synthetic samples can end up in the test set and inflate the scores. Consider
# splitting first and resampling only the training portion.
sm = SMOTE(random_state=42)
X_smote, y_smote = sm.fit_resample(X, y)

# Class counts after resampling.
count_class = Counter(y_smote)
print(count_class)

In [None]:
# The dummy column 'company_size_<10' contains '<' in its name, so give it a plain name.
X_smote = X_smote.rename(columns={'company_size_<10': 'company_size_10'})

# Model Selection and Building

Comparison between SVM and XGBoost

In [None]:
# Candidate estimators, each paired with the hyperparameter grid to search.
models = {
    'SVM': {
        'model': svm.SVC(gamma='auto', C=5, kernel='rbf'),
        'params': {'C': [1, 5, 10]},
    },
    'xgboost': {
        'model': xgb.XGBClassifier(),
        'params': {
            'max_depth': [4, 6, 8],
            'gamma': [0.5, 1, 2, 5],
        },
    },
}

In [None]:
# Hyperparameter tuning: run a 2-fold grid search for every candidate model and
# record its best cross-validated score together with the winning parameters.
scores = []

for model_name, spec in models.items():
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=2,
        return_train_score=False,
    )
    clf.fit(X_smote, y_smote)
    result_row = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(result_row)

# One row per model, for side-by-side comparison.
df_model = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df_model

**Observation**
* XGBoost performs better than SVM in this case

In [None]:
# Hold out 30% of the resampled data for testing; random_state fixes the shuffle.
# NOTE(review): the split is made on data that SMOTE has already oversampled, so
# synthetic samples can land in the test set — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_smote,y_smote,test_size=0.3, random_state=42)

In [None]:
# Train XGBoost with fixed hyperparameters (gamma=0.5, max_depth=8), then report
# precision / recall / F1 on the held-out split.
model = xgb.XGBClassifier(max_depth=8, gamma=0.5)
model.fit(X_train, y_train)
ypred = model.predict(X_test)
report = classification_report(y_test, ypred)
print(report)

In [None]:
# Accuracy of the predictions on the held-out test split, expressed as a percentage.
print("Xgboost model accuracy - ", accuracy_score(y_test,ypred)*100)

In [None]:
# Plot the fitted model's feature importances on a 10x7 inch figure.
plt.rc('figure', figsize=(10, 7))
plot_importance(model)
plt.show()