# Import Libraries

Import all the required libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, log_loss, plot_roc_curve, auc, precision_recall_curve
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training split of the HR-analytics dataset into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
)

In [None]:
# Show the first few rows as a quick sanity check of the loaded data
df.head()

In [None]:
# Report the (rows, columns) dimensions of the training frame
shape_message = 'Shape of train is {}'.format(df.shape)
print(shape_message)

In [None]:
# Print the column summary produced by DataFrame.info()
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

Let's find out what type of people are likely to leave the company.

# Data Visualization

### Which gender is more likely to move for a new job?

In [None]:
# Pass the column through the explicit `x` keyword: newer seaborn releases
# accept only `data` as a positional argument.
sns.countplot(x=df['gender'])

In [None]:
# Gender distribution among the rows whose target equals 1
gender = df.loc[df['target'] == 1, 'gender']
temp = gender.value_counts()
labels = temp.index
bar, ax = plt.subplots(figsize=(8, 8))
plt.pie(
    x=temp,
    labels=labels,
    colors=['blue', 'yellow', 'red'],
    autopct="%.2f%%",
    pctdistance=0.7,
)
plt.title('Gender % looking for new job', fontsize=20)

In [None]:
# Build each boolean mask separately. `&` binds tighter than `==`, so the
# unparenthesised form `(a) & b == 1` is evaluated as `((a) & b) == 1`.
is_male = df['gender'] == 'Male'
is_female = df['gender'] == 'Female'
is_looking = df['target'] == 1

male_newjob = df[is_male & is_looking]
female_newjob = df[is_female & is_looking]

# print
# Divide by the size of each gender group (mask.sum()); len(mask) would be
# the total number of rows regardless of gender.
print('{} % of male who are looking for a new job'.format(len(male_newjob) / is_male.sum() * 100))
print('{} % of female who are looking for a new job'.format(len(female_newjob) / is_female.sum() * 100))

### From which company types are people looking for a new job?

In [None]:
# Count of rows per company type
plt.figure(figsize=(8, 5))
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['company_type'])
plt.show()

From above we can see that most people work in private companies

In [None]:
# Company-type distribution among the rows whose target equals 1
company_type = df.loc[df['target'] == 1, 'company_type']
temp = company_type.value_counts()
labels = temp.index
bar, ax = plt.subplots(figsize=(8, 8))
plt.pie(
    x=temp,
    labels=labels,
    autopct="%.1f%%",
    pctdistance=0.7,
)
plt.title('People leaving company', fontsize=20)

In [None]:
# For every company type, print the share of its people with target == 1.
is_looking = df['target'] == 1
# dropna(): a NaN category never compares equal to anything, so it would
# produce an empty group and a meaningless percentage.
for i in df['company_type'].dropna().unique():
    in_company = df['company_type'] == i
    # Combine two ready-made boolean masks; writing `(a) & b == 1` inline
    # would be parsed as `((a) & b) == 1`.
    company_newjob = df[in_company & is_looking]
    # Denominator is the size of this company type, not the whole frame.
    print('{} % of {} who are looking for a new job'.format(len(company_newjob) / in_company.sum() * 100, i))

So from the above we can see that the largest number of people who are looking for a new job come from private companies.

### Are people with relevant experience looking for a new job?

In [None]:
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['relevent_experience'])

In [None]:
# Bars are grouped by relevant experience and coloured by target.
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['relevent_experience'], hue=df['target'])
# The x axis carries the experience categories (target is the hue), so label
# it with the plotted column rather than 'target'.
plt.xlabel('relevent_experience')
plt.ylabel('count')
plt.title('Relevent experience on the basis of target')

In [None]:
# Build each boolean mask separately. `&` binds tighter than `==`, so the
# unparenthesised form `(a) & b == 1` is evaluated as `((a) & b) == 1`.
has_experience = df['relevent_experience'] == 'Has relevent experience'
no_experience = df['relevent_experience'] == 'No relevent experience'
is_looking = df['target'] == 1

yes_newjob = df[has_experience & is_looking]
no_newjob = df[no_experience & is_looking]

# print
# Divide by the size of each experience group (mask.sum()); len(mask) would
# be the total number of rows.
print('{} % of having relevant experience who are looking for a new job'.format(len(yes_newjob) / has_experience.sum() * 100))
print('{} % of not havinf relevant experience who are looking for a new job'.format(len(no_newjob) / no_experience.sum() * 100))

### Did anyone get into the data science field without having a graduation degree?

In [None]:
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['education_level'])

In [None]:
# A row can hold only one education level, so requiring it to equal both
# 'Primary School' AND 'High School' can never match. Select rows whose level
# is either of the two, and that are not enrolled in any university.
below_graduate = df['education_level'].isin(['Primary School', 'High School'])
not_enrolled = df['enrolled_university'] == "no_enrollment"
people_withoutdegree = df[below_graduate & not_enrolled]
print("People who have got into the data science world without graduation are", len(people_withoutdegree))

So, if the count printed above is 0, there is not a single person who got into this field without graduating.

### Years between last and current job?

In [None]:
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['last_new_job'])

In [None]:
# Pie chart of how often each `last_new_job` value occurs.
fig, ax = plt.subplots(figsize=(10, 10))
count = Counter(df['last_new_job'])
# Pull out only the first slice. Size the explode list from the number of
# distinct values instead of hard-coding 7, so it always matches the data.
explode = [0.05] + [0] * (len(count) - 1)
plt.pie(count.values(), labels=count.keys(), labeldistance=0.75, autopct=lambda p:f'{p:.2f}%',
       explode=explode, shadow=True)
plt.title('Number of years between last and current job', fontsize=20)
plt.show()

## Data Preprocessing

First of all, we are going to drop the unnecessary columns: we don't need the enrollee_id and city columns.

In [None]:
# Remove the identifier and city columns from the frame in place
df.drop(columns=['enrollee_id', 'city'], inplace=True)

### Countplot for some categorical feature

We have already seen count plots for various features. Now we are going to look at count plots for the features we haven't seen yet.

In [None]:
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['enrolled_university'])

So it seems that most people who are currently working haven't enrolled in any university.

In [None]:
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['major_discipline'])

In [None]:
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['experience'])

Replace some of the cell values so that they are cleaner to read and work with.

In [None]:
# Frame-wide value clean-up: every occurrence of a key (in any column) is
# replaced by its mapped value in a single pass.
value_map = {
    # relevant-experience flags
    'Has relevent experience': 'Yes',
    'No relevent experience': 'No',
    # open-ended ranges become plain numeric strings
    '<1': '0',
    '>20': '21',
    'never': '0',
    '>4': '5',
    # company-size buckets
    '<10': 'around_10',
    '10/49': 'around_50',
    '50-99': 'around_100',
    '100-500': 'around_500',
    '500-999': 'around_1000',
    '1000-4999': 'around_5000',
    '5000-9999': 'around_10000',
    '10000+': 'more_than_10000',
    # multi-word labels get underscores instead of spaces
    'Full time course': 'Full_time_course',
    'Part time course': 'Part_time_course',
    'Primary School': 'Primary_School',
    'High School': 'High_School',
    'Business Degree': 'Business_Degree',
    'No Major': 'No_Major',
    'Pvt Ltd': 'Pvt_Ltd',
    'Funded Startup': 'Funded_Startup',
    'Public Sector': 'Public_Sector',
    'Early Stage Startup': 'Early_Stage_Startup',
}
df.replace(to_replace=value_map, inplace=True)

# 'Other' is renamed differently per column (presumably so the two columns
# stay distinguishable once they are one-hot encoded — confirm). Assign the
# result back: calling replace(..., inplace=True) on a column selection
# operates on an intermediate object and is not guaranteed to update `df`.
df['major_discipline'] = df['major_discipline'].replace('Other', 'Other_major')
df['company_type'] = df['company_type'].replace('Other', 'Other_type')

### Deal with Null values

In [None]:
# Share of missing entries per column, expressed as a percentage
row_count = df.shape[0]
percent_null = df.isnull().sum() / row_count * 100
print(percent_null)

For the columns that have 2% or fewer null values, we can simply drop the rows containing those nulls.

In [None]:
# Discard, in place, every row that is missing a value in any of these columns
df.dropna(
    axis='index',
    subset=['enrolled_university', 'education_level', 'experience', 'last_new_job'],
    inplace=True,
)

In [None]:
# Shape of the frame after the rows with those null values were dropped
df.shape

Now we are going to fill the remaining null values with each column's mode, since all the columns that are left have dtype 'object'.

In [None]:
# Categorical columns whose remaining gaps are filled with the most frequent value
col_mode = ['gender','company_size','major_discipline','company_type','relevent_experience']
for col in col_mode:
    # Assign the filled column back: fillna(..., inplace=True) on a column
    # selection works on an intermediate object and is not guaranteed to
    # update `df` itself.
    df[col] = df[col].fillna(df[col].mode()[0])

Let's change the dtype of experience and last_new_job column

In [None]:
# Cast the two columns that now hold numeric strings to integers
df = df.astype(dict(experience=int, last_new_job=int))

### Handling Categorical Values

In [None]:
# One-hot encode each categorical column into its own frame


def _one_hot(column):
    """Return the indicator columns for `column`, without a prefix and without the first level."""
    return pd.get_dummies(df[[column]], prefix=[None], drop_first=True)


education_df = _one_hot('education_level')
company_size_df = _one_hot('company_size')
company_type_df = _one_hot('company_type')
major_df = _one_hot('major_discipline')
university_df = _one_hot('enrolled_university')
experience_df = _one_hot('relevent_experience')
gender_df = _one_hot('gender')

In [None]:
# Remove the raw categorical columns now that their encoded frames exist
df.drop(
    columns=[
        'education_level',
        'company_size',
        'company_type',
        'major_discipline',
        'enrolled_university',
        'relevent_experience',
        'gender',
    ],
    inplace=True,
)

In [None]:
# Join the remaining columns and all encoded frames side by side
final_df = pd.concat(
    objs=[
        df,
        education_df,
        company_size_df,
        company_type_df,
        major_df,
        university_df,
        experience_df,
        gender_df,
    ],
    axis='columns',
)

In [None]:
# Show the first few rows of the assembled feature frame
final_df.head()

In [None]:
# Persist the processed frame to a CSV file in the working directory
final_df.to_csv(path_or_buf='final_df.csv')

# Model

In [None]:
# Features are every column except the label; the label is the `target` column
X = final_df.drop(columns=['target'])
Y = final_df.loc[:, 'target']

Let's see whether our dataset is balanced or imbalanced

In [None]:
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=df['target'])


We can see that our dataset is imbalanced. We are going to use the SMOTE technique to deal with the imbalance.

In [None]:
# Oversample the data with SMOTE.
# NOTE(review): the resampling happens before the train/validation split, so
# synthetic samples derived from rows that end up in the validation set can
# leak into training; consider resampling only the training split.
smote = SMOTE(random_state = 402)
X_smote, Y_smote = smote.fit_resample(X,Y)


# Class distribution after resampling.
# Explicit `x=`: newer seaborn releases accept only `data` positionally.
sns.countplot(x=Y_smote)

In [None]:
# Hold out 20% of the resampled data for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_smote,
    Y_smote,
    random_state=42,
    test_size=0.2,
)

We need to normalize our dataset so that the model does not become biased towards any particular feature

In [None]:
# Standardise the features: learn the scaling statistics on the training
# split only, then apply the same transformation to the validation split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# transform(), not fit(): fit() would re-learn the statistics from the
# validation data and return the scaler object itself instead of scaled data.
X_val = sc.transform(X_val)

## Hyperparameter Tuning and RandomCV

In [None]:
# Estimator whose hyperparameters are searched
clf = XGBClassifier()

# Candidate values per XGBoost hyperparameter
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

# Randomised search: 5 sampled configurations, 5-fold CV, scored by ROC AUC
random_cv = RandomizedSearchCV(
    clf,
    params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=1,
    cv=5,
    verbose=3,
    random_state=121,
    return_train_score=True,
)
random_cv.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the randomized search
random_cv.best_params_

## Train

In [None]:
# Final model with fixed hyperparameters.
# `min_child_weight` was previously misspelled as `min_child_weigh`, so the
# intended setting was never applied under its real name.
clf = XGBClassifier(
    colsample_bytree=0.8,
    gamma=1.5,
    max_depth=5,
    min_child_weight=1,
    subsample=0.6,
)

clf.fit(X_train, y_train)

In [None]:
# score
# NOTE(review): this scores the model on the same data it was fitted on, so
# the figure is optimistic; the held-out X_val / y_val split is not used here.
clf.score(X_train,y_train)

# Heroku App

So if you work in HR and want to predict whether a person is going to stay in their job or is looking for a new one, just click on this link: https://looking-for-job-change.herokuapp.com/

### If you like this notebook don't forget to upvote it