In [None]:
# Reading The Data

In [None]:
from copy import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn import metrics
%matplotlib inline

In [None]:
# Load the training CSV (path is relative to the notebook's working directory) and preview it.
data = pd.read_csv('../input/hr-dataset/train.csv')
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of rows that exactly duplicate an earlier row.
data.duplicated().sum()

In [None]:
# Visual map of missing cells: one heatmap row per record, one column per feature.
plt.figure(figsize=(10, 13))
null_mask = data.isnull()
sns.heatmap(null_mask, cbar=False)
plt.yticks([])  # row labels are unreadable at this density
plt.show()

In [None]:
# Missing-value count per column.
data.isnull().sum()

In [None]:
# Missing values per column as a percentage of all rows, rounded to 2 decimals.
null_share = data.isnull().sum() / len(data)
(null_share * 100).round(2)

In [None]:
# dropna(thresh=10) keeps only rows with at least 10 non-null values, so a row is
# dropped once it has more than (n_columns - 10) NULLs. Derive that count from the
# frame instead of hard-coding it in the message, and run dropna only once.
min_non_null = 10
null_cutoff = data.shape[1] - min_non_null + 1  # fewest NULLs that gets a row dropped
rows_dropped = data.shape[0] - data.dropna(thresh=min_non_null).shape[0]
print(f'Number of Rows Containing {null_cutoff} NULLs or more:', rows_dropped)
print('Percentage:', round((rows_dropped / data.shape[0]) * 100, 2), '%')

In [None]:
# Drop, in place, every row that has fewer than 10 non-null values.
data.dropna(thresh=10, inplace=True)

In [None]:
# Normalise the mistyped '10/49' bucket. The result is assigned back to the column:
# replace(..., inplace=True) on the `data.company_size` accessor is chained assignment,
# which pandas warns about and which no longer updates `data` under Copy-on-Write.
data['company_size'] = data['company_size'].replace('10/49', '10-49')

In [None]:
# Rename the '100-500' bucket to '100-499'. The result is assigned back to the column:
# replace(..., inplace=True) on the `data.company_size` accessor is chained assignment,
# which pandas warns about and which no longer updates `data` under Copy-on-Write.
data['company_size'] = data['company_size'].replace('100-500', '100-499')

In [None]:
# (rows, columns) after the sparse rows were dropped.
data.shape

# Predicting NULL Values

- ## Columns whose null values will be predicted
    - enrolled_university
    - education_level
    - major_discipline
    - experience
    - company_size
    - company_type
    - last_new_job
    - gender

In [None]:
# Remaining missing-value count per column.
data.isnull().sum()

- ## Getting the training data ready

In [None]:
# Keep only rows with at least 14 non-null values.
# NOTE(review): presumably 14 is the total column count, i.e. fully complete rows — confirm.
training_data = data.dropna(thresh=14)
training_data.shape

In [None]:
# The id column is dropped before encoding and modelling.
training_data = training_data.drop(columns='enrollee_id')

In [None]:
# Independent copy that will be label-encoded and normalised; training_data keeps the raw labels.
training_data_2 = training_data.copy()

In [None]:
# Preview the rows kept for training the imputation models.
training_data.head()

In [None]:
# Columns that are replaced by integer codes.
encoding_columns = [
    'city', 'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline', 'experience',
    'company_size', 'company_type', 'last_new_job', 'target',
]

# One fitted LabelEncoder per column, kept so the class labels can be looked up later.
encoder = {column: LabelEncoder() for column in encoding_columns}
for column, label_encoder in encoder.items():
    training_data_2[column] = label_encoder.fit_transform(training_data_2[column])

In [None]:
# Preview the label-encoded frame.
training_data_2.head()

In [None]:
# Annotated correlation matrix of the label-encoded training frame.
plt.figure(figsize=(18, 9.5))
correlations = training_data_2.corr()
sns.heatmap(correlations, annot=True, cbar=True, cmap="Blues");

- ## Normalizing Training data

In [None]:
def normalize_columns(column):
    """Return `training_data_2[column]` min-max scaled to the range [0, 1]."""
    series = training_data_2[column]
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)

In [None]:
# Min-max scale every column of the encoded frame, overwriting it column by column.
for column in training_data.columns:
    training_data_2[column] = normalize_columns(column)

In [None]:
# Preview the normalised frame.
training_data_2.head()

- ## `1.`Enrolled University Column

In [None]:
# Feature columns used to predict enrolled_university.
columns = [
    'city_development_index',
    'relevent_experience',
    'education_level',
    'major_discipline',
    'experience',
    'company_type',
    'last_new_job',
    'training_hours',
    'target',
]

In [None]:
# Features come from the encoded + normalised frame; the label keeps its original
# values from training_data.
X = training_data_2.loc[:,columns].values
Y = training_data.enrolled_university

In [None]:
# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=30)

In [None]:
# Hyper-parameter grid for the random-forest search.
# 'auto' was dropped from max_features: for classifiers it was an alias of 'sqrt'
# (so it only duplicated work) and newer scikit-learn releases reject it.
rf_grid = {
 'max_depth': [10, 20, 30, None],
 'max_features': ['sqrt', 'log2'],
 'n_estimators': [200, 400, 600]}

In [None]:
# Grid-search a random forest for the enrolled_university imputation model.
# RandomForestClassifier is already imported in the notebook's import cell.
# The forest is seeded (same seed as md_classifier) so the search is reproducible.
eu_classifier = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=rf_grid, cv=3, n_jobs=-1, verbose=1)
eu_classifier.fit(x_train, y_train)

# Summarize results
print("Best: %f using %s" % (eu_classifier.best_score_, eu_classifier.best_params_))

In [None]:
# Predict the held-out split with the tuned enrolled_university model.
eu_predictions = eu_classifier.predict(x_test) 
   
# print classification report 
print(metrics.classification_report(y_test, eu_predictions))

In [None]:
# Accuracy of the tuned enrolled_university model on both splits.
accuracy = eu_classifier.score(x_train, y_train)
print('Training Accuracy:', round(accuracy *100, 2), '%')

accuracy = eu_classifier.score(x_test, y_test)
print('Testing Accuracy:', round(accuracy *100, 2), '%')

# Confusion matrix on the held-out split. metrics.plot_confusion_matrix no longer
# exists in newer scikit-learn releases, so build the display from
# confusion_matrix + ConfusionMatrixDisplay instead.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Confusion Matrix')
cm = metrics.confusion_matrix(y_test, eu_classifier.predict(x_test))
chart = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder['enrolled_university'].classes_)
chart.plot(ax=ax, cmap="Blues", values_format='.0f')
chart.confusion_matrix;

- ## `2` Major Discipline

In [None]:
# Feature columns used to predict major_discipline.
columns = [
    'city',
    'city_development_index',
    'enrolled_university',
    'relevent_experience',
    'last_new_job',
    'target',
]

In [None]:
# Features come from the encoded + normalised frame; the label keeps its original
# values from training_data.
X = training_data_2.loc[:,columns].values
Y = training_data.major_discipline

In [None]:
# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=30)

In [None]:
# Random Forest used to impute major_discipline.
md_classifier = RandomForestClassifier(n_estimators = 500, criterion = 'entropy', random_state = 42)
md_classifier.fit(x_train, y_train)

accuracy = md_classifier.score(x_train, y_train)
print('Training Accuracy:', round(accuracy *100, 2), '%')

accuracy = md_classifier.score(x_test, y_test)
print('Testing Accuracy:', round(accuracy *100, 2), '%')

# Confusion matrix on the held-out split. metrics.plot_confusion_matrix no longer
# exists in newer scikit-learn releases, so build the display from
# confusion_matrix + ConfusionMatrixDisplay instead.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Confusion Matrix')
cm = metrics.confusion_matrix(y_test, md_classifier.predict(x_test))
chart = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder['major_discipline'].classes_)
chart.plot(ax=ax, cmap="Blues", values_format='.0f')
chart.confusion_matrix;

  - ## Filling Enrolled University Column

In [None]:
# Working copy of the cleaned data that will receive the predicted values.
df_3 = data.copy()

In [None]:
# Mark every missing cell with the string placeholder '0'.
df_3.fillna('0', inplace=True)

In [None]:
# Rows whose enrolled_university was missing (now the '0' placeholder).
# Note: this rebinds x_test, which previously held the train/test split.
x_test = df_3[df_3['enrolled_university'] == '0']
x_test.shape

In [None]:
# enrollee_id followed by the same feature columns, in the same order, that
# eu_classifier was trained on.
columns = ['enrollee_id', 'city_development_index', 'relevent_experience', 'education_level', 'major_discipline', 'experience',
                    'company_type', 'last_new_job', 'training_hours', 'target']

x_test = x_test.loc[:, columns]
x_test.head()

In [None]:
# (rows to impute, id + feature columns).
x_test.shape

In [None]:
encoding_columns_eu = [ 'relevent_experience', 'education_level', 'major_discipline', 'experience', 'company_type', 'last_new_job', 'target']

# Work on an explicit copy so the column assignments below do not write into a slice of df_3.
x_test = x_test.copy()

# Keep these encoders in their own dict (`eu_encoder`, previously created but never
# used) instead of overwriting the training-time encoders stored in `encoder`.
# NOTE(review): the encoders are fitted on this subset (which includes the '0'
# placeholder), so the integer codes need not match the codes eu_classifier was
# trained on — confirm whether that is acceptable.
eu_encoder = {}
for column in encoding_columns_eu:
    eu_encoder[column] = LabelEncoder()
    x_test[column] = eu_encoder[column].fit_transform(x_test[column])

In [None]:
# Preview the encoded rows.
x_test.head()

In [None]:
def normalize_columns(column):
    """Return `x_test[column]` min-max scaled to [0, 1] using x_test's own range."""
    series = x_test[column]
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)

In [None]:
# Scale every column except the first one (enrollee_id).
for column in x_test.columns[1:]:
    x_test[column] = normalize_columns(column)

In [None]:
# Preview the normalised rows.
x_test.head()

In [None]:
# Ids of the rows being imputed (the Series keeps x_test's index).
enrolled_university = x_test.enrollee_id.copy()

In [None]:
# Pair each id with the class predicted from the feature columns (everything after enrollee_id).
enrolled_university = pd.DataFrame({'enrollee_id':enrolled_university, 'enrolled_university':eu_classifier.predict(x_test.iloc[:,1:])})

In [None]:
# Preview the predicted values.
enrolled_university.head()

In [None]:
# Write the predictions back into df_3 in a single index-aligned assignment.
# `enrolled_university` carries the index of the rows it was predicted for, so this
# touches exactly those rows. The previous per-id loop ran one query() per row and
# would also overwrite any other row that happened to share an enrollee_id.
df_3.loc[enrolled_university.index, 'enrolled_university'] = enrolled_university['enrolled_university']

In [None]:
# Distinct values now present in the column (shows whether any '0' placeholder remains).
df_3.enrolled_university.unique()

- ## Filling Major Discipline Column

In [None]:
# Rows whose major_discipline was missing (the '0' placeholder).
x_test = df_3[df_3['major_discipline'] == '0']
x_test.shape

In [None]:
# enrollee_id followed by the same feature columns, in the same order, that
# md_classifier was trained on.
columns = ['enrollee_id', 'city', 'city_development_index', 'enrolled_university', 'relevent_experience', 'last_new_job', 'target']

x_test = x_test.loc[:, columns]
x_test.head()

In [None]:
# (rows to impute, id + feature columns).
x_test.shape

In [None]:
encoding_columns_md = [ 'city', 'enrolled_university', 'relevent_experience', 'last_new_job', 'target']

# Work on an explicit copy so the column assignments below do not write into a slice of df_3.
x_test = x_test.copy()

# Keep these encoders in their own dict (`md_encoder`, previously created but never
# used) instead of overwriting entries of the shared `encoder` dict.
# NOTE(review): the encoders are fitted on this subset only, so the integer codes
# need not match the codes md_classifier was trained on — confirm whether that is acceptable.
md_encoder = {}
for column in encoding_columns_md:
    md_encoder[column] = LabelEncoder()
    x_test[column] = md_encoder[column].fit_transform(x_test[column])

In [None]:
# Preview the encoded rows.
x_test.head()

In [None]:
def normalize_columns(column):
    """Return `x_test[column]` min-max scaled to [0, 1] using x_test's own range."""
    series = x_test[column]
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)

In [None]:
# Scale every column except the first one (enrollee_id).
for column in x_test.columns[1:]:
    x_test[column] = normalize_columns(column)

In [None]:
# Preview the normalised rows.
x_test.head()

In [None]:
# Ids of the rows being imputed (the Series keeps x_test's index).
major_discipline = x_test.enrollee_id.copy()

In [None]:
# Pair each id with the class predicted from the feature columns (everything after enrollee_id).
major_discipline = pd.DataFrame({'enrollee_id':major_discipline, 'major_discipline':md_classifier.predict(x_test.iloc[:,1:])})

In [None]:
# Preview the predicted values.
major_discipline.head()

In [None]:
# Write the predictions back into df_3 in a single index-aligned assignment.
# `major_discipline` carries the index of the rows it was predicted for, so this
# touches exactly those rows. The previous per-id loop ran one query() per row and
# would also overwrite any other row that happened to share an enrollee_id.
df_3.loc[major_discipline.index, 'major_discipline'] = major_discipline['major_discipline']

In [None]:
# Distinct values now present in the column (shows whether any '0' placeholder remains).
df_3.major_discipline.unique()

In [None]:
# Separate copy for the exploratory plots; df_3 itself is modelled later.
df = df_3.copy()

In [None]:
# Preview the frame used for plotting.
df.head()

In [None]:
# Show the remaining '0' placeholders as 'Unknown' in the plots (applies to every column of df).
df.replace('0', 'Unknown', inplace=True)

# Exploratory Data Analysis

## Questions to be asked about the data:
Q1: All columns vs Target<br>
Q2: Distribution of the data in all columns<br>
Q3: What are the three most important things we need to focus on to improve employee
engagement?<br>
Q4: What are the three most important things we need to focus on to decrease the number of trainees looking for a new job?<br>
Q5: Which trainee features are most strongly associated with not looking for another job?<br>
Q6: What effect do training hours have on whether a trainee looks for another job?<br>

In [None]:
# Seaborn palettes used by the charts below; the last expression displays the pastel palette.
color = sns.color_palette('muted')
color_2 = sns.color_palette("Paired")
color_3 = sns.color_palette('pastel')
color_3

In [None]:
# Null Values
# Bar chart of hard-coded per-column null counts, saved to null.png.
# NOTE(review): the counts are literals, not computed from `data` — confirm they are current.
null_values = [5039, 4779, 4089, 2838, 457, 342, 367, 59]
columns_null_values = ['company_type', 'company_size', 'gender', 'major_discipline', 'enrolled_university', 'education_level', 'last_new_job', 'experience']
tick_labels = ('Company Type', 'Company Size', 'Gender', 'Major Discipline', 'Enrolled University', 'Education Level', 'Last_New Job', 'Experience')

plt.figure(figsize=(13, 8))
ax = plt.gca()

ax.set_title('Null Values', fontweight="bold")
ax.set_xlabel('Column', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold")
sns.barplot(x=columns_null_values, y=null_values, color=color[0])
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.xticks((0,1,2,3,4,5,6, 7), tick_labels, rotation=15)

# Print each count just above its bar.
for position, count in enumerate(null_values):
    plt.text(position, count + 50, count, ha='center')

plt.savefig('null.png')

- ## Univariate Charts

In [None]:
# Histogram of the city development index.
plt.figure(figsize=(13, 8))
ax = sns.histplot(x='city_development_index', data=df, color=color[9])

ax.set_title('City Development Index Distribution', fontweight="bold")
ax.set_xlabel('City Development Index', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# The ten most frequent cities (the previous slice [:11] returned eleven rows).
city_10 = df['city'].value_counts().head(10)

# Horizontal bar chart with the count printed next to each bar instead of an x axis.
plt.figure(figsize=(13,8))
sns.barplot(x=city_10, y=city_10.index, color=color[9]);
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.title('Top 10 Cities', fontweight="bold")
plt.xlabel('Count', fontweight="bold")
plt.ylabel('City', fontweight="bold")
plt.xticks([])
# Iterate over the values positionally: city_10 is indexed by city name, so
# city_10[i] with an integer is label-ambiguous and deprecated in pandas.
for i, count in enumerate(city_10.values):
    plt.text(count + 50, i, count)

In [None]:
# Count of trainees per relevant-experience value.
plt.figure(figsize=(8, 9))
ax = sns.countplot(x='relevent_experience', data=df, color=color[9])
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Relevant Experience', fontweight="bold")
ax.set_xlabel('Relevant Experience', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# Count of trainees per gender, in a fixed category order.
order = ['Male', 'Female', 'Other', 'Unknown']
plt.figure(figsize=(8, 9))
ax = sns.countplot(x='gender', data=df, order=order, color=color[9])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Gender', fontweight="bold")
ax.set_xlabel('Gender', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# Count of trainees per university-enrolment value.
plt.figure(figsize=(8, 9))
ax = sns.countplot(x='enrolled_university', data=df, color=color[9])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Enrolled University', fontweight="bold")
ax.set_xlabel('Enrolled University', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# Count of trainees per education level, ordered from lowest to highest.
plt.figure(figsize=(8, 9))
order = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd', 'Unknown']

ax = sns.countplot(x='education_level', data=df, order=order, color=color[1])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Education Level', fontweight="bold")
ax.set_xlabel('Education Level', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# Count of trainees per major discipline, most frequent first.
plt.figure(figsize=(8, 9))
discipline_order = df.major_discipline.value_counts().index
ax = sns.countplot(x='major_discipline', data=df, order=discipline_order, color=color[1])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Major Discipline', fontweight="bold")
ax.set_xlabel('Major Discipline', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# Experience buckets in display order: '<1', '1'..'20', '>20', 'Unknown'.
order = ['<1'] + [str(years) for years in range(1, 21)] + ['>20', 'Unknown']

plt.figure(figsize=(15, 8))
ax = sns.countplot(x='experience', data=df, order=order, color=color[1])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Experience Per Count of trainees', fontweight="bold")
ax.set_xlabel('Experience', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# Count of trainees per company-size bucket, smallest to largest.
plt.figure(figsize=(8, 9))
order = ['<10', '10-49', '50-99', '100-499', '500-999', '1000-4999', '5000-9999', '10000+', 'Unknown']
ax = sns.countplot(x='company_size', data=df, order=order, color=color[1])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Company Size', fontweight="bold")
ax.set_xlabel('Company Size', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold")
plt.xticks(rotation=20);

In [None]:
# Count of trainees per company type, most frequent first.
plt.figure(figsize=(8, 9))
type_order = df.company_type.value_counts().index
ax = sns.countplot(x='company_type', data=df, order=type_order, color=color[1])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Company Type', fontweight="bold")
ax.set_xlabel('Company Type', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold")
plt.xticks(rotation=15);

In [None]:
# Count of trainees per last_new_job value; 'never' is shown as '0' on the axis.
plt.figure(figsize=(8, 9))
order = ['never', '1', '2', '3', '4', '>4', 'Unknown']
labels = ('0','1','2','3','4','>4', 'Unknown')
ax = sns.countplot(x='last_new_job', data=df, order=order, color=color[9])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Last New Job', fontweight="bold")
ax.set_xlabel('Last New Job', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold")
plt.xticks((0,1,2,3,4,5,6), labels);

In [None]:
# 30-bin histogram of training hours.
plt.figure(figsize=(8, 9))
ax = sns.histplot(x='training_hours', data=df, bins=30, color=color[9])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Training Hours', fontweight="bold")
ax.set_xlabel('Training Hours (hr)', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold");

In [None]:
# Kernel density estimate of training hours.
plt.figure(figsize=(8,9))
sns.kdeplot(data=df, x='training_hours', color=color[9]);

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Training Hours', fontweight="bold")
plt.xlabel('Training Hours (hr)', fontweight="bold")
# A KDE plots density on the y axis, not a count.
plt.ylabel('Density', fontweight="bold");

In [None]:
# Class balance of the target.
plt.figure(figsize=(8, 9))
ax = sns.countplot(x='target', data=df, color=color[9])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Target', fontweight="bold")
ax.set_xlabel('Target', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold")
plt.xticks((0,1), labels=('Not Looking for A Job', 'Looking for A Job'));

- ## Bivariate Charts

In [None]:
# Relevant experience split by target; saved to 13.png.
plt.figure(figsize=(8,9))
sns.countplot(data=df, x='relevent_experience', hue='target');

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Relevant Experience vs. Target', fontweight="bold")
plt.xlabel('Relevant Experience', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));
plt.savefig('13.png')

In [None]:
# Gender split by target.
plt.figure(figsize=(8,9))
sns.countplot(data=df, x='gender', hue='target', order=['Male', 'Female', 'Other', 'Unknown']);
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Gender vs. Target', fontweight="bold")
plt.xlabel('Gender', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# University enrolment split by target.
plt.figure(figsize=(8,9))
sns.countplot(data=df, x='enrolled_university', hue='target');
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Enrolled University vs. Target', fontweight="bold")
plt.xlabel('Enrolled University', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Education level (lowest to highest) split by target.
plt.figure(figsize=(8,9))
order = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd', 'Unknown']

sns.countplot(data=df, x='education_level', hue='target', order=order);

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Education Level vs. Target', fontweight="bold")
plt.xlabel('Education Level', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Major discipline (most frequent first) split by target.
plt.figure(figsize=(8,9))
sns.countplot(data=df, x='major_discipline', hue='target', order=df.major_discipline.value_counts().index);

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Major Discipline vs. Target', fontweight="bold")
plt.xlabel('Major Discipline', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Company type (most frequent first) split by target.
plt.figure(figsize=(8,9))
sns.countplot(data=df, x='company_type', hue='target', order=df.company_type.value_counts().index);

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Company Type vs. Target', fontweight="bold")
plt.xlabel('Company Type', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
plt.xticks(rotation=15);
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Experience buckets in display order: '<1', '1'..'20', '>20', 'Unknown'.
order = ['<1'] + [str(years) for years in range(1, 21)] + ['>20', 'Unknown']

# Experience split by target; saved to 15.png.
plt.figure(figsize=(16, 7))
ax = sns.countplot(x='experience', hue='target', data=df, order=order)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Trainee Experience vs. Target', fontweight="bold")
ax.set_xlabel('Trainee Experience', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold")
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'))
plt.savefig('15.png')

In [None]:
# Company size (smallest to largest) split by target.
plt.figure(figsize=(9,7))
order = ['<10', '10-49', '50-99', '100-499', '500-999', '1000-4999', '5000-9999', '10000+', 'Unknown']

sns.countplot(data=df, x='company_size', hue='target', order=order);
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Company Size vs. Target', fontweight="bold")
plt.xlabel('Company Size', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
plt.xticks(rotation=15);
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# last_new_job split by target.
plt.figure(figsize=(8,9))
order = ['never', '1', '2', '3', '4', '>4', 'Unknown']
sns.countplot(data=df, x='last_new_job', hue='target', order=order);

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.title('Last New Job vs. Target', fontweight="bold")
plt.xlabel('Last New Job', fontweight="bold")
plt.ylabel('Count', fontweight="bold");
# Pass only labels: the former positional (0, 1) is not a list of handles, and
# Matplotlib warns that mixed positional/keyword legend arguments may be discarded.
plt.legend(labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Violin plot (with quartile lines) of city development index per target class.
plt.figure(figsize=(8, 9))
ax = sns.violinplot(x='target', y='city_development_index', data=df, inner='quartile')

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('City Development Index vs. Target', fontweight="bold")
ax.set_xlabel('Target', fontweight="bold")
ax.set_ylabel('City Development Index', fontweight="bold")
plt.xticks((0,1), labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Box plot of city development index per target class.
plt.figure(figsize=(8, 9))
ax = sns.boxplot(x='target', y='city_development_index', data=df)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('City Development Index vs. Target', fontweight="bold")
ax.set_xlabel('Target', fontweight="bold")
ax.set_ylabel('City Development Index', fontweight="bold")
plt.xticks((0,1), labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Violin plot of training hours per target class.
plt.figure(figsize=(8, 9))
ax = sns.violinplot(x='target', y='training_hours', data=df)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Training Hours vs. Target', fontweight="bold")
ax.set_xlabel('Target', fontweight="bold")
ax.set_ylabel('Training Hours (hr)', fontweight="bold")
plt.xticks((0,1), labels=('Not Looking for A Job', 'Looking for A Job'));

In [None]:
# Swarm plot of training hours per education level; saved to swarm.png.
plt.figure(figsize=(7, 8))
order = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd', 'Unknown']
ax = sns.swarmplot(x='education_level', y='training_hours', data=df, order=order, palette=color)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Trainee Education Level vs. Training Hours', fontweight="bold")
ax.set_xlabel('Education Level', fontweight="bold")
ax.set_ylabel('Training Hours', fontweight="bold")
plt.xticks(rotation=15)
plt.savefig('swarm.png')

In [None]:
# Education level split by relevant experience.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='education_level', hue='relevent_experience', data=df)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_title('Trainee Education Level vs. Relevant Experience', fontweight="bold")
ax.set_xlabel('Education Level', fontweight="bold")
ax.set_ylabel('Count', fontweight="bold")
plt.xticks(rotation=15);

# Creating The Model Using SMOTE (Replacing NULL with 0)

- ## Encoding The Data

In [None]:
# Fresh copy of the cleaned data for the "NULL replaced with 0" experiment.
df_2 = data.copy()

In [None]:
# Mark every missing cell with the string placeholder '0'.
df_2.fillna('0', inplace=True)

In [None]:
# Drop the id column in place, then preview.
df_2.drop('enrollee_id', axis=1, inplace=True);
df_2.head()

In [None]:
# Columns that are replaced by integer codes.
encoding_columns = [
    'city', 'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline', 'experience',
    'company_size', 'company_type', 'last_new_job', 'target',
]

# Rebuild `encoder` with one LabelEncoder per column, fitted on df_2.
encoder = {column: LabelEncoder() for column in encoding_columns}
for column, label_encoder in encoder.items():
    df_2[column] = label_encoder.fit_transform(df_2[column])

In [None]:
# Annotated correlation matrix of the label-encoded df_2.
plt.figure(figsize=(18, 9.5))
correlations = df_2.corr()
sns.heatmap(correlations, annot=True, cbar=True, cmap="Blues");

- ## Normalizing Data 

In [None]:
def normalize_columns(column):
    """Return `df_2[column]` min-max scaled to the range [0, 1]."""
    series = df_2[column]
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)

In [None]:
# Min-max scale every column of df_2, overwriting it column by column.
for column in df_2.columns:
    df_2[column] = normalize_columns(column)

In [None]:
# Preview the normalised frame.
df_2.head()

- ## XGBoost Classifier Using SMOTE Replacing NULL with *0*

In [None]:
# Model features (gender and company_type are not included).
columns = [
    'city',
    'city_development_index',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'last_new_job',
    'training_hours',
]

In [None]:
# Feature matrix and target vector as NumPy arrays.
X = df_2.loc[:,columns].values
Y = df_2.target.values

In [None]:
# (samples, features).
X.shape

In [None]:
# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [None]:
# Hyper-parameter grid for the XGBoost search.
xgb_grid = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    max_depth=[3, 4, 5],
)

In [None]:
# 3-fold grid search over xgb_grid, fitted on the SMOTE-resampled training split.
# NOTE(review): the folds are cut from already-resampled data, so validation folds
# contain synthetic samples and best_score_ may be optimistic — confirm.
xgb_classifier =  GridSearchCV(estimator= XGBClassifier(), param_grid=xgb_grid, cv = 3, n_jobs=-1,verbose=1)
xgb_classifier.fit(x_train_res, y_train_res) 

# Summarize results
print("Best: %f using %s" % (xgb_classifier.best_score_, xgb_classifier.best_params_))

In [None]:
# Best parameter combination found by the search.
xgb_classifier.best_params_

In [None]:
# Score on the original (non-resampled) training split. The label previously read
# 'Testing Accuracy' although x_train / y_train are scored here.
accuracy = xgb_classifier.score(x_train, y_train)
print('Training Accuracy:', round(accuracy *100, 2), '%')

In [None]:
# Score on the held-out test split.
accuracy = xgb_classifier.score(x_test, y_test)
print('Testing Accuracy:', round(accuracy *100, 2), '%')

In [None]:
# Predict the held-out split with the tuned model.
xgb_predictions = xgb_classifier.predict(x_test) 
   
# print classification report 
print(metrics.classification_report(y_test, xgb_predictions))

In [None]:
# Confusion matrix on the held-out split, saved to cm_1.png.
# metrics.plot_confusion_matrix no longer exists in newer scikit-learn releases, so
# build the display from confusion_matrix + ConfusionMatrixDisplay instead.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Confusion Matrix')
cm = metrics.confusion_matrix(y_test, xgb_classifier.predict(x_test))
chart = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder['target'].classes_)
chart.plot(ax=ax, cmap="Blues", values_format='.0f')
chart.confusion_matrix;
plt.savefig('cm_1.png')

In [None]:
# ROC curve on the held-out split, saved to auc_1.png.
# metrics.plot_roc_curve no longer exists in newer scikit-learn releases, so compute
# the curve from the positive-class probabilities and draw it with RocCurveDisplay.
y_score = xgb_classifier.predict_proba(x_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=metrics.auc(fpr, tpr),
                        estimator_name=type(xgb_classifier).__name__).plot()
plt.savefig('auc_1.png')

# Creating The Model Using SMOTE (with Predicted NULL Values and Unknown)


- ## Encoding The Data

In [None]:
# Drop the id column in place, then preview.
df_3.drop('enrollee_id', axis=1, inplace=True);
df_3.head()

In [None]:
# (rows, columns) without the id column.
df_3.shape

In [None]:
# Columns that are replaced by integer codes.
encoding_columns = [
    'city', 'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline', 'experience',
    'company_size', 'company_type', 'last_new_job', 'target',
]

# Rebuild `encoder` with one LabelEncoder per column, fitted on df_3.
encoder = {column: LabelEncoder() for column in encoding_columns}
for column, label_encoder in encoder.items():
    df_3[column] = label_encoder.fit_transform(df_3[column])

In [None]:
# Annotated correlation matrix of the label-encoded df_3.
plt.figure(figsize=(18, 9.5))
correlations = df_3.corr()
sns.heatmap(correlations, annot=True, cbar=True, cmap="Blues");

- ## Normalizing The Data

In [None]:
def normalize_columns(column):
    """Return `df_3[column]` min-max scaled to the range [0, 1]."""
    series = df_3[column]
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)

In [None]:
# Min-max scale every column of df_3, overwriting it column by column.
for column in df_3.columns:
    df_3[column] = normalize_columns(column)

In [None]:
# Preview the normalised frame.
df_3.head()

- ## XGBoost Classifier Using SMOTE with Predicted NULL Values

In [None]:
# Model features (gender and company_type are not included).
columns = [
    'city',
    'city_development_index',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'last_new_job',
    'training_hours',
]

In [None]:
# Feature matrix and target vector as NumPy arrays.
X = df_3.loc[:,columns].values
Y = df_3.target.values

In [None]:
# (samples, features).
X.shape

In [None]:
# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [None]:
# Hyper-parameter grid for the XGBoost search.
xgb_grid = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    max_depth=[3, 4, 5],
)

In [None]:
# 3-fold grid search over xgb_grid, fitted on the SMOTE-resampled training split.
# NOTE(review): the folds are cut from already-resampled data, so validation folds
# contain synthetic samples and best_score_ may be optimistic — confirm.
xgb_2_classifier =  GridSearchCV(estimator= XGBClassifier(), param_grid=xgb_grid, cv = 3, n_jobs=-1,verbose=1)
xgb_2_classifier.fit(x_train_res, y_train_res) 

# Summarize results
print("Best: %f using %s" % (xgb_2_classifier.best_score_, xgb_2_classifier.best_params_))

In [None]:
# Best parameter combination found by the search.
xgb_2_classifier.best_params_

In [None]:
# Score on the original (non-resampled) training split.
accuracy = xgb_2_classifier.score(x_train, y_train)
print('Training Accuracy:', round(accuracy *100, 2), '%')

In [None]:
# Score on the held-out test split.
accuracy = xgb_2_classifier.score(x_test, y_test)
print('Testing Accuracy:', round(accuracy *100, 2), '%')

In [None]:
# Predict the held-out split with the tuned model.
xgb_2_predictions = xgb_2_classifier.predict(x_test) 
   
# print classification report 
print(metrics.classification_report(y_test, xgb_2_predictions))

In [None]:
# Confusion matrix on the held-out split, saved to cm_2.png.
# metrics.plot_confusion_matrix no longer exists in newer scikit-learn releases, so
# build the display from confusion_matrix + ConfusionMatrixDisplay instead.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Confusion Matrix')
cm = metrics.confusion_matrix(y_test, xgb_2_classifier.predict(x_test))
chart = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder['target'].classes_)
chart.plot(ax=ax, cmap="Blues", values_format='.0f')
chart.confusion_matrix;
plt.savefig('cm_2.png')

In [None]:
# ROC curve on the held-out split, saved to auc_2.png.
# metrics.plot_roc_curve no longer exists in newer scikit-learn releases, so compute
# the curve from the positive-class probabilities and draw it with RocCurveDisplay.
y_score = xgb_2_classifier.predict_proba(x_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=metrics.auc(fpr, tpr),
                        estimator_name=type(xgb_2_classifier).__name__).plot()
plt.savefig('auc_2.png')

# Creating The Model Using SMOTE(With Predicted NULL Values and Mode)

- ## Scaling The Data

In [None]:
# Fresh copy of the cleaned data for the mode-imputation experiment.
df_4 = data.copy()
df_4.head()

In [None]:
null_columns = ['gender','enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job' ]

# Replace missing values with each column's most frequent value. The result is
# assigned back to the column: df_4[i].fillna(..., inplace=True) is chained
# assignment, which pandas warns about and which no longer updates df_4 under Copy-on-Write.
for i in null_columns:
  df_4[i] = df_4[i].fillna(df_4[i].mode()[0])

In [None]:
# Drop the id column in place, then preview.
df_4.drop('enrollee_id', axis=1, inplace=True);
df_4.head()

In [None]:
# Columns that are replaced by integer codes.
encoding_columns = [
    'city', 'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline', 'experience',
    'company_size', 'company_type', 'last_new_job', 'target',
]

# Rebuild `encoder` with one LabelEncoder per column, fitted on df_4.
encoder = {column: LabelEncoder() for column in encoding_columns}
for column, label_encoder in encoder.items():
    df_4[column] = label_encoder.fit_transform(df_4[column])

In [None]:
# Annotated correlation matrix of the label-encoded df_4.
plt.figure(figsize=(18, 9.5))
correlations = df_4.corr()
sns.heatmap(correlations, annot=True, cbar=True, cmap="Blues");

- ## Normalizing The Data

In [None]:
def normalize_columns(column):
    """Return `df_4[column]` min-max scaled to the range [0, 1]."""
    series = df_4[column]
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)

In [None]:
# Min-max scale every column of df_4, overwriting it column by column.
for column in df_4.columns:
    df_4[column] = normalize_columns(column)

In [None]:
# Preview the normalised frame.
df_4.head()

- ## XGBoost Classifier Using SMOTE(Replacing NULL Values with Mode)

In [None]:
# Model features (gender and company_type are not included).
columns = [
    'city',
    'city_development_index',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'last_new_job',
    'training_hours',
]

In [None]:
# Feature matrix and target vector as NumPy arrays.
X = df_4.loc[:,columns].values
Y = df_4.target.values

In [None]:
# (samples, features).
X.shape

In [None]:
# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=2)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [None]:
# Hyper-parameter grid for the XGBoost search.
xgb_grid = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    max_depth=[3, 4, 5],
)

In [None]:
# 3-fold grid search over xgb_grid, fitted on the SMOTE-resampled training split.
# NOTE(review): the folds are cut from already-resampled data, so validation folds
# contain synthetic samples and best_score_ may be optimistic — confirm.
xgb_3_classifier =  GridSearchCV(estimator= XGBClassifier(), param_grid=xgb_grid, cv = 3, n_jobs=-1,verbose=1)
xgb_3_classifier.fit(x_train_res, y_train_res) 

# Summarize results
print("Best: %f using %s" % (xgb_3_classifier.best_score_, xgb_3_classifier.best_params_))

In [None]:
# Best parameter combination found by the search.
xgb_3_classifier.best_params_

In [None]:
# Score on the original (non-resampled) training split.
accuracy = xgb_3_classifier.score(x_train, y_train)
print('Training Accuracy:', round(accuracy *100, 2), '%')

In [None]:
# Score on the held-out test split.
accuracy = xgb_3_classifier.score(x_test, y_test)
print('Testing Accuracy:', round(accuracy *100, 2), '%')

In [None]:
# Predict the held-out split with the tuned model.
xgb_3_predictions = xgb_3_classifier.predict(x_test) 
   
# print classification report 
print(metrics.classification_report(y_test, xgb_3_predictions))

In [None]:
# Confusion matrix on the held-out split, saved to cm_3.png.
# metrics.plot_confusion_matrix no longer exists in newer scikit-learn releases, so
# build the display from confusion_matrix + ConfusionMatrixDisplay instead.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Confusion Matrix')
cm = metrics.confusion_matrix(y_test, xgb_3_classifier.predict(x_test))
chart = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder['target'].classes_)
chart.plot(ax=ax, cmap="Blues", values_format='.0f')
chart.confusion_matrix;
plt.savefig('cm_3.png')

In [None]:
# ROC curve on the held-out split, saved to auc_3.png.
# metrics.plot_roc_curve no longer exists in newer scikit-learn releases, so compute
# the curve from the positive-class probabilities and draw it with RocCurveDisplay.
y_score = xgb_3_classifier.predict_proba(x_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=metrics.auc(fpr, tpr),
                        estimator_name=type(xgb_3_classifier).__name__).plot()
plt.savefig('auc_3.png')

# Predicting The Target in Test Data 

- ## Getting The Data Ready

In [None]:
# Load the unlabeled test split from the same input directory as train.csv. The
# previous absolute Google Drive path only resolved on one Colab session.
# NOTE(review): assumes test.csv sits next to train.csv — confirm the file location.
test = pd.read_csv('../input/hr-dataset/test.csv')
test.head()

In [None]:
# Column dtypes and non-null counts of the test data.
test.info()

In [None]:
# Missing-value count per column of the test data.
test.isnull().sum()

In [None]:
null_columns = ['gender','enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job' ]

# Replace missing values with each column's most frequent value. The result is
# assigned back to the column: test[i].fillna(..., inplace=True) is chained
# assignment, which pandas warns about and which no longer updates `test` under Copy-on-Write.
# NOTE(review): the mode is taken from the test data itself, not from the training
# data the model was fitted on — confirm this is intended.
for i in null_columns:
  test[i] = test[i].fillna(test[i].mode()[0])

In [None]:
# Preview the imputed test data.
test.head()

- ## Scaling The test data

In [None]:
# Drop the id and the two columns (gender, company_type) that are not in the model's feature list.
test.drop(['enrollee_id', 'gender', 'company_type'], axis=1, inplace=True)

In [None]:
encoding_columns = ['city', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience',
                    'company_size', 'last_new_job']

# Encode with the LabelEncoders already held in `encoder` (last fitted on df_4, the
# frame xgb_3_classifier was trained on) instead of re-fitting on the test set.
# Re-fitting gave the same category a different integer whenever the test set's
# categories differed from the training set's, so the model saw mismatched codes.
# A category the encoder never saw becomes NaN rather than raising an error.
for column in encoding_columns:
    codes = {label: code for code, label in enumerate(encoder[column].classes_)}
    test[column] = test[column].map(codes)

- ## Normalizing Test Data

In [None]:
def normalize_columns(column):
    """Min-max scale one column of `test` with the range used for the training frame.

    The model was trained on df_4, which was scaled with df_4's own min/max. Scaling
    `test` with the test set's min/max would map the same raw value to a different
    number, so the reference range is taken from the training side instead:

    * label-encoded columns (keys of `encoder`): 0 .. n_classes - 1 of the encoder
      currently stored for that column;
    * every other column: min/max of the same column in `data`, the frame df_4 was
      copied from.

    Parameters
    ----------
    column : str
        Name of the column in `test` to scale.

    Returns
    -------
    pandas.Series
        The scaled column, indexed like `test`.
    """
    if column in encoder:
        # LabelEncoder codes run from 0 to n_classes - 1.
        lower, upper = 0, len(encoder[column].classes_) - 1
    else:
        lower, upper = data[column].min(), data[column].max()
    return (test[column] - lower) / (upper - lower)

In [None]:
# Scale every remaining column of the test data, overwriting it column by column.
for column in test.columns:
    test[column] = normalize_columns(column)

In [None]:
# Preview the normalised test data.
test.head()

- ## Predicting Target

In [None]:
# Predict with the mode-imputation model and store the result as a new 'target' column.
# Re-running this cell would feed the added 'target' column back in as a feature.
test['target'] = xgb_3_classifier.predict(test.values)

In [None]:
# Share of each predicted class, shown as whole percentages.
test.target.value_counts().plot.pie(colors=color, autopct='%1.0f%%')