# HR Analytics

A company active in Big Data and Data Science wants to hire data scientists from among the people who successfully complete courses conducted by the company. Many people sign up for the training. The company wants to know which of these candidates really want to work for the company after training and which are looking for new employment elsewhere, because this helps reduce cost and time and improves the quality of training, course planning, and the categorization of candidates. Information on demographics, education, and experience is available from each candidate's signup and enrollment.

![](https://www.digitalvidya.com/wp-content/uploads/2019/05/HR-Analytics.jpg)
### Features

* enrollee_id : Unique ID for candidate

* city: City code

* city_development_index: Development index of the city (scaled)

* gender: Gender of candidate

* relevent_experience: Relevant experience of candidate

* enrolled_university: Type of University course enrolled if any

* education_level: Education level of candidate

* major_discipline: Education major discipline of candidate

* experience: Candidate total experience in years

* company_size: No of employees in current employer's company

* company_type : Type of current employer

* last_new_job: Difference in years between previous job and current job

* training_hours: training hours completed

* target: 0 – Not looking for job change, 1 – Looking for a job change

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)

In [None]:
# Load the training split (aug_train.csv) from the Kaggle input directory.
train = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')

In [None]:
# Column dtypes and non-null counts of the raw training data.
train.info()

In [None]:
# Peek at the first rows of the training data.
train.head()

In [None]:
# Number of missing values in each column.
train.isnull().sum()

### Converting objects into categories

In [None]:
# Cast the listed text columns to pandas' categorical dtype, one at a time.
for _column in ('gender', 'enrolled_university', 'relevent_experience',
                'education_level', 'major_discipline'):
    train[_column] = train[_column].astype('category')

### filling NaN values with relevant values

In [None]:
# Impute missing values.
# enrolled_university, education_level, major_discipline, experience and
# last_new_job each get one constant fill value; company_type, company_size
# and gender are filled by sampling from a hand-picked probability table.

# Seeded generator: the sampled fills (and every plot built on them) are
# identical from run to run instead of changing on each execution.
rng = np.random.default_rng(39)


def _sampled_fill(values, probabilities):
    """Return one random draw from *values* per row of ``train``.

    The draws follow *probabilities* and are returned as a Series carrying
    ``train``'s own index, so ``fillna`` aligns each draw with its row even
    when the frame does not have a default RangeIndex.
    """
    draws = rng.choice(values, p=probabilities, size=len(train))
    return pd.Series(draws, index=train.index)


train.enrolled_university = train.enrolled_university.fillna('no_enrollment')

train.education_level = train.education_level.fillna('Graduate')

train.major_discipline = train.major_discipline.fillna('STEM')

train.experience = train.experience.fillna('>20')

train.last_new_job = train.last_new_job.fillna('1')

train.company_type = train.company_type.fillna(
    _sampled_fill(['Pvt Ltd', 'Funded Startup', 'Public Sector'],
                  [0.7, 0.2, 0.1]))

train.company_size = train.company_size.fillna(
    _sampled_fill(['50-99', '100-500', '10000+', '10/49', '1000-4999', '<10'],
                  [0.3, 0.25, 0.2, 0.15, 0.05, 0.05]))

train.gender = train.gender.fillna(
    _sampled_fill(['Male', 'Female', 'Other'], [0.9, 0.09, 0.01]))

In [None]:
# Re-check dtypes and non-null counts after the category casts and imputation.
train.info()

# Data Visualization

In [None]:
# City development index per enrollment type, split by target.
sb.boxplot(
    data=train,
    x="enrolled_university",
    y="city_development_index",
    hue="target",
    palette="autumn",
)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1), borderaxespad=0.)
plt.show()

In [None]:
# Filled KDE of training hours, one band per last_new_job value.
kde_options = dict(kind="kde", multiple="fill", clip=(0, None))
sb.displot(
    data=train,
    x="training_hours",
    hue="last_new_job",
    height=6,
    palette='gist_earth',
    **kde_options,
)
plt.title('')
plt.show()

In [None]:
# Stacked histogram of training hours on a log scale, coloured by relevant experience.
plt.figure(figsize=(15, 5))
bar_style = dict(palette="rocket_r", edgecolor=".3", linewidth=.5)
sb.histplot(
    train,
    x="training_hours",
    hue="relevent_experience",
    multiple="stack",
    log_scale=True,
    **bar_style,
)
plt.show()

In [None]:
# City development index per experience bucket, split by target:
# raw observations as a strip plot with the per-group means drawn on top.
f, ax = plt.subplots()
sb.despine(bottom=True, left=True)

# Show each observation with a scatterplot
sb.stripplot(x="city_development_index", y="experience", hue="target",
             data=train, dodge=True, alpha=.25, zorder=1)

# Show the conditional means of the SAME variable pair. The overlay must
# share the strip plot's y variable, otherwise two unrelated categorical
# axes are drawn on top of each other.
sb.pointplot(x="city_development_index", y="experience", hue="target",
             data=train, dodge=.532, join=False, palette="dark",
             markers="d", scale=.75, ci=None)

# Improve the legend: both plots register one entry per hue level, so skip
# the strip plot's entries and keep only the point plot's.
# NOTE(review): assumes the strip-plot handles come first, as in the seaborn
# example this cell is modelled on — confirm with the installed seaborn version.
n_levels = train["target"].nunique()
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[n_levels:], labels[n_levels:], title="target",
          handletextpad=0, columnspacing=1, ncol=n_levels, frameon=True,
          bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.)
plt.show()

In [None]:
# Training hours per education level, split by target.
plt.figure(figsize=(15, 5))
sb.barplot(
    data=train,
    x='education_level',
    y='training_hours',
    hue='target',
    palette='hot',
)
plt.show()

In [None]:
# Individual training-hour observations per company size, coloured by target.
plt.figure(figsize=(15, 8))
sb.stripplot(
    data=train,
    x="training_hours",
    y="company_size",
    hue="target",
)
plt.show()

In [None]:
# Sunburst hierarchy, from the innermost ring outwards.
sunburst_path = ['gender', 'last_new_job', 'relevent_experience', 'target']
fig = px.sunburst(train, path=sunburst_path, color='last_new_job')
fig.show()

In [None]:
# Area chart of city development index against training hours,
# coloured by target with one line group per relevant-experience value.
fig = px.area(
    train,
    x="training_hours",
    y="city_development_index",
    color="target",
    line_group="relevent_experience",
)
fig.show()

In [None]:
# Parallel-categories diagram of the training data, coloured by target.
parallel_dimensions = [
    'relevent_experience',
    'gender',
    'education_level',
    'enrolled_university',
    'company_type',
    'target',
]
fig = px.parallel_categories(train, dimensions=parallel_dimensions, color="target")
fig.show()

## Model building

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import *

In [None]:
# Work on a copy so the integer encoding applied to model_data leaves `train` untouched.
model_data = train.copy()

In [None]:
# Feature columns handed to the classifiers.
x = [
    'city_development_index',
    'relevent_experience',
    'education_level',
]
# Label column, kept as a one-element list so model_data[y] is a DataFrame.
y = ['target']

In [None]:
# Replace the two categorical features with integer codes; the list of
# unique labels returned alongside the codes is discarded.
for _feature in ('relevent_experience', 'education_level'):
    model_data[_feature], _ = pd.factorize(model_data[_feature])

In [None]:
# Pairwise correlation of the numeric columns only. model_data still holds
# text columns, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently dropping it.
model_data.select_dtypes(include='number').corr()

In [None]:
# Hold out 30% of the rows for evaluation; the seed keeps the split fixed.
features = model_data[x]
labels = model_data[y]
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=39,
)

## Random Forest Classifier

In [None]:
# Seed the forest with the same value as the train/test split so the accuracy
# reported for this notebook can be reproduced run to run.
ran = RandomForestClassifier(random_state=39)

In [None]:
# y_train is a one-column DataFrame; flatten it to 1-D before fitting.
ran_labels = np.ravel(y_train, order='C')
ran.fit(x_train, ran_labels)

In [None]:
# Random-forest predictions for the held-out rows.
ran_pred = ran.predict(x_test)

In [None]:
# Fraction of held-out rows the random forest labelled correctly.
ran_accuracy = accuracy_score(y_test, ran_pred)
print(ran_accuracy)

In [None]:
# Text report of the random forest's per-class metrics on the held-out rows.
print(classification_report(y_test,ran_pred))

### Random forest predicts with 78% accuracy

## Logistic Regression

In [None]:
# Logistic-regression baseline with scikit-learn's default settings.
log = LogisticRegression()

In [None]:
# y_train is a one-column DataFrame; flatten it to 1-D before fitting.
log_labels = np.ravel(y_train, order='C')
log.fit(x_train, log_labels)

In [None]:
# Logistic-regression predictions for the held-out rows.
log_pred = log.predict(x_test)

In [None]:
# Fraction of held-out rows the logistic regression labelled correctly.
log_accuracy = accuracy_score(y_test, log_pred)
print(log_accuracy)

In [None]:
# Text report of the logistic regression's per-class metrics on the held-out rows.
print(classification_report(y_test,log_pred))

### Logistic regression predicts with 77% accuracy

## The model finally predicts the data based on user input

In [None]:
# Classify one hand-entered candidate: city_development_index 0.20,
# relevent_experience code 0, education_level code 4.
# The row is wrapped in a DataFrame that reuses the training columns so the
# model receives named features in the order it was fitted on, rather than a
# bare nested list (which scikit-learn warns about for models fitted on a
# DataFrame).
sample = pd.DataFrame([[0.20, 0, 4]], columns=x_train.columns)
if ran.predict(sample)[0] == 1:
    print('1 – Looking for a job change')
else:
    print('0 – Not looking for job change')

## Predicting target variable for test.csv

In [None]:
# Load the unlabeled test split (aug_test.csv) from the Kaggle input directory.
test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')

In [None]:
# Keep the candidate id plus the three model features.
kept_columns = [
    'enrollee_id',
    'city_development_index',
    'relevent_experience',
    'education_level',
]
test = test[kept_columns]

In [None]:
# Encode the test features with the SAME integer codes the classifiers were
# trained on. pd.factorize numbers labels by order of first appearance, so
# factorizing the test file on its own can hand out different codes for the
# same label. The label order is therefore re-derived from the (already
# imputed) training frame and applied to the test frame; a label never seen
# in training becomes -1.
_, relevent_experience_labels = pd.factorize(train.relevent_experience)
_, education_level_labels = pd.factorize(train.education_level)

test = test.assign(
    relevent_experience=pd.Categorical(
        test.relevent_experience,
        categories=list(relevent_experience_labels),
    ).codes,
    # Missing education levels get the same 'Graduate' fill used for training.
    education_level=pd.Categorical(
        test.education_level.fillna('Graduate'),
        categories=list(education_level_labels),
    ).codes,
)

In [None]:
# Peek at the encoded test rows.
test.head()

In [None]:
# Score the test candidates with the random forest (feature columns only).
submission_features = test[
    ['city_development_index', 'relevent_experience', 'education_level']
]
final_predictions = ran.predict(submission_features)

In [None]:
# Show the predicted target values for the test candidates.
print(final_predictions)

In [None]:
# Submission table: one row per test candidate with its predicted target.
submission_columns = {
    'enrollee_id': test.enrollee_id,
    'target': final_predictions,
}
output = pd.DataFrame(submission_columns)

In [None]:
# Peek at the first rows of the submission table.
output.head()

In [None]:
# Write the submission file without the DataFrame index column.
output.to_csv(path_or_buf='my_submission.csv', index=False)
print("Your submission was successfully saved!")

## If you like, an upvote would be deeply appreciated. Thanks! :)