# HR Analytics

---
**1. Importing the necessary libraries**

In [None]:
# Silence every Python warning for the rest of the session (added by the author
# to hide scikit-learn's warning chatter). NOTE: this is a blanket filter, so
# warnings raised by any other library are hidden as well.
import warnings

warnings.filterwarnings(action='ignore')

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'darkgrid' style for every figure drawn below.
sns.set_style('darkgrid')
# Colormap reused by the confusion-matrix heatmap at the end of the notebook.
cmap = sns.cm.mako_r

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

**2. Importing the data with pandas `read_csv()`, then calling `head()` and `info()` on the DataFrame**

In [None]:
# Load the training CSV into a DataFrame; the relative path assumes the dataset
# is mounted under ../input/ (TODO confirm for your environment).
aug_train = pd.read_csv(
    filepath_or_buffer='../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
)

In [None]:
# Preview the first five rows of the raw data.
aug_train.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes to spot missing data early.
aug_train.info()

---
**3. Calling isna() and checking the total number of Null Data**

In [None]:
# Number of missing values in each column.
aug_train.isna().sum(axis=0)

As we can see, there are a lot of missing values, and most of them are in categorical columns. It is not feasible to fill those values, as we may end up feeding completely wrong data to the training set, so I prefer dropping the affected rows instead.

In [None]:
# Keep only the rows that have a value in every column.
aug_train = aug_train.dropna(axis=0, how='any')

**4. As most of the data is categorical, we plot bar and pie charts to visualize it**

In [None]:
# Function to Plot Bar Graph
def plot_bar_graph(column, order=None):
    """Draw a bar chart of category frequencies for one column of ``aug_train``.

    The chart is drawn on the current matplotlib axes; the caller is expected
    to create the figure beforehand (if a custom size is wanted) and to call
    ``plt.show()`` afterwards.

    Parameters
    ----------
    column : str
        Name of the column in the global ``aug_train`` DataFrame to count.
    order : list of str, optional
        Explicit left-to-right order of the categories on the x-axis.
        When omitted, seaborn chooses the order itself.
    """
    # Count once and reuse the result for both axes.
    counts = aug_train[column].value_counts()
    # seaborn >= 0.12 accepts the data vectors as keyword arguments only; the
    # old positional form ``sns.barplot(x, y, ...)`` raises a TypeError there.
    ax = sns.barplot(x=counts.index, y=counts.values, order=order)
    # Label the axes so the figure can be read on its own.
    ax.set(xlabel=column, ylabel='count')
     
# Function to Plot Pie Chart
def plot_pie_chart(column, title=''):
    """Draw a pie chart of category shares for one column of ``aug_train``.

    A new 6x6 inch figure is opened for every call; the caller is expected to
    call ``plt.show()`` afterwards.

    Parameters
    ----------
    column : str
        Name of the column in the global ``aug_train`` DataFrame to count.
    title : str, optional
        Title displayed above the pie chart (empty by default).
    """
    frequency = aug_train[column].value_counts()
    plt.figure(figsize=(6, 6))
    plt.title(title)
    # Each wedge is annotated with its percentage, two decimals.
    plt.pie(x=frequency.values, labels=frequency.index, autopct='%1.2f%%')

1. Pie Chart for Gender

In [None]:
# Share of each gender category among the candidates.
plot_pie_chart(column='gender', title='Pie Chart For Gender')
plt.show()

As we can see, most of the candidates are male.

---
2. Pie Chart for Relevant Experience

In [None]:
# Share of candidates in each relevent_experience category.
plot_pie_chart(column='relevent_experience', title='Pie Chart For Relevent Experience')
plt.show()

---
3. Enrolled University

In [None]:
# Share of candidates in each enrolled_university category.
plot_pie_chart(column='enrolled_university', title='Pie Chart For Enrolled University')
plt.show()

---
4. Education Level

In [None]:
# Share of candidates at each education level.
plot_pie_chart(column='education_level', title='Pie Chart For Education Level')
plt.show()

---
5. Major Discipline

In [None]:
# Number of candidates per major discipline.
plot_bar_graph(column='major_discipline')
plt.show()

Most of the candidates are from STEM; that is, their major discipline was one of the following:<br>
<ol>
    <li><span style='color:red'>S</span>cience</li>
    <li><span style='color:red'>T</span>echnology</li>
    <li><span style='color:red'>E</span>ngineering</li>
    <li><span style='color:red'>M</span>athematics</li>
</ol>

---
6. Company Size

In [None]:
# Company-size buckets listed from smallest to largest so the bars read left
# to right. The '10/49' spelling is assumed to match the raw data — TODO confirm.
order = [
    '<10', '10/49', '50-99', '100-500',
    '500-999', '1000-4999', '5000-9999', '10000+',
]

plt.figure(figsize=(8, 4))
plot_bar_graph(column='company_size', order=order)
plt.show()

---
7. Company Type

In [None]:
# Number of candidates per company type, on a wider figure.
plt.figure(figsize=(9, 5))
plot_bar_graph(column='company_type')
plt.show()

---
8. How many Jobs Candidates Worked before joining the current company

In [None]:
# Fix the category order so the bars run from 'never' up to '>4'.
order = ['never', '1', '2', '3', '4', '>4']

plt.figure(figsize=(9, 5))
plot_bar_graph(column='last_new_job', order=order)
plt.show()

---
9. Let's plot a histogram to see the distribution of the training hours

In [None]:
# Histogram of training hours with a kernel-density estimate drawn on top.
plt.figure(figsize=(12, 6))
sns.histplot(data=aug_train, x='training_hours', kde=True)
plt.title('Distribution of Training Hours')
# Render the figure explicitly, as the other plotting cells do; without this
# the cell also echoes the returned Axes object as text.
plt.show()

The graph reveals that most of the candidates train for 20 to 50 hours.

---
10. Before moving on to the prediction part, we will check the relationship between a few features and training hours, as people looking for a job change tend to train for longer hours.

<ol>
    <li><span style='color:red'>Relevant Experience</span> vs <span style='color:blue'>Training Hours</span></li>
    <li><span style='color:red'>Education Level</span> vs <span style='color:blue'>Training Hours</span></li>
    <li><span style='color:red'>Major Discipline</span> vs <span style='color:blue'>Training Hours</span></li>
    <li><span style='color:red'>Company Size</span> vs <span style='color:blue'>Training Hours</span></li>
    <li><span style='color:red'>Company Type</span> vs <span style='color:blue'>Training Hours</span></li>
    <li><span style='color:red'>Last New Job</span> vs <span style='color:blue'>Training Hours</span></li>
</ol>

In [None]:
# Categorical features compared against training hours, one panel each.
categorical_features = [
    'relevent_experience', 'education_level',
    'major_discipline', 'company_size',
    'company_type', 'last_new_job',
]

figures, axes = plt.subplots(3, 2, figsize=(16, 14))

# axes.flat walks the 3x2 grid row by row, so the panels appear in the same
# positions as the list order above.
for ax, feature in zip(axes.flat, categorical_features):
    # x and y must be passed by keyword: seaborn >= 0.12 treats the first
    # positional argument as ``data``, so the old positional x/y form combined
    # with ``data=`` raises a TypeError.
    sns.scatterplot(x=feature, y='training_hours', hue='target',
                    data=aug_train, ax=ax)

# Keep tick labels of neighbouring panels from overlapping.
figures.tight_layout()
plt.show()

---
**5. Let's get to the prediction part**

1. Data Preprocessing Phase

<ul>
    <li>Using pandas get_dummies() to convert the Categorical Data to Numerical Features</li>
    <li>Then Removing the Unnecessary Columns</li>
</ul>

In [None]:
# Categorical columns that are one-hot encoded for the model.
list_of_columns = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level',
                   'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

# Encode every categorical column in a single call. Each dummy column is
# prefixed with its source column (e.g. ``gender_<value>``), so a category
# label that appears in more than one feature no longer yields duplicate
# column names (last_new_job contains '1'..'4'; experience presumably does
# too — verify against the data).
# The remaining columns stay in front and the dummy blocks follow in the
# order of ``list_of_columns``, matching the previous layout.
# enrollee_id is dropped before encoding and is therefore not used as a
# model feature.
# NOTE: this cell rebinds ``aug_train``; run it once per kernel session,
# because a second run no longer finds the original columns.
aug_train = pd.get_dummies(
    aug_train.drop(columns=['enrollee_id']),
    columns=list_of_columns,
)

2. Split the data into training (70%) and testing (30%) sets, train the model, and check its accuracy

In [None]:
# Drop any rows that still contain NaN once, then separate features and label.
model_frame = aug_train.dropna()
X = model_frame.drop(columns=['target']).values
y = model_frame['target'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=80
)

# Standardise the features, then fit a sigmoid-kernel SVM on the training part.
pipeline = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))
pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)

print(f'Accuracy of the Model is {accuracy_score(y_test, prediction)}')

3. Using a seaborn heatmap, we will plot the confusion matrix

In [None]:
# Rows of the matrix are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_test, prediction)
# fmt='d' prints the cell counts as plain integers; the default annotation
# format abbreviates large counts.
sns.heatmap(conf_matrix, linewidths=1, cmap=cmap, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
# plt.show() renders the figure; the previous plt.plot() call drew nothing
# and only echoed an empty list.
plt.show()

---
# Thank You.! Upvote if you like it.