In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(rc={'figure.figsize': [10, 10]}, font_scale=1.2)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
df = pd.read_csv('../input/human-resources-data-set/HRDataset_v14.csv')
df.head(20)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Correlate the numeric columns only: the frame also holds text columns
# (names, dates, descriptions) and recent pandas versions raise when
# DataFrame.corr() meets non-numeric data instead of silently dropping it.
plt.figure(figsize=(20,15))
sns.heatmap(df.select_dtypes(include='number').corr(), cmap="YlGnBu", annot=True)

# Data exploration 

In [None]:
# List the distinct values of every text column, skipping names and dates
# whose values are not informative as categories.
skipped_columns = ('Employee_Name', 'DOB', 'DateofHire', 'DateofTermination', 'LastPerformanceReview_Date', 'ManagerName')
text_columns = [col for col in df.columns if df[col].dtype == object]
for c in text_columns:
    if c in skipped_columns:
        continue
    print(c, df[c].unique())

In [None]:
df['MarriedID'].value_counts()

In [None]:
# A cell only renders its last bare expression, so the ID counts were being
# computed and discarded. Display both tables explicitly.
display(df['MaritalStatusID'].value_counts(), df['MaritalDesc'].value_counts())

In [None]:
# A cell only renders its last bare expression, so the GenderID counts were
# being computed and discarded. Display both tables explicitly.
display(df['GenderID'].value_counts(), df['Sex'].value_counts())

In [None]:
df['EmpStatusID'].value_counts()

In [None]:
df['EmploymentStatus'].value_counts()

* EmpStatusID legend: 1, 2, 3 = Active; 4 = Terminated for Cause; 5 = Voluntarily Terminated

In [None]:
# Put the status code next to its label to read the mapping off the first rows.
status_columns = ['EmpStatusID', 'EmploymentStatus']
EmpStatus = df[status_columns]
EmpStatus.head(20)

In [None]:
# A cell only renders its last bare expression, so the DeptID counts were
# being computed and discarded. Display both tables explicitly.
display(df['DeptID'].value_counts(), df['Department'].value_counts())

In [None]:
# A cell only renders its last bare expression, so the PerfScoreID counts were
# being computed and discarded. Display both tables explicitly.
display(df['PerfScoreID'].value_counts(), df['PerformanceScore'].value_counts())

In [None]:
df['FromDiversityJobFairID'].value_counts()

In [None]:
# A cell only renders its last bare expression, so the PositionID counts were
# being computed and discarded. Display both tables explicitly.
display(df['PositionID'].value_counts(), df['Position'].value_counts())

In [None]:
# A cell only renders its last bare expression, so the ManagerID counts were
# being computed and discarded. Display both tables explicitly.
display(df['ManagerID'].value_counts(), df['ManagerName'].value_counts())

In [None]:
#df['State'].value_counts()    
#df['Zip'].value_counts()

In [None]:
df['DOB'].unique()

In [None]:
df['MaritalDesc'].value_counts()

In [None]:
df['CitizenDesc'].value_counts()

In [None]:
df['HispanicLatino'].value_counts()

In [None]:
df['RaceDesc'].value_counts()

In [None]:
df['DateofHire'].unique()

In [None]:
# A cell only renders its last bare expression, so the termination-date counts
# were being computed and discarded. Display both tables explicitly.
display(df['DateofTermination'].value_counts(), df['Termd'].value_counts())

In [None]:
df['TermReason'].value_counts()

In [None]:
df['RecruitmentSource'].value_counts()

In [None]:
df['EngagementSurvey'].unique()

In [None]:
df['EmpSatisfaction'].value_counts()

In [None]:
df['SpecialProjectsCount'].value_counts()

In [None]:
df['LastPerformanceReview_Date'].value_counts()

In [None]:
df['DaysLateLast30'].value_counts()

In [None]:
df['Absences'].unique()

# Feature Engineering

In [None]:
df.head(5)

In [None]:
from datetime import datetime

# Parse the hire and termination columns with the same month/day/year format,
# then show the two converted columns.
date_columns = ['DateofHire', 'DateofTermination']
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], format='%m/%d/%Y')
df[date_columns]

* We want to extract the length of experience for every employee, but some of them have been terminated.

# Data Visualization 

In [None]:
df.hist(bins=50,figsize=(10,10))

In [None]:
sns.pairplot(df)

In [None]:
df['PerformanceScore'].unique()

How is PerformanceScore distributed in the dataset ?

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='PerformanceScore', data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='PerformanceScore', data=df, hue = 'Sex', palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='PerformanceScore', data=df, hue = 'Department', palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='PerformanceScore', data=df, hue = 'Termd', palette='viridis')

* Social Data

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='Sex', data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='MaritalDesc', data=df, hue = 'Sex', palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='MaritalDesc', data=df, hue = 'Termd', palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='MaritalDesc', data=df, hue = 'CitizenDesc', palette='viridis')

In [None]:
# catplot is figure-level: it creates its own figure, so a preceding
# plt.figure(...) only produced an empty extra figure and its figsize was
# never applied. Size the grid through catplot's height/aspect if needed.
sns.catplot(kind='box', x='Salary', y='MaritalDesc', data=df, palette='viridis',  col='PerformanceScore', hue='Sex')

* Department Data

In [None]:
df['Department'].unique()

In [None]:
# Label lists and label-to-colour maps ("C0".."C5" are matplotlib colour-cycle
# aliases) for the performance-score and department categories.
perfs = ['Exceeds', 'Needs Improvement', 'PIP', 'Fully Meets']
# NOTE(review): 'Production       ' carries trailing spaces — presumably to
# match the raw values in the Department column; confirm before normalising.
dps = ['Production       ', 'IT/IS', 'Software Engineering',
       'Admin Offices', 'Executive Office', 'Sales']
palette1 ={"IT/IS": "C0", "Production       ": "C1", "Software Engineering": "C2", "Admin Offices":"C3", "Sales": "C4", "Executive Office":"C5"}
palette2 ={"Exceeds": "C0", "Needs Improvement": "C1", "PIP": "C2", "Fully Meets":"C3"}

The histogram below lets us see the conditional probability of Department given Performance Score. It is one way to grasp the overall performance of the company, but since the Production department is much larger than the other departments, we can do better by looking at the performance score given the department.

In [None]:
# One stacked bar per department, each drawn from that department's rows only
# so that the 'probability' normalisation is computed within the department.
fig, ax = plt.subplots(figsize=(15, 10))
for department in dps:
    department_rows = df[df['Department'] == department]
    sns.histplot(
        data=department_rows,
        x='Department',
        hue='PerformanceScore',
        multiple='stack',
        shrink=.9,
        stat='probability',
        palette=palette2,
        ax=ax,
    )

The Sales department needs to be watched, since it is the department that requires the most performance improvement plans (PIPs). These are set up to address failures to meet specific job goals or to correct behavior-related concerns.

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue= 'Termd', palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue= 'Sex',  palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue = 'MaritalDesc',  palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue = 'Termd',  palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue = 'PerformanceScore',  palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue = 'RaceDesc',  palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue = 'RecruitmentSource',  palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Department', data=df, hue = 'SpecialProjectsCount',  palette='viridis')

It's also interesting to see how salaries are distributed across individuals with different performance scores and the different departments they belong to:

In [None]:
# catplot is figure-level: it creates its own figure, so a preceding
# plt.figure(...) only produced an empty extra figure and its figsize was
# never applied. Size the grid through catplot's height/aspect if needed.
sns.catplot(kind='box', x='Salary', y='Department', data=df, palette='viridis',  col='PerformanceScore', hue='Sex')

In [None]:
sns.catplot(x='Department', y='Salary', hue='PerformanceScore', kind='box', data=df, height=10, aspect=3)

* Position Data

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='Position', data=df,  hue = 'Sex', palette='viridis')

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='Position', data=df,  hue = 'PerformanceScore', palette='viridis')

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='Position', data=df,  hue = 'RecruitmentSource', palette='viridis')

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='Position', data=df,  hue = 'Termd', palette='viridis')

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='Position', data=df,  hue = 'MaritalDesc', palette='viridis')

How many managers are there in the company ?

In [None]:
# Distinct manager names appearing in the ManagerName column.
manager_names = df['ManagerName'].unique()
print(len(manager_names), "unique managers are currently working in the company :", manager_names)

In [None]:
sns.catplot(y='ManagerID', x='PerformanceScore', kind='box', data=df, height=10, aspect=1)

Which managers supervise the most in the company ?

In [None]:
# Rank manager IDs by the number of performance scores recorded under each,
# largest first, and use that ranking as the bar order.
scores_per_manager = df.groupby('ManagerID')['PerformanceScore'].count()
ManagersIds = scores_per_manager.sort_values(ascending=False).index
sns.catplot(
    y='ManagerID',
    hue='PerformanceScore',
    kind='count',
    data=df,
    order=ManagersIds,
    height=10,
    aspect=1,
)

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x="ManagerID", y="SpecialProjectsCount", hue="Sex",data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x="ManagerID", y="Absences", hue="Sex",data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x="ManagerID", y="Salary", hue="Termd",data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x="ManagerID", y="Salary", hue="Sex",data=df, palette='viridis')

It's also interesting to see how salaries are distributed across individuals with different performance scores and the different managers supervising them:

In [None]:
sns.catplot(x='ManagerID', y='Salary', hue='PerformanceScore', kind='box', data=df, height=10, aspect=3)

What is the overall diversity profile of the organization ?

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='Sex', data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y=df['Department'], hue=df['Sex'])

In [None]:
sns.catplot(x='MaritalDesc', hue='Department', data=df, kind="count",height=7, aspect=1)

In [None]:
# catplot is figure-level and is already sized via height/aspect; the
# plt.figure(...) that preceded it only produced an empty extra figure.
sns.catplot(y='RaceDesc', data=df, kind='count', height=8, aspect=1)

In [None]:
palette1 ={"M ": "C0", "F": "C1"}
races = ['White', 'Black or African American', 'Two or more races', 'Asian', 'Hispanic', 'American Indian or Alaska Native']
palette2 ={'White':"C0", 'Black or African American':"C1", 'Two or more races':"C2", 'Asian':"C3", 'Hispanic':"C4", 'American Indian or Alaska Native':"C5"}
plt.figure(figsize=(15, 8))
for r in races:
    sns.histplot(x='RaceDesc', hue="Sex", multiple="stack", data=df[df['RaceDesc']==r], palette=palette1, stat='probability', shrink=.8)

In [None]:
plt.figure(figsize=(15, 8))
sns.histplot(x='Department', hue='RaceDesc', multiple='stack', data=df)

In [None]:
dps = ['Sales', 'IT/IS', 'Software Engineering',
       'Admin Offices', 'Executive Office','Production']
# Match departments on whitespace-stripped names. Elsewhere in this notebook
# the Production label is written with trailing spaces ('Production       '),
# so an exact comparison against 'Production' would select no rows and the
# department would silently be missing from the chart.
department_names = df['Department'].str.strip()
plt.figure(figsize=(15, 8))
for d in dps:
    # Each department is drawn from its own rows so that stat='probability'
    # is normalised within the department.
    sns.histplot(x='Department', hue='RaceDesc', palette=palette2, stat='probability', multiple='stack', shrink=0.9, data=df[department_names == d])

What are our best recruiting sources if we want to ensure a diverse organization ?

In [None]:
df['RecruitmentSource'].unique()

In [None]:
# Order the sources by the number of employee IDs recorded for each, largest first.
employees_per_source = df.groupby('RecruitmentSource')['EmpID'].count()
source_order = employees_per_source.sort_values(ascending=False).index
sns.catplot(y='RecruitmentSource', kind='count', order=source_order, data=df)

Are there areas of the company where pay is not equitable ?

distribution of salaries in the company

In [None]:
# sns.distplot is deprecated and scheduled for removal. histplot with a KDE
# overlay on a density scale is the replacement seaborn documents for it.
plt.figure(figsize=(10,5))
sns.histplot(df['Salary'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
sns.jointplot(x='SpecialProjectsCount', y='Salary', data=df, kind='reg', height=8, color='m')

In [None]:
sns.jointplot(x='Absences', y='Salary', data=df, kind='reg', height=8, color='m')

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x=df['Sex'], y=df['Salary']/1000, hue=df['MaritalDesc'])

In [None]:
sns.catplot(y='Department', x='Salary', col='Sex', data=df, kind="box", height = 10, aspect = 1)

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(y="RecruitmentSource", x="Salary", hue="Sex",data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x="PerformanceScore", y="Salary", hue="Sex",data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x="Termd", y="Salary", hue="Sex",data=df, palette='viridis')

In [None]:
df['HispanicLatino'].value_counts()

In [None]:
# Encode the yes/no answers as 1/0 by assigning a new column instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place
# update is deprecated in pandas and has no effect under Copy-on-Write.
yes_no_codes = {'yes': 1, 'no': 0}


def _encode_yes_no(value):
    """Return 1/0 for a yes/no string (any casing, surrounding spaces ignored).

    Any other value, including an already-encoded 0/1, is returned unchanged,
    so re-running the cell is harmless.
    """
    if isinstance(value, str):
        return yes_no_codes.get(value.strip().lower(), value)
    return value


df['HispanicLatino'] = df['HispanicLatino'].map(_encode_yes_no)

In [None]:
df['HispanicLatino'].value_counts()

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='HispanicLatino', data=df, palette='viridis')

In [None]:
sns.jointplot(x='HispanicLatino', y='Salary', data=df, kind='reg', height=8, color='m')

In [None]:
sns.boxplot(x="HispanicLatino", y="Salary", hue="Sex",data=df, palette='viridis')

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='HispanicLatino', data=df, hue = 'Termd', palette='viridis')

In [None]:
df.info()

 * Select the needed columns

In [None]:
# Columns carried into the modelling frame; 'Termd' is kept last.
selected_columns = [
    'MaritalDesc', 'Sex', 'EmploymentStatus', 'Department', 'PerformanceScore',
    'Position', 'CitizenDesc', 'HispanicLatino', 'RaceDesc', 'ManagerName',
    'RecruitmentSource', 'EmpSatisfaction', 'SpecialProjectsCount', 'Salary',
    'DaysLateLast30', 'Absences', 'Termd',
]
df_select = df[selected_columns]

In [None]:
df_select.head()

In [None]:
df_select.isnull().sum()

In [None]:
df_select.columns[:-5]

In [None]:
# Columns kept as plain numbers; every other selected column is one-hot encoded.
# Naming them replaces the positional slice columns[:-5], which silently encodes
# the wrong columns as soon as the selection is reordered or changed.
numeric_cols = ['SpecialProjectsCount', 'Salary', 'DaysLateLast30', 'Absences', 'Termd']
categorical_cols = [col for col in df_select.columns if col not in numeric_cols]
df_dummies = pd.get_dummies(df_select, columns=categorical_cols, drop_first=True)
df_dummies

In [None]:
df_dummies.info()

In [None]:
# Target: Termd. Features: everything else except the EmploymentStatus dummies.
# EmploymentStatus records whether the employee is still active or has been
# terminated, i.e. it restates the target; keeping it lets every model read
# the answer straight from the features (target leakage).
leaky_cols = [col for col in df_dummies.columns if col.startswith('EmploymentStatus')]
x = df_dummies.drop(columns=['Termd'] + leaky_cols)
y = df_dummies['Termd']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=22)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training split with the *seeded* SMOTE instance. Previously
# the seeded object was created but a second, unseeded SMOTE() did the
# resampling, so the synthetic rows changed on every run.
sm = SMOTE(random_state=22)
oversample = sm  # keep the old name bound for any later cell that uses it
x_train, y_train = sm.fit_resample(x_train, y_train)

In [None]:
y_train

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(x_train)

In [None]:
# Apply the already-fitted scaler to the training split and then the test split.
x_train, x_test = (scaler.transform(split) for split in (x_train, x_test))

In [None]:
# Seed every estimator that has internal randomness so that a fresh
# Restart & Run All reproduces the same scores (22 matches the seed used for
# the train/test split earlier in the notebook).
seed = 22

# Display name -> untrained classifier. Names are left-padded so the printed
# report lines up.
models = {
    "   K-Nearest Neighbors": KNeighborsClassifier(),
    "   Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "         Decision Tree": DecisionTreeClassifier(random_state=seed),
    "        Neural Network": MLPClassifier(random_state=seed),
    "         Random Forest": RandomForestClassifier(n_estimators=500, random_state=seed),
    "         XGBClassifier": XGBClassifier(n_estimators=700, random_state=seed)
}

# Fit each classifier on the training split.
for name, model in models.items():
    model.fit(x_train, y_train)
    print(name + " trained.")

In [None]:
# Report accuracy, confusion matrix and per-class metrics on the test split.
for name, model in models.items():
    # Predict once and reuse the predictions for every metric; model.score()
    # would run a second, redundant prediction pass to compute the same accuracy.
    y_pred = model.predict(x_test)
    print(name + " Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    

In [None]:
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.feature_selection import SelectKBest

In [None]:
selector = SelectKBest(k=10, score_func=f_classif)

In [None]:
selector.fit(x_train, y_train)

In [None]:
selector.get_support(indices=True)

In [None]:
x.columns[selector.get_support(indices=True)]

In [None]:
# Table of the selected features and their univariate scores.
# The 'Score' column previously held get_support(indices=True), i.e. the
# column *positions* of the selected features, so the chart ranked features
# by where they sit in the frame. Use the fitted selector's scores_ instead.
selected_idx = selector.get_support(indices=True)
Select_columns = pd.DataFrame({'Important_Feature': x.columns[selected_idx],
                               'Score': selector.scores_[selected_idx]})
# Highest-scoring feature first, so the bar chart reads top-down by importance.
Select_columns = Select_columns.sort_values('Score', ascending=False).reset_index(drop=True)

In [None]:
plt.figure(figsize=(10,5))
sns.barplot(x='Score', y='Important_Feature', data=Select_columns)