# Employee attrition analysis on the IBM HR Analytics dataset

# Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression

# Reading data

In [None]:
# Load the employee-attrition CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path is presumably the Kaggle
# dataset mount -- adjust it when running the notebook elsewhere.
data_hr = pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Data Summary

In [None]:
# Preview the first rows to check the column names and value formats.
data_hr.head()

In [None]:
# Total DistanceFromHome within each BusinessTravel category.
# NOTE(review): a sum grows with the number of rows in each group; a mean may
# compare the groups more fairly -- confirm the intent.
data_hr.groupby('BusinessTravel')['DistanceFromHome'].sum()

In [None]:
# Total DistanceFromHome within each Department.
# NOTE(review): a sum grows with the number of rows in each group; a mean may
# compare the groups more fairly -- confirm the intent.
data_hr.groupby('Department')['DistanceFromHome'].sum()

In [None]:
# Total DistanceFromHome within each EducationField.
# NOTE(review): a sum grows with the number of rows in each group; a mean may
# compare the groups more fairly -- confirm the intent.
data_hr.groupby('EducationField')['DistanceFromHome'].sum()


# One Hot Encoding

In [None]:
# Expand each categorical feature into one indicator column per level.
# Columns not listed here are passed through unchanged by get_dummies.
data_hr1 = pd.get_dummies(
    data_hr,
    columns=[
        'JobRole',
        'MaritalStatus',
        'Over18',
        'OverTime',
        'Gender',
        'BusinessTravel',
        'Department',
        'EducationField',
    ],
)

# Summary statistics of the encoded frame (last expression, shown as cell output).
data_hr1.describe()

In [None]:
# List the column names after encoding, to see which dummy columns were created.
data_hr1.columns

# Drop one column for each one hot encoding

In [None]:
# Remove one indicator column per encoded feature so the remaining dummies of
# that feature are not perfectly collinear. No Over18 indicator is dropped here.
data_hr1 = data_hr1.drop(
    columns=[
        'JobRole_Healthcare Representative',
        'MaritalStatus_Divorced',
        'OverTime_No',
        'Gender_Male',
        'BusinessTravel_Non-Travel',
        'Department_Human Resources',
        'EducationField_Human Resources',
    ]
)

# Checking Data for Null values

In [None]:
# Count missing values per column; any non-zero entry needs handling before modelling.
data_hr1.isnull().sum()

In [None]:
# Keep the final column index in a variable and display it as the cell output.
Column_names = data_hr1.columns
Column_names 

# Train and Test Split

In [None]:
# Feature matrix: an explicit list of model inputs taken from the encoded
# frame. 'Attrition' is not in the list because it is the prediction target.
# NOTE(review): EmployeeNumber looks like a row identifier, and EmployeeCount,
# StandardHours and Over18_Y look like constants -- confirm and consider
# removing them from the feature set.
X = data_hr1[[
    'Age', 'DailyRate', 'DistanceFromHome', 'Education',
    'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction',
    'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
    'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
    'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
    'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
    'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'JobRole_Human Resources', 'JobRole_Laboratory Technician',
    'JobRole_Manager', 'JobRole_Manufacturing Director',
    'JobRole_Research Director', 'JobRole_Research Scientist',
    'JobRole_Sales Executive', 'JobRole_Sales Representative',
    'MaritalStatus_Married', 'MaritalStatus_Single', 'Over18_Y',
    'OverTime_Yes', 'Gender_Female', 'BusinessTravel_Travel_Frequently',
    'BusinessTravel_Travel_Rarely', 'Department_Research & Development',
    'Department_Sales', 'EducationField_Life Sciences',
    'EducationField_Marketing', 'EducationField_Medical',
    'EducationField_Other', 'EducationField_Technical Degree',
]]

# Encode the target as 0/1. Series.map is used instead of DataFrame.replace
# because replace's implicit object -> int downcast is deprecated in recent
# pandas, which would leave an object-dtype label the estimators cannot use.
# Any value other than 'Yes'/'No' becomes NaN rather than passing through.
# The result stays a one-column DataFrame because later cells index it as
# Y_test['Attrition'].
y = data_hr1['Attrition'].map({'Yes': 1, 'No': 0}).to_frame()

# random_state pins the shuffle so the split is reproducible; the split size
# is left at train_test_split's default.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=0)

# Gradient Boost Method

In [None]:
# Fit a gradient-boosted regressor on the 0/1 attrition label. Its continuous
# output is treated as a score and compared against 0.5 in the next cell.
# NOTE(review): a classifier is the more usual choice for a binary target --
# confirm that a regressor is intended here.
model = XGBRegressor()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Side-by-side table of the true labels and the raw predictions for the test rows.
ActVPred = pd.DataFrame({'Actual': Y_test['Attrition'], 'Predicted': Y_pred})
print(ActVPred)

# Error metrics of the XGBoost predictions on the test set.
# The MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(Y_test['Attrition'], Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test['Attrition'], Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Counting Correct prediction

In [None]:
# Flag each test row with 1 when the prediction lies on the same side of the
# 0.5 threshold as the true label, and 0 otherwise:
#   - Actual < 1  (negative) is correct when Predicted < 0.5
#   - otherwise   (positive) is correct when Predicted >= 0.5
# Done as one vectorised boolean expression instead of a row-by-row loop.
actual_is_negative = ActVPred['Actual'] < 1
predicted = ActVPred['Predicted']
is_correct = (
    (actual_is_negative & (predicted < 0.5))
    | (~actual_is_negative & (predicted >= 0.5))
)

# Count_row holds the per-row 1/0 flags and index the number of rows scored;
# both are read by the accuracy cell that follows.
Count_row = is_correct.astype(int).tolist()
index = len(Count_row)



# Calculating Accuracy

In [None]:
print('--------------------------------------------------------------------------')
print('XGBoost:')
# Count_row and index come from predictions on X_test, so this ratio is the
# test-set accuracy and is labelled as such (same wording as the
# logistic-regression test line, so the two reports can be compared directly).
print('Test Model accruracy scores: {:.3f}'.format(Count_row.count(1)/index))

# Logistic regression

In [None]:
# Logistic regression with a large C (C is the inverse regularisation
# strength, so the penalty is weak) and a generous iteration budget.
# NOTE(review): the features are not scaled anywhere in this notebook, which is
# presumably why max_iter is so high -- confirm.
log_reg = LogisticRegression(C=1000, max_iter=50000)

# Pass the target as a 1-D Series. Y_train is a one-column DataFrame, and
# fitting on it makes scikit-learn emit a DataConversionWarning; the test
# score below already uses the Series form.
log_reg.fit(X_train, Y_train['Attrition'])


print('--------------------------------------------------------------------------')
print('Logistic Regression:')
print('Traning Model accruracy scores: {:.3f}'.format(log_reg.score(X_train, Y_train['Attrition'])))
print('Test Model accruracy scores: {:.3f}'.format(log_reg.score(X_test, Y_test['Attrition'])))
print('--------------------------------------------------------------------------')

# From the above two calculations we can see that Logistic Regression has greater accuracy than XGBoost