In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:
# Load the IBM HR attrition CSV into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')



In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# List the column labels available for the analysis below.
df.columns


In [None]:
# Summary statistics with pandas' default column selection.
df.describe()


In [None]:
# Summary statistics restricted to object-dtype ('O') columns.
df.describe(include=['O'])

In [None]:
# Print pandas' concise summary of the frame.
df.info()

In [None]:

# Missing-value matrix of `df`. `msno` is imported in the notebook's top
# import cell so that all dependencies are declared in one place.
msno.matrix(df)

In [None]:
# missingno bar chart of `df`, drawn in blue on a 15x8 figure.
msno.bar(df, figsize=(15, 8), color="b")

In [None]:
# Pairwise correlations of the numeric columns only. Selecting the numeric
# columns explicitly keeps this working on pandas >= 2.0, where `df.corr()`
# no longer silently drops string columns and raises instead.
df.select_dtypes(include="number").corr()

In [None]:
# Number of rows per Attrition value.
df['Attrition'].value_counts()


In [None]:
# Count plot of the target column. The column is passed by keyword (as the
# other count plots in this notebook do) because recent seaborn releases
# treat the first positional argument as `data`, not `x`.
sns.countplot(x=df["Attrition"])
plt.title("Attrition", fontsize=15)
plt.show()

In [None]:
# Bar chart of row counts per BusinessTravel value (NaN kept as its own bar).
# The former `value_counts(normalize=True)` statement was removed: its result
# was neither displayed nor stored, so it had no effect.
fig, ax = plt.subplots(figsize=(15, 5))
df['BusinessTravel'].value_counts(dropna=False).plot.bar(
    ax=ax,  # draw on the axes created above instead of the implicit current one
    color=['black', 'red', 'cyan'],
)
plt.show()

In [None]:
# Bar chart of row counts per Department value (NaN kept as its own bar).
# The former `value_counts(normalize=True)` statement was removed: its result
# was neither displayed nor stored, so it had no effect.
fig, ax = plt.subplots(figsize=(15, 5))
df['Department'].value_counts(dropna=False).plot.bar(
    ax=ax,  # draw on the axes created above instead of the implicit current one
    color=['black', 'red', 'blue'],
)
plt.show()

In [None]:
# Four vertically stacked count plots, one per feature, each split by Attrition.
plt.figure(figsize=[20, 20])

for position, feature in enumerate(
    ["JobSatisfaction", "MaritalStatus", "JobRole", "JobLevel"], start=1
):
    # 4 rows x 1 column grid; `position` selects the row being drawn.
    plt.subplot(4, 1, position)
    sns.countplot(x=df[feature], hue=df["Attrition"])

plt.show()

In [None]:
# Histogram of DailyRate, at most 80 bins, on an 800x500 canvas.
fig = px.histogram(data_frame=df, x="DailyRate", nbins=80,
                   title='DailyRate', width=800, height=500)
fig.show()

In [None]:
# Histogram of MonthlyIncome, at most 80 bins, on an 800x500 canvas.
fig = px.histogram(data_frame=df, x="MonthlyIncome", nbins=80,
                   title='MonthlyIncome', width=800, height=500)
fig.show()

In [None]:
# Violin plot of DistanceFromHome; `x0` is the label under the single violin.
distance_violin = go.Violin(x0='DistanceFromHome', y=df['DistanceFromHome'])
fig = go.Figure(data=distance_violin)
fig.show()

In [None]:
# Violin plot of Education drawn in red; `x0` is the label under the violin.
education_violin = go.Violin(
    x0='Education',
    y=df['Education'],
    marker=dict(color='red'),
)
fig = go.Figure(data=education_violin)
fig.show()

In [None]:

# Vertical bar chart keyed on EducationField, drawn in fuchsia, 600 px wide.
fig = px.bar(
    data_frame=df,
    x='EducationField',
    orientation='v',
    title='EducationField',
    width=600,
    color_discrete_sequence=["fuchsia"],
)
fig.show()

In [None]:
# Grid of 50-bin histograms produced by DataFrame.hist on a 25x15 figure.
df.hist(figsize=(25, 15), bins=50)
plt.show()

In [None]:
# Box plots for a handful of columns, one trace per column.
#
# A box plot summarises a numeric distribution, so columns that are not
# numeric at this point in the notebook are skipped: feeding string labels
# into go.Box alongside numeric traces garbles the shared y axis.
BOX_COLUMNS = [
    "JobRole",
    "JobSatisfaction",
    "MaritalStatus",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
]

data = [
    go.Box(name=column, y=df[column])
    for column in BOX_COLUMNS
    if pd.api.types.is_numeric_dtype(df[column])
]
plotly.offline.iplot(data)

In [None]:
# Pairwise scatter plots across four columns.
scatter_dimensions = [
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
]
fig = px.scatter_matrix(df, dimensions=scatter_dimensions)
fig.show()

In [None]:
# Pie chart with one slice per distinct PercentSalaryHike value.
fig = px.pie(
    data_frame=df,
    names="PercentSalaryHike",
    title="PercentSalaryHike ",
)
fig.show()

In [None]:
# Pie chart with one slice per distinct PerformanceRating value.
fig = px.pie(
    data_frame=df,
    names="PerformanceRating",
    title="PerformanceRating ",
)
fig.show()

In [None]:
# Pie chart with one slice per distinct StockOptionLevel value (Set2 palette).
fig = px.pie(
    data_frame=df,
    names="StockOptionLevel",
    title="StockOptionLevel ",
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig.show()

In [None]:
# Pie chart with one slice per distinct YearsSinceLastPromotion value (Set2 palette).
fig = px.pie(
    data_frame=df,
    names="YearsSinceLastPromotion",
    title="YearsSinceLastPromotion ",
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig.show()

In [None]:
# Distribution of MonthlyIncome, with side-by-side bars per Attrition value.
#
# The earlier px.bar(..., y="Attrition") put the Attrition *labels* on the
# value axis, so bar length carried no count information, and
# barmode='group' had nothing to group without a `color` split. A histogram
# counts rows per bin and `color` produces one bar series per Attrition value.
fig = px.histogram(
    df,
    x="MonthlyIncome",
    color="Attrition",
    barmode='group',
    height=600,
)
fig.show()

In [None]:
# Row counts per JobRole, with side-by-side bars per Attrition value.
#
# The earlier px.bar(..., y="Attrition") put the Attrition *labels* on the
# value axis, so bar height carried no count information, and
# barmode='group' had nothing to group without a `color` split. A histogram
# counts rows per category and `color` produces one bar series per value.
fig = px.histogram(
    df,
    x="JobRole",
    color="Attrition",
    barmode='group',
    height=600,
)
fig.show()

In [None]:
# Distribution of YearsSinceLastPromotion, with side-by-side bars per
# Attrition value.
#
# The earlier px.bar(..., y="Attrition") put the Attrition *labels* on the
# value axis, so bar height carried no count information, and
# barmode='group' had nothing to group without a `color` split. A histogram
# counts rows per bin and `color` produces one bar series per Attrition value.
fig = px.histogram(
    df,
    x="YearsSinceLastPromotion",
    color="Attrition",
    barmode='group',
    height=600,
)
fig.show()

In [None]:
# Row counts per JobRole, with side-by-side bars per Gender value.
#
# The earlier px.bar(..., y="Gender") put the Gender *labels* on the value
# axis, so bar height carried no count information, and barmode='group' had
# nothing to group without a `color` split. A histogram counts rows per
# category and `color` produces one bar series per Gender value.
fig = px.histogram(
    df,
    x="JobRole",
    color="Gender",
    barmode='group',
    height=600,
)
fig.show()

In [None]:
# Row counts per EducationField, with side-by-side bars per Gender value.
#
# The earlier px.bar(..., y="Gender") put the Gender *labels* on the value
# axis, so bar height carried no count information, and barmode='group' had
# nothing to group without a `color` split. A histogram counts rows per
# category and `color` produces one bar series per Gender value.
fig = px.histogram(
    df,
    x="EducationField",
    color="Gender",
    barmode='group',
    height=600,
)
fig.show()

In [None]:
# Correlation map of the numeric columns.
# - Non-numeric columns are excluded explicitly, because pandas >= 2.0 raises
#   on them instead of silently dropping them.
# - The cell separator width goes through seaborn's own `linewidths` parameter
#   as a number; the previous `linewidth=".5"` was a string passed under an
#   alias that collides with the `linewidths` value seaborn already forwards.
numeric_corr = df.select_dtypes(include="number").corr()

f, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(
    numeric_corr,
    annot=True,
    linewidths=0.5,
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,  # draw on the 30x30 figure created above
)
plt.show()

In [None]:
# Remove columns that are not used as model features
# (presumably constant or identifier-like columns - confirm against df.describe()).
# Each label is listed once, and errors="ignore" keeps the cell re-runnable:
# a second execution finds the columns already gone instead of raising KeyError.
df = df.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    errors="ignore",
)

In [None]:
# Integer codes for every categorical column, keyed by column name.
CATEGORY_CODES = {
    "Attrition": {'Yes': 1, 'No': 0},
    "BusinessTravel": {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
    "Department": {'Sales': 0, 'Research & Development': 1, 'Human Resources': 2},
    "Gender": {'Female': 0, 'Male': 1},
    "MaritalStatus": {'Single': 0, 'Married': 1, 'Divorced': 2},
    "OverTime": {'No': 0, 'Yes': 1},
    "EducationField": {
        'Life Sciences': 0, 'Medical': 1, 'Marketing': 2,
        'Technical Degree': 3, 'Human Resources': 4, 'Other': 5,
    },
    "JobRole": {
        'Sales Executive': 0, 'Research Scientist': 1, 'Laboratory Technician': 2,
        'Manufacturing Director': 3, 'Healthcare Representative': 4, 'Manager': 5,
        'Sales Representative': 6, 'Research Director': 7, 'Human Resources': 8,
    },
}

# Assign each encoded column back to `df` explicitly. The previous
# `df.<col>.replace(..., inplace=True)` is chained assignment: it mutates a
# Series obtained from attribute access, which no longer updates `df` under
# pandas Copy-on-Write. Values without an entry in the mapping are left
# untouched, so re-running the cell on already-encoded data is a no-op.
for column, codes in CATEGORY_CODES.items():
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

In [None]:


# Feature matrix: every column except the target; target vector: Attrition.
target_column = "Attrition"
X = df.drop(target_column, axis="columns")
y = df[target_column]



In [None]:
# Preview the first rows of `df` at this point in the notebook.
df.head()

In [None]:
# Split the data: 67% train / 33% test, shuffled with a fixed seed so the
# split is reproducible. `train_test_split` is imported in the notebook's top
# import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44, shuffle =True)

# Sanity check (replaces the commented-out shape prints): the two partitions
# together must account for every row, and features/targets must stay aligned.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

In [None]:
# Fit a gradient-boosting classifier on the training split.
# `GradientBoostingClassifier` is imported in the notebook's top import cell.
GBCModel = GradientBoostingClassifier(n_estimators=100,max_depth=3,random_state=33) 
GBCModel.fit(X_train, y_train)

# Calculating Details: `score` is the estimator's default metric on each split.
print('GBCModel Train Score is : ' , GBCModel.score(X_train, y_train))
print('GBCModel Test Score is : ' , GBCModel.score(X_test, y_test))
print('----------------------------------------------------')

# Calculating Prediction: hard labels and per-class probabilities for the
# test split; only the first ten labels are printed.
y_pred = GBCModel.predict(X_test)
y_pred_prob = GBCModel.predict_proba(X_test)
print('Predicted Value for GBCModel is : ' , y_pred[:10])


In [None]:
# Calculating Confusion Matrix (rows: true labels, columns: predicted labels).
# `confusion_matrix`, `sns` and `plt` come from the notebook's top import cell.
CM = confusion_matrix(y_test, y_pred)
print('Confusion Matrix is : \n', CM)

# Drawing the confusion matrix.
# `center` was previously passed as True, which seaborn reads as the number 1
# and uses as the colormap midpoint; it is dropped so the colours span the
# actual count range. Counts are annotated so the figure is readable alone.
cm_axes = sns.heatmap(CM, annot=True, fmt="d", cmap='PuBu')
cm_axes.set(xlabel="Predicted label", ylabel="True label")
plt.show()