In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the IBM HR attrition dataset, using the employee number as the row index.
csv_path = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(csv_path, index_col='EmployeeNumber')

In [None]:
# Peek at the first rows to see what the columns and values look like.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Visual check for missing values: the heatmap colours each cell of
# df.isna(), so any NaN would stand out against the rest.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isna(), cmap='viridis', cbar=False, yticklabels=False, ax=ax);

In [None]:
# No null values are present, as seen in the heatmap above.

In [None]:
# Print how many distinct values each column holds, one column per entry.
separator = "======================================="
for column in df.columns:
    print(f"{column}:{df[column].nunique()}")
    print(separator)

In [None]:
# Drop columns that are presumably constant across employees (confirm against
# the nunique listing printed earlier). Reassigning instead of using
# inplace=True, together with errors='ignore', keeps this cell safe to re-run:
# the original raised KeyError the second time because the columns were gone.
df = df.drop(columns=['Over18', 'StandardHours', 'EmployeeCount'], errors='ignore')

In [None]:
# Encode the target as 1 (employee left) / 0 (employee stayed).
# The dtype guard makes the cell idempotent: mapping an already-encoded
# column a second time would silently turn every value into NaN.
attrition_codes = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = df['Attrition'].map(attrition_codes)

In [None]:
# Collect the object-dtype columns with at most 50 distinct values; these are
# treated as categorical features in the rest of the notebook.
categorical_col = [
    name
    for name in df.columns
    if df[name].dtype == object and df[name].nunique() <= 50
]
print(categorical_col)

In [None]:
# Show the frequency of every level of each categorical column.
divider = "======================================="
for feature in categorical_col:
    level_counts = df[feature].value_counts()
    print(f"{feature}:\n{level_counts}")
    print(divider)

In [None]:
# List the current column names.
df.columns

In [None]:
# Class balance of the target: number of employees per Attrition value.
sns.countplot(x='Attrition',data=df)

In [None]:
# Attrition counts broken down by PerformanceRating.
sns.countplot(x='Attrition',hue='PerformanceRating',data=df)

In [None]:
# Attrition counts broken down by JobInvolvement.
sns.countplot(x='Attrition',hue='JobInvolvement',data=df)

In [None]:
# Relationship between Age and MonthlyIncome, one point per employee.
sns.scatterplot(x='Age',y='MonthlyIncome',data=df)

In [None]:
# Bivariate density of Age against MonthlyIncome.
# Current seaborn releases accept only `data` positionally and replaced
# `shade=` with `fill=`, so the variables are passed by keyword.
sns.kdeplot(data=df, x='Age', y='MonthlyIncome', fill=True, cbar=True)

In [None]:
# Correlation matrix of the numeric columns.
# The string-valued categorical columns are still present at this point, and
# pandas >= 2.0 raises on them instead of silently skipping them, so the frame
# is restricted to numeric columns before calling corr().
plt.figure(figsize=(18,12))
sns.heatmap(df.select_dtypes(include='number').corr(),cmap='RdYlGn',annot=True,fmt='.2f')

1. The correlation of each feature with itself is 1, as expected.

2. JobLevel is highly correlated with Age, as expected: older employees generally tend to occupy higher positions in the company.

3. PerformanceRating is highly correlated with PercentSalaryHike, which is unsurprising.

4. MonthlyIncome is very strongly correlated with JobLevel, as expected: senior employees will definitely earn more.

5. YearsAtCompany is correlated with YearsInCurrentRole.

6. TotalWorkingYears is also highly correlated with JobLevel, which is expected, as senior employees must have worked for a longer span of time.

7. YearsWithCurrManager is highly correlated with YearsAtCompany.

In [None]:
# Rank the numeric features by their correlation with Attrition.
# Non-numeric columns are excluded first because pandas >= 2.0 raises on
# string columns in corr() instead of dropping them.
df.select_dtypes(include='number').corr()['Attrition'].sort_values(ascending=False)

In [None]:
# One bar chart per categorical column, laid out on a 3x3 grid. The bar height
# is the mean of the 0/1 Attrition column for each level of the feature.
sns.set(font_scale=2)
plt.figure(figsize=(30, 30))
for position, feature in enumerate(categorical_col, start=1):
    plt.subplot(3, 3, position)
    sns.barplot(x=feature, y='Attrition', data=df)
    plt.xticks(rotation=90)
plt.tight_layout()


In [None]:
# Restore the default font scale (the large grid above set it to 2).
sns.set(font_scale=1)
# Distribution of MonthlyIncome for each JobRole.
sns.boxplot(x='JobRole',y='MonthlyIncome',data=df)
# Rotate the tick labels to vertical; the trailing ';' hides the text repr.
plt.xticks(rotation=90);

In [None]:
# Distribution of MonthlyIncome for each EducationField.
sns.boxplot(x='EducationField',y='MonthlyIncome',data=df)
# Rotate the tick labels to vertical; the trailing ';' hides the text repr.
plt.xticks(rotation=90);

In [None]:
# MonthlyIncome per EducationField as split violins: with hue='Attrition' and
# split=True each half of a violin shows one of the two Attrition values.
sns.violinplot(x='EducationField',y='MonthlyIncome',data=df,hue='Attrition',color='Yellow',split=True)
# Anchor the legend to the right of the axes (x=1.2 in axes coordinates).
plt.legend(bbox_to_anchor=(1.2,0.65))
plt.xticks(rotation=45);

In [None]:
# Wide figure, presumably so each TotalWorkingYears value gets a readable bar.
plt.subplots(figsize=(15,5))
# Number of employees for each value of TotalWorkingYears.
sns.countplot(x='TotalWorkingYears',data=df)

In [None]:
# Share of employees in each EducationField, as a pie chart with percentages.
field_counts = df['EducationField'].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(field_counts, labels=field_counts.index, autopct='%.2f%%');

In [None]:
# Absolute counts behind the EducationField pie chart.
df['EducationField'].value_counts()

In [None]:
# Mean tenure, salary-hike and work-life figures for each JobRole.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names was deprecated and then removed in pandas 2.0.
df.groupby(by='JobRole')[["PercentSalaryHike","YearsAtCompany","TotalWorkingYears","YearsInCurrentRole","WorkLifeBalance"]].mean()

In [None]:
# Share of employees in each JobRole, as a titled pie chart with percentages.
role_counts = df['JobRole'].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(role_counts, labels=role_counts.index, autopct='%.2f%%');
plt.title('Job Role Distribution', fontdict={'fontsize': 22});

In [None]:
# Number of employees at each Age, drawn on a wide figure.
plt.figure(figsize=(14,5))
sns.countplot(x='Age',data=df)

In [None]:
# Mean MonthlyIncome per Education level, split by Attrition.
sns.barplot(x='Education',y='MonthlyIncome',hue='Attrition',data=df)
# Anchor the legend to the right of the axes (x=1.2 in axes coordinates).
plt.legend(bbox_to_anchor=(1.2,0.6))

In [None]:
# Mean DistanceFromHome per JobRole, split by Attrition.
# dodge=False draws the hue levels at the same x position rather than side by
# side; alpha=0.4 presumably keeps the overlapping bars distinguishable.
sns.barplot(y='DistanceFromHome',x='JobRole',hue='Attrition',data=df,dodge=False,alpha=0.4,palette='twilight')
plt.xticks(rotation=90);
plt.legend(bbox_to_anchor=(1.2,0.6));

ENCODING THE CATEGORICAL COLUMNS.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used below to turn the categorical columns into integer codes.
le= LabelEncoder()

In [None]:
# Integer-encode every categorical column of df in place.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# codes can later be mapped back with label_encoders[col].inverse_transform().
# A single shared encoder only remembers the classes of the last column fitted.
label_encoders = {}
for col in categorical_col:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Work on a copy so the modelling steps below do not modify df.
data= df.copy()

In [None]:
# Separate the target column from the predictors.
target_name = 'Attrition'
y = data[target_name]
X = data.drop(columns=target_name)

In [None]:
# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so the metrics reported below are
# reproducible between runs; stratify=y keeps the proportion of attrition
# cases the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

USING THE DECISION TREE METHOD FOR PREDICTION.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline decision tree with default hyperparameters.
# random_state is fixed so that refitting gives the same tree on every run.
model = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the tree on the training partition only.
model.fit(X_train,y_train)

In [None]:
# Predict Attrition for the held-out test rows.
pred= model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Per-class precision, recall and F1 of the tree on the test partition.
print(classification_report(y_test,pred))

THE ACCURACY USING THE DECISION TREE IS 76%, AND THE CONFUSION MATRIX COMES OUT AS FOLLOWS.

In [None]:
# Confusion matrix of true labels (y_test) against the tree's predictions.
print(confusion_matrix(y_test,pred))

NOW WE WILL TUNE THE HYPERPARAMETERS OF THE DECISION TREE USING RANDOMIZED SEARCH CROSS-VALIDATION
TO IMPROVE THE ACCURACY OF THE MODEL.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for each decision-tree hyperparameter; the randomized
# search below samples combinations from these.
params = dict(
    criterion=("gini", "entropy"),
    splitter=("best", "random"),
    max_depth=list(range(1, 20)),
    min_samples_split=[2, 3, 4],
    min_samples_leaf=list(range(1, 20)),
)

In [None]:
# Randomized search over `params`: 100 sampled combinations, each scored with
# 3-fold cross-validation, using all available cores.
# random_state fixes which combinations are sampled, so the selected
# hyperparameters are reproducible from run to run.
tree_random = RandomizedSearchCV(
    model, params, n_iter=100, n_jobs=-1, cv=3, verbose=2, random_state=42
)

In [None]:
# Run the search on the training partition only; the test set stays unseen.
tree_random.fit(X_train,y_train)

In [None]:
# Show the best estimator found by the search.
tree_random.best_estimator_

In [None]:
# Rebuild the tree from the hyperparameters the search actually selected.
# The previous hand-copied estimator repr passed `min_impurity_split` and
# `presort`, which current scikit-learn releases no longer accept (TypeError),
# and a pasted copy can drift from what the search found on a given run.
# random_state keeps the refit deterministic, which matters in particular when
# the selected splitter is 'random'.
model = DecisionTreeClassifier(random_state=42, **tree_random.best_params_)

In [None]:
# Refit the tuned tree on the training partition and predict the test rows.
model.fit(X_train,y_train)
pred=model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the tuned tree on the test partition.
print(classification_report(y_test,pred))

WE CAN SEE THAT THE ACCURACY OF THE DECISION TREE HAS IMPROVED TO 83% USING THE RANDOMIZED SEARCH CV METHOD, AND THE CONFUSION MATRIX IS AS FOLLOWS.

In [None]:
# Confusion matrix of true labels against the tuned tree's predictions.
print(confusion_matrix(y_test,pred))

NOW TRYING TO MAKE A MODEL USING RANDOM FOREST CLASSIFIER

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 trees.
# random_state is fixed so the bootstrap samples and feature draws, and
# therefore the reported scores, are the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit the forest on the training partition only.
rfc.fit(X_train,y_train)

In [None]:
# Predict Attrition for the held-out test rows with the forest.
rfc_pred= rfc.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the forest on the test partition.
print(classification_report(y_test,rfc_pred))

In [None]:
# Confusion matrix of true labels against the forest's predictions.
print(confusion_matrix(y_test,rfc_pred))

Accuracy of this model is 85%.