# Importing the dataset

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import numpy as np


# Loading the data

In [None]:
# Read the attrition CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

The dataset consists of 1470 rows and 35 columns.

In [None]:
# Encode the target column: 'Yes' -> 1, 'No' -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run; without them a
# second execution would map the already-encoded values to NaN.
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})


In [None]:
# Names of all columns in the frame.
df.columns

Use `info()` to inspect the columns and their data types. As you can see, there are int and object columns.

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Dividing the features into numerical and categorical.

In [None]:
# Split the frame into numeric and categorical (object) columns.
# 'number' matches every numeric dtype (int32, float32, ...), not only int64/float64,
# so no numeric column is silently left out of both groups.
# Note: Attrition has already been encoded as integers above, so it lands in num_features.
# Both frames are snapshots of df at this point; later changes to df are not reflected in them.
num_features = df.select_dtypes(include='number')
cat_features = df.select_dtypes(include=['O'])

In [None]:
# Names of the numeric columns.
num_features.columns

# Checking null values using heatmap

In [None]:
# Heatmap of the boolean null mask: any missing cell would show up as a contrasting mark.
sns.heatmap(df.isnull(),yticklabels=False, cbar=False)

As you can see, there are no null values in the dataset.

In [None]:
import matplotlib.gridspec as gridspec
def numerical_analysis(feature,data):
    """Draw a box plot of one column of ``data`` grouped by its ``Attrition`` column.

    Parameters
    ----------
    feature : str
        Name of the column plotted on the y-axis.
    data : pandas.DataFrame
        Frame that contains both ``feature`` and ``Attrition``.

    Returns
    -------
    None
        The plot is drawn on a newly created figure.
    """
    fig = plt.figure(constrained_layout=True)
    # Single-cell grid: the figure holds exactly one box plot.
    grid = gridspec.GridSpec(ncols=1, nrows=1, figure=fig)
    ax = fig.add_subplot(grid[0, 0])
    # Plot from the `data` argument rather than the notebook-level df, so the helper
    # works for whichever frame the caller passes in.
    sns.boxplot(x='Attrition', y=feature, data=data, ax=ax)
    

Visualizing the numerical columns 

In [None]:
# One histogram per numeric column, drawn on a single large figure.
df.hist(figsize = (20,20))
plt.show()

As you can see above, there are continuous and discrete variables.
Continuous variables --> Age, DailyRate, YearsWithCurrManager, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion and so on.
Discrete variables --> Education, EmployeeCount, JobSatisfaction, PerformanceRating, StockOptionLevel and so on.

In [None]:
# Names of the categorical (object-typed) columns.
cat_features.columns

# Visualizing target column attrition

In [None]:
# Number of rows per Attrition class.
sns.countplot(x='Attrition',data=df)

As you can see, the data is imbalanced.

# Visualizing categorical columns

In [None]:
# Row counts per marital status, split by Attrition class.
sns.countplot(x='MaritalStatus',hue='Attrition',data=df)

From MaritalStatus, we can see that single and married employees have more attrition than divorced employees.

In [None]:
# Row counts per business-travel category, split by Attrition class.
sns.countplot(x='BusinessTravel',hue='Attrition',data=df)

From BusinessTravel, employees who travel rarely show the highest number of attrition cases (note that the plot shows counts, not rates).

In [None]:
# Row counts per department, split by Attrition class.
sns.countplot(x='Department',hue='Attrition',data=df)

From Department, employees in Research and Development show the most attrition, while Human Resources shows very little attrition.

In [None]:
# Row counts per gender, split by Attrition class.
sns.countplot(x='Gender',hue='Attrition',data=df)

From Gender, male employees show more attrition than female employees.

In [None]:
# Wide figure so that the job-role labels have room on the x-axis.
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(x='JobRole', hue='Attrition', data=df, ax=ax)

From the employees' job roles, we find that sales executives and research scientists have more attrition.

In [None]:
# Row counts per Over18 value, split by Attrition class.
sns.countplot(x='Over18',hue='Attrition',data=df)

In [None]:
# Only a cell's last expression is displayed automatically, so the class counts
# are printed explicitly before the plot is drawn.
print(df['PerformanceRating'].value_counts())
# Row counts per performance rating, split by Attrition class.
sns.countplot(x='PerformanceRating',hue='Attrition',data=df)

Employees with a performance rating of 3 or 4 show more attrition.

In [None]:
# Row counts per relationship-satisfaction level, split by Attrition class.
sns.countplot(x='RelationshipSatisfaction',hue='Attrition',data=df)

Employees with high or very high relationship satisfaction still show a considerable amount of attrition.

In [None]:
# Row counts per job level, split by Attrition class.
sns.countplot(x='JobLevel',hue='Attrition',data=df)

# Finding outliers using box plots

In [None]:
# One box plot per numeric column, grouped by Attrition.
for column in num_features.columns:
    if column == 'Attrition':
        # The target itself is numeric once encoded; plotting it against itself
        # carries no information, so it is skipped.
        continue
    numerical_analysis(column, df)

From the box plots above, we see that columns such as Age, JobLevel, MonthlyIncome, TotalWorkingYears, YearsAtCompany and so on have outliers.

# Encoding categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder
label= LabelEncoder()

# Columns replaced by integer codes. The single encoder is re-fitted for every column,
# so after the loop `label` only remembers the classes of the last column in the list.
COLUMNS_TO_ENCODE = [
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
]
for column in COLUMNS_TO_ENCODE:
    df[column] = label.fit_transform(df[column])

# Remove columns that are not used for modelling (presumably identifier / constant
# columns -- TODO confirm). errors='ignore' keeps the cell re-runnable: a second
# execution no longer raises KeyError for columns that are already gone.
COLUMNS_TO_DROP = ["EmployeeNumber", "Over18", "EmployeeCount", "StandardHours"]
df = df.drop(columns=COLUMNS_TO_DROP, errors="ignore")

# Finding outliers using z-score

In [None]:
from scipy.stats import zscore

# Absolute z-score of every cell. This runs over all columns currently in df, which
# includes the label-encoded categorical columns and the Attrition target.
z_score = np.abs(zscore(df))
print("The shape of dataset before removing outliers",df.shape)
# Keep a row only when each of its columns lies within 3 standard deviations of the
# column mean. df is rebound here, so re-running this cell filters the data again.
rows_within_limit = (z_score < 3).all(axis=1)
df = df.loc[rows_within_limit]
print("The shape of dataset after removing outliers",df.shape)

# Finding skewness of data

In [None]:
# Skewness of every column of the filtered frame; the bare name on the last line displays it.
skewness=df.skew()
skewness

Some columns have a skewness greater than 0.5, so we will transform them using a log transform.

# Feature Importance

In [None]:
# Separate the target column from the predictor columns.
Y = df['Attrition']
X = df.drop(columns=['Attrition'])
type(X)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fix the seed: the tree's split search is randomized, so without random_state the fitted
# tree -- and the feature importances read from it -- can differ between runs.
dc = DecisionTreeClassifier(random_state=42)
dc.fit(X, Y)

In [None]:
# Pair each importance score of the fitted tree with its column name,
# then draw the ten largest scores as a horizontal bar chart.
feat_importances = pd.Series(dc.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(10)
top_features.plot.barh()
plt.show()