## Importing Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score

## Feature dictionary

In [None]:
# Load the data-dictionary spreadsheet shipped alongside the dataset.
cols = pd.read_excel(r'../input/employees-attrition-analysis/data_dictionary.xlsx')

In [None]:
# Display the data dictionary as the cell output.
cols

## Loading the dataset

In [None]:
# Load the full dataset from CSV into the working frame `df`.
df = pd.read_csv(r'../input/employees-attrition-analysis/whole data.csv')

In [None]:
# Display the frame as the cell output.
df

## Checking for missing values

In [None]:
# Boolean mask of missing cells, drawn as a heatmap (no row labels, no colour bar).
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

## Null values are very few, so we can drop those rows without materially affecting the dataset

In [None]:
# Drop the rows that contain missing values. The surviving rows keep their
# original index labels, so the index may have gaps after this step.
df = df.dropna()

In [None]:
# Per-column count of missing values remaining after the drop.
df.isnull().sum()

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

## Label encoding: convert categorical (string) values into integer codes

In [None]:
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()

# Keep the fitted encoder of every encoded column so the integer codes can be
# mapped back to the original labels later (encoder.classes_ / inverse_transform).
label_encoders = {}

for i in df.columns:
    # Inspect the first row *by position*. After dropna() the row labelled 0 may
    # no longer exist, so the label-based lookup df[i][0] can raise KeyError.
    if len(df) and isinstance(df[i].iloc[0], str):
        # A fresh encoder per column: each column has its own label -> code mapping.
        encoder = preprocessing.LabelEncoder()
        df[i] = encoder.fit_transform(df[i])
        label_encoders[i] = encoder

In [None]:
# Display the frame after label encoding.
df

## The data is imbalanced

In [None]:
# Number of rows per Attrition class.
df.Attrition.value_counts()

In [None]:
# Target vector is the Attrition column; the feature matrix is everything else.
y = df['Attrition']
X = df.drop(columns=['Attrition'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [None]:
# Baseline classifier with default hyper-parameters.
lr = LogisticRegression()


In [None]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is reproducible.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=.2, random_state = 4589)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training fold only, then apply the same fitted
# transformation to both the training and the test fold.
Scaler_X = StandardScaler().fit(x_train)
x_train, x_test = Scaler_X.transform(x_train), Scaler_X.transform(x_test)

In [None]:
# Fit on the scaled training fold, then show the score on that same training data.
lr.fit(x_train, y_train)
lr.score(x_train, y_train)

In [None]:
# Predictions for the held-out test fold.
pred = lr.predict(x_test)

In [None]:
# Accuracy on the test fold.
accuracy_score(y_test, pred)

In [None]:
# F1 score on the test fold.
f1_score(y_test, pred)

In [None]:
# Recall on the test fold.
recall_score(y_test,pred)

## As the data is imbalanced, accuracy might not be the best metric for measuring performance.
## If we look at the F1 score and recall instead, they are very low.

## Trying undersampling


In [None]:
df.reset_index(inplace=True)
li = list(df[df.Attrition == 0].sample(n=2910).index)
df = df.drop(df.index[li])

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=.2, random_state = 489)

In [None]:
from sklearn.preprocessing import StandardScaler

# New scaler fitted on the current training fold; the fitted transformation is
# then applied to the training and the test fold alike.
Scaler_X = StandardScaler().fit(x_train)
x_train, x_test = Scaler_X.transform(x_train), Scaler_X.transform(x_test)

In [None]:
# Refit the existing `lr` estimator on the current training fold and show its
# score on that same training data.
lr.fit(x_train, y_train)
lr.score(x_train, y_train)

In [None]:
# Predictions for the current test fold.
y_pred = lr.predict(x_test)

In [None]:
# Confusion matrix of true vs. predicted labels on the test fold.
print(metrics.confusion_matrix(y_test, y_pred))

In [None]:
# Estimator score on the test fold.
lr.score(x_test, y_test)

In [None]:
# Accuracy on the test fold.
accuracy_score(y_test, y_pred)

In [None]:
# Recall on the test fold.
recall_score(y_test,y_pred)

In [None]:
# F1 score on the test fold.
f1_score(y_test,y_pred)


## After undersampling the F1 score and recall score dropped. So undersampling is not the best option.

## Using SMOTE: Synthetic Minority Oversampling Technique
## SMOTE uses a nearest neighbors algorithm to generate new and synthetic data we can use for training our model.

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
Scaler_X = StandardScaler()
scaled_X = Scaler_X.fit_transform(X)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.25, random_state=27)

sm = SMOTE(random_state=27, sampling_strategy='auto')
X_train, y_train = sm.fit_sample(X_train, y_train)

In [None]:
# Train a liblinear logistic regression on the SMOTE-resampled training fold,
# then predict the untouched test fold.
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
smote_pred = lr.predict(X_test)

In [None]:
# Accuracy on the test fold.
accuracy_score(y_test, smote_pred)

In [None]:
# F1 score on the test fold.
f1_score(y_test, smote_pred)

In [None]:
# Recall on the test fold.
recall_score(y_test, smote_pred)

## Creating intercept and coefficient table to see how features are related to target.

In [None]:
# One row per feature with its fitted logistic-regression coefficient.
feature_names = X.columns.values
summary_table = pd.DataFrame(columns = ['Feature_names'], data = feature_names)
# coef_ is 2-D; flatten it so exactly one value is assigned per feature row.
summary_table['coeff']= np.ravel(lr.coef_)

# Shift the feature rows to labels 1..n so that label 0 is free for the intercept.
summary_table.index = summary_table.index +1
# .loc[0] ADDS a new row labelled 0. The previous .iloc[0] addressed the first
# row by position and therefore overwrote the first feature with the intercept.
summary_table.loc[0]= ['Intercept', lr.intercept_[0]]

# Sorted by label, so the intercept row comes first; displayed as the cell output.
summary_table.sort_index()

### We can say that SMOTE is best for this imbalanced data set.
### Now we need to select relevant features and see if we can increase the accuracy more or not!

In [None]:
# Pairwise correlations between the columns of the dataset.
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(40,40))
# Plot the matrix computed above directly instead of recomputing the same
# correlations a second time through df[top_corr_features].corr().
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
# Print the value distribution of each of these three columns.
for constant_col in ('Over18', 'StandardHours', 'EmployeeCount'):
    print(df[constant_col].value_counts())

In [None]:
#Dropping them as they are not relevant
# 'index' is included because an earlier reset_index(inplace=True) can leave the
# old row labels behind as a column, which would otherwise be used as a feature.
# NOTE(review): assumes the raw data has no genuine column named 'index'.
# errors='ignore' together with rebinding (instead of inplace=True) keeps this
# cell safe to re-run once the columns are already gone.
df = df.drop(columns=['StandardHours','EmployeeCount','EmployeeID','Over18','index'], errors='ignore')

In [None]:
# Rebuild target and features from the reduced frame.
y = df['Attrition']
X = df.drop(columns=['Attrition'])

In [None]:
#Calculating VIF

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Design matrix with a constant column added to the features.
X_vif=add_constant(X)

# One VIF value per column of the design matrix, looked up by column position.
design_values = X_vif.values
vif_values = []
for col_idx in range(X_vif.shape[1]):
    vif_values.append(variance_inflation_factor(design_values, col_idx))

# Label each VIF with its column name; shown as the cell output.
pd.Series(vif_values, index=X_vif.columns)

In [None]:
# Pairwise correlations between the remaining columns of the dataset.
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the matrix computed above directly instead of recomputing the same
# correlations a second time through df[top_corr_features].corr().
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

# Columns we should drop:



### 1) Job involvement: because its coefficient value is near 0 that means it does not have major effect on Attrition
### 2) Age: as it has high correlation with many features such as total working years, years at company and it does not affect Attrition that much.
### 3) Business travel: the coefficient table shows that this feature has approximately zero effect on Attrition
### 4) Performance Rating: as it is highly correlated to percent salary hike and has less significance, VIF is also high
### 5) Years At company: it is correlated with years with current manager and age hence dropping it.
### 6) Stock Option level: approximately zero effect on Attrition
### 7) Distance from home: approximately zero effect on Attrition
### 8) Education: approximately zero effect on Attrition
### 9) Gender: approximately zero effect on Attrition
### 10) Department: approximately zero effect on Attrition

In [None]:
# First batch of features removed from the model inputs.
low_value_features = [
    'JobInvolvement',
    'Age',
    'BusinessTravel',
    'PerformanceRating',
    'YearsAtCompany',
    'DistanceFromHome',
    'StockOptionLevel',
]
X = X.drop(columns=low_value_features)

In [None]:
# Second batch of features removed from the model inputs.
# NOTE(review): 'JobRole' is dropped here although it is not in the markdown list above.
more_low_value_features = ['Education', 'Gender', 'JobRole', 'Department']
X = X.drop(columns=more_low_value_features)

In [None]:
Scaler_X = StandardScaler()
scaled_X = Scaler_X.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.25, random_state=27)

sm = SMOTE(random_state=27, sampling_strategy='auto')
X_train, y_train = sm.fit_sample(X_train, y_train)

In [None]:
# Train a liblinear logistic regression on the resampled training fold of the
# reduced feature set, then predict the untouched test fold.
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
smote_pred = lr.predict(X_test)

# Checking accuracy

In [None]:
# Accuracy on the test fold.
accuracy_score(y_test, smote_pred) 

In [None]:
# F1 score on the test fold.
f1_score(y_test, smote_pred)

In [None]:
# Recall on the test fold.
recall_score(y_test, smote_pred)

In [None]:
## recall and F1 increased significantly

In [None]:
# One row per remaining feature with its fitted logistic-regression coefficient.
feature_names = X.columns.values
summary_table = pd.DataFrame(columns = ['Feature_names'], data = feature_names)
# coef_ is 2-D; flatten it so exactly one value is assigned per feature row.
summary_table['coeff']= np.ravel(lr.coef_)

# Shift the feature rows to labels 1..n so that label 0 is free for the intercept.
summary_table.index = summary_table.index +1
# .loc[0] ADDS a new row labelled 0. The previous .iloc[0] addressed the first
# row by position and therefore overwrote the first feature with the intercept.
summary_table.loc[0]= ['Intercept', lr.intercept_[0]]

# Sorted by label, so the intercept row comes first; displayed as the cell output.
summary_table.sort_index()

### Environment Satisfaction, Job satisfaction, Marital Status, Total working years, Years since last promotion, Years with current managers are some important features to take into consideration if company wants to reduce its attrition rate.