## Procedure
This kernel uses the following classification algorithms:
<ul>
    <li>Multi-layer Perceptron classifier (MLP)</li>
    <li>KNeighborsClassifier</li>
    <li>AdaBoostClassifier</li>
    <li>BaggingClassifier</li>
    <li>GradientBoostingClassifier</li>
    <li>RandomForestClassifier</li>
</ul>
    
This kernel also performs in-depth exploratory data analysis (EDA), including:
<ul>
    <li>univariate analysis</li>
    <li>bivariate analysis</li>
    <li>correlation analysis</li>
</ul>

This kernel uses the following methods to evaluate model accuracy:
<ul>
    <li>auc</li>
    <li>classification report</li>
    <li>confusion matrix</li>
</ul>

## General Library Imports

In [19]:
!pip install matplotlib



In [24]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# libraries for models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

# metrics evaluation libraries
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve, RocCurveDisplay

## Data loading

In [None]:
# Load the HR employee attrition CSV; the path is relative to the notebook's working directory.
project_data = pd.read_csv("../dataset/Employee Analysis Attrition Report/HR Employee Attrition.csv")

## Initial Analysis

In [None]:
# Preview the first rows to sanity-check that the file parsed as expected.
project_data.head()

In [None]:
# Print each column's dtype and non-null count.
project_data.info()

In [None]:
# Summary statistics for the numeric columns.
project_data.describe()

In [None]:
# (number of rows, number of columns)
project_data.shape

In [None]:
# Column labels of the loaded frame.
project_data.columns

In [None]:
# Count of missing values per column.
project_data.isna().sum()

## Exploratory Data Analysis

### Univariate Analysis

In [None]:
# Columns stored as int64 are treated as the numeric features for the EDA plots.
numeric_columns = project_data.columns[project_data.dtypes == 'int64'].tolist()
print(numeric_columns)

In [None]:
# Draw one density plot per numeric column, each on its own 12x8 figure.
for column in numeric_columns:
    figure = plt.figure(figsize=(12, 8))
    sns.kdeplot(data=project_data, x=column, palette="crest", ax=figure.gca())
    plt.show()

In [None]:
# Every column that is not int64 is treated as categorical for the EDA plots.
categorical_columns = project_data.columns[project_data.dtypes != 'int64'].tolist()
print(categorical_columns)

In [None]:
# Draw one bar chart of value counts per categorical column, each on its own 12x8 figure.
for column in categorical_columns:
    figure = plt.figure(figsize=(12, 8))
    sns.countplot(data=project_data, x=column, ax=figure.gca())
    plt.show()

### Bivariate Analysis

In [None]:
# Density of each numeric column, split by the Attrition value so the
# distributions of the groups can be compared on the same axes.
kde_options = dict(hue="Attrition", fill=True, alpha=.5, palette="crest")
for column in numeric_columns:
    plt.figure(figsize=(12, 8))
    sns.kdeplot(data=project_data, x=column, **kde_options)
    plt.show()

## Correlation Analysis

In [None]:
# Label-encode every categorical column so the whole frame is numeric,
# then plot the pairwise correlation matrix as an annotated heatmap.
encoder = LabelEncoder()
encoded = {
    column: encoder.fit_transform(project_data[column])
    for column in categorical_columns
}
# assign() returns a new frame, so project_data itself is left untouched.
df1 = project_data.assign(**encoded)

corr = df1.corr()
plt.figure(figsize=(30, 12))
sns.heatmap(corr, annot=True, cmap="YlGnBu")

## Data Preprocessing and Pipelining

In [None]:
# Separate the feature columns from the "Attrition" target.
features = project_data.drop(columns=["Attrition"])
target = project_data["Attrition"]

# Hold out 30% of the rows for evaluation.
# - random_state fixes the shuffle so reruns give the same split and comparable scores.
# - stratify keeps the class proportions of the target equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=12, stratify=target
)

In [None]:
# Report the size of each split (rows, feature columns).
print('Train dataset shape:', X_train.shape)
# Previously this printed y_train.shape, i.e. the training target, under the "Test" label.
print('Test dataset shape', X_test.shape)

In [None]:
# Re-derive the column groups from the training features (the target column
# is no longer present): object-dtype columns are categorical, the rest numeric.
numeric_columns = X_train.select_dtypes(exclude=['object']).columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

print(numeric_columns)
print('*' * 100)
print(categorical_columns)

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler(with_mean=True))
])

print(numeric_features)
print('*'*100)

# Categorical columns: fill gaps with the most frequent value, one-hot encode, then scale.
categorical_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category that was not present when the encoder
    # was fitted (e.g. in the test split or a hand-built row) is encoded as
    # all zeros instead of raising an error at predict time.
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
    # with_mean=False: scale without centering, which is what StandardScaler
    # supports for the sparse output of the encoder.
    ('scaling', StandardScaler(with_mean=False))
])

print(categorical_features)

# Route each column group through its own sub-pipeline.
processing = ColumnTransformer([
    ('numeric', numeric_features, numeric_columns),
    ('categorical', categorical_features, categorical_columns)
])

processing

## Generic Methods for Model Preparation & Metric Evaluation

In [None]:
def prepare_model(algorithm):
    """Build and fit the full pipeline for one estimator.

    The pipeline chains the shared ``processing`` transformer, a 3-component
    truncated SVD and the given estimator, and is fitted on the module-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    algorithm : estimator object
        Unfitted classifier used as the final pipeline step.

    Returns
    -------
    Pipeline
        The fitted pipeline.
    """
    model = Pipeline(steps=[
        ('processing', processing),
        # The step is named 'pca' but the reduction is a TruncatedSVD.
        ('pca', TruncatedSVD(n_components=3, random_state=12)),
        ('modeling', algorithm)
    ])
    model.fit(X_train, y_train)

    # Derive the file name from the estimator's class name. Concatenating the
    # estimator object itself with a str raised TypeError.
    filename = 'model' + type(algorithm).__name__ + '.pkl'
    print(filename)
    # Persisting the model is currently disabled:
    #     pickle.dump(model, open(filename, 'wb'))

    return model

In [None]:
# A single hand-written employee record used for a one-off prediction.
# Each value is wrapped in a one-element list so that pandas builds a
# one-row DataFrame with one column per key. The "Attrition" target is
# not included.
input_data = {
    "Age": [40],
    "BusinessTravel": ['Non-Travel'],
    "DailyRate": [1142],
    "Department": ['Research & Development'],
    "DistanceFromHome": [8],
    "Education": [2],
    "EducationField": ['Life Sciences'],
    "EmployeeCount": [1],
    "EmployeeNumber": [1552],
    "EnvironmentSatisfaction": [4],
    "Gender": ['Male'],
    "HourlyRate": [72],
    "JobInvolvement": [3],
    "JobLevel": [2],
    "JobRole": ['Healthcare Representative'],
    "JobSatisfaction": [4],
    "MaritalStatus": ['Divorced'],
    "MonthlyIncome": [4069],
    "MonthlyRate": [8841],
    "NumCompaniesWorked": [3],
    "Over18": ['Y'],
    "OverTime": ['Yes'],
    "PercentSalaryHike": [18],
    "PerformanceRating": [3],
    "RelationshipSatisfaction": [3],
    "StandardHours": [80],
    "StockOptionLevel": [0],
    "TotalWorkingYears": [8],
    "TrainingTimesLastYear": [2],
    "WorkLifeBalance": [3],
    "YearsAtCompany": [2],
    "YearsInCurrentRole": [2],
    "YearsSinceLastPromotion": [2],
    "YearsWithCurrManager": [2]
}

# One-row frame that the fitted pipelines can take as prediction input.
input_data_df = pd.DataFrame(input_data)

In [None]:
def prepare_confusion_matrix(algo, model):
    """Plot the confusion matrix of ``model`` on the module-level test split.

    Parameters
    ----------
    algo : str
        Display name of the algorithm; printed above the plot.
    model : fitted estimator
        Object whose ``predict`` is called on ``X_test``.
    """
    print(algo)
    plt.figure(figsize=(12,8))
    pred = model.predict(X_test)
    cm = confusion_matrix(y_test, pred)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, fmt='g', ax=ax)

    # Labels and title must be set before plt.show(); setting them after
    # the figure has been rendered leaves the displayed plot unlabelled.
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    plt.show()

In [None]:
def prepare_classification_report(algo, model):
    """Print the model's prediction for the hand-built row in ``input_data_df``.

    NOTE: despite the name, no classification report over the test split is
    produced here; only the single-row prediction is printed.
    """
    header = algo + ' Report :'
    print(header)
    print("My row prediction: ", model.predict(input_data_df))

In [None]:
def prepare_roc_curve(algo, model):
    """Plot the ROC curve of ``model`` on the module-level test split.

    Uses the second column of ``predict_proba`` as the score and annotates
    the curve with its area under the curve.
    """
    print(algo)
    second_class_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, second_class_scores)
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc(fpr, tpr))
    display.plot()
    plt.show()

## Model Preparation

In [None]:
# (display name, unfitted estimator) pairs; all estimators use their defaults.
algorithms = [('bagging classifier', BaggingClassifier()),
              ('KNN classifier', KNeighborsClassifier()),
              ('Random Forest calssifier', RandomForestClassifier()),
              ('Adaboost classifier', AdaBoostClassifier()),
              ('Gradientboot classifier', GradientBoostingClassifier()),
              ('MLP', MLPClassifier())
             ]

trained_models = []   # (display name, fitted pipeline) pairs
model_and_score = {}  # display name -> accuracy as a percentage string

for name, algorithm in algorithms:
    model = prepare_model(algorithm)
    # Score on the held-out split: accuracy on the data the model was fitted
    # on overstates how well it generalises.
    # NOTE: run this before the ROC cell, which re-encodes y_test in place.
    model_and_score[name] = str(model.score(X_test, y_test)*100)+"%"
    trained_models.append((name, model))

## Model Evaluation

In [None]:
# Show the score string recorded for each algorithm.
print(model_and_score)

In [None]:
# One confusion-matrix plot per fitted pipeline.
for name, fitted_model in trained_models:
    prepare_confusion_matrix(name, fitted_model)

In [None]:
# One report per fitted pipeline, separated by a blank gap.
for name, fitted_model in trained_models:
    prepare_classification_report(name, fitted_model)
    print("\n")

In [None]:
# Convert the test labels to integer codes for the ROC computation.
# NOTE: this rebinds the module-level y_test, so it holds the encoded
# labels for anything that runs afterwards.
encoder = LabelEncoder()
y_test = encoder.fit(y_test).transform(y_test)

# One ROC plot per fitted pipeline.
for name, fitted_model in trained_models:
    prepare_roc_curve(name, fitted_model)

In [None]:
# input_data = {
#     "Age": [40],
#     "BusinessTravel": ['Non-Travel'],
#     "DailyRate": [1142],
#     "Department": ['Research & Development'],
#     "DistanceFromHome": [8],
#     "Education": [2],
#     "EducationField": ['Life Sciences'],
#     "EmployeeCount": [1],
#     "EmployeeNumber": [1552],
#     "EnvironmentSatisfaction": [4],
#     "Gender": ['Male'],
#     "HourlyRate": [72],
#     "JobInvolvement": [3],
#     "JobLevel": [2],
#     "JobRole": ['Healthcare Representative'],
#     "JobSatisfaction": [4],
#     "MaritalStatus": ['Divorced'],
#     "MonthlyIncome": [4069],
#     "MonthlyRate": [8841],
#     "NumCompaniesWorked": [3],
#     "Over18": ['Y'],
#     "OverTime": ['Yes'],
#     "PercentSalaryHike": [18],
#     "PerformanceRating": [3],
#     "RelationshipSatisfaction": [3],
#     "StandardHours": [80],
#     "StockOptionLevel": [0],
#     "TotalWorkingYears": [8],
#     "TrainingTimesLastYear": [2],
#     "WorkLifeBalance": [3],
#     "YearsAtCompany": [2],
#     "YearsInCurrentRole": [2],
#     "YearsSinceLastPromotion": [2],
#     "YearsWithCurrManager": [2]
# }

