# Problem definition:
The dataset consists of real historical records collected in 2010 and 2011. Employees are manually allowed or denied access to resources over time. You must create an algorithm capable of learning from this historical data to predict approval/denial for an unseen set of employees.
<img src="Amazon.jpg"/>


# Creation of base dataset

train.csv - The training set. Each row has the ACTION (ground truth), RESOURCE, and information about the employee's role at the time of approval

test.csv - The test set for which predictions should be made.  Each row asks whether an employee having the listed characteristics should have access to the listed resource.

<table><thead><tr><th>Column Name</th><th>Column Description</th></tr></thead>
    <tr><td>ACTION</td>	<td>ACTION is 1 if the resource was approved, 0 if the resource was not</td></tr>
<tr><td>RESOURCE</td>	<td>An ID for each resource</td></tr>
<tr><td>MGR_ID</td>	<td>The EMPLOYEE ID of the manager of the current EMPLOYEE ID record; an employee may have only one manager at a time</td></tr>
<tr><td>ROLE_ROLLUP_1</td>	<td>Company role grouping category id 1 (e.g. US Engineering)</td></tr>
<tr><td>ROLE_ROLLUP_2</td>	<td>Company role grouping category id 2 (e.g. US Retail)</td></tr>
<tr><td>ROLE_DEPTNAME</td>	<td>Company role department description (e.g. Retail)</td></tr>
<tr><td>ROLE_TITLE</td>	<td>Company role business title description (e.g. Senior Engineering Retail Manager)</td></tr>
<tr><td>ROLE_FAMILY_DESC</td>	<td>Company role family extended description (e.g. Retail Manager, Software Engineering)</td></tr>
<tr><td>ROLE_FAMILY</td>	<td>Company role family description (e.g. Retail Manager)</td></tr>
<tr><td>ROLE_CODE</td>	<td>Company role code; this code is unique to each role (e.g. Manager)</td></tr>
</table>

# Pre Processing

In [None]:
import os

# Show which files are available in the competition input directory.
input_files = os.listdir("../input/amazon-employee-access-challenge/")
print(input_files)

In [None]:
# Import the basic packages and read the train and test CSV files into DataFrames.
import pandas as pd
import numpy as np

trainDf, testDf = map(
    pd.read_csv,
    (
        '../input/amazon-employee-access-challenge/train.csv',
        '../input/amazon-employee-access-challenge/test.csv',
    ),
)

In [None]:
# Preview the first five rows of the train dataset.
trainDf.head(n=5)

In [None]:
# Preview the first five rows of the test dataset.
testDf.head(n=5)

## Memory management 

In [None]:
#Inspect the column names and pandas dtypes of the train dataset
trainDf.dtypes

In [None]:
#Inspect the column names and pandas dtypes of the test dataset
testDf.dtypes

In [None]:
#Size of the train dataset as a (rows, columns) tuple
trainDf.shape

In [None]:
#Size of the test dataset as a (rows, columns) tuple
testDf.shape

## Null value treatment 

In [None]:
# Count the missing values in each column of the train dataset.
trainDf.isnull().sum()

## Outlier treatment 

In [None]:
# Print every column name together with its number of distinct values.
for column_name in trainDf.columns:
    distinct_count = trainDf[column_name].nunique()
    print(column_name, distinct_count)

In [None]:
# Partition the columns by cardinality: more than 350 distinct values goes to
# num_cont, everything else goes to num_desc.
# NOTE(review): the names suggest "continuous" vs "discrete" - confirm intent.
distinct_counts = {column: trainDf[column].nunique() for column in trainDf.columns}
num_cont = [column for column, count in distinct_counts.items() if count > 350]
num_desc = [column for column, count in distinct_counts.items() if count <= 350]
print(num_cont)
print(num_desc)

In [None]:
# Number of columns in the train dataset.
len(trainDf.columns)

## Univariate analysis (EDA) 

In [None]:
# Plot one histogram (kde disabled) for every column of the train dataset that
# appears in the result of trainDf.var(), showing each figure separately.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot once the installed seaborn version is confirmed.
import seaborn as sns
import matplotlib.pyplot as plt

plotted_columns = trainDf.var().index
for column_name in plotted_columns:
    sns.distplot(trainDf[column_name], kde=False)
    plt.show()

## BiVariate Analysis

In [None]:
# Heatmap of the pairwise column correlations of the train dataset on a wide figure.
correlations = trainDf.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(correlations)

# Model Building

## Supervised

 ### Classification

In [None]:
#importing the Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier

In [None]:
# Separate the features (x) from the target column ACTION (y).
x = trainDf.drop(columns='ACTION')
y = trainDf['ACTION']

In [None]:
# Hold out 20% of the labelled data for evaluation, with a fixed seed so the
# split is reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.20, random_state=43)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Print the shapes of the four split parts on one line.
split_shapes = [part.shape for part in (X_train, X_test, y_train, y_test)]
print(*split_shapes)

In [None]:
# Candidate classifiers to compare, each built with its default settings.
models = [
    estimator_class()
    for estimator_class in (
        DecisionTreeClassifier,
        RandomForestClassifier,
        BaggingClassifier,
        XGBClassifier,
    )
]

In [None]:
#import libraries for confusion matrix, accuracy score and cross-validation
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import cross_val_score

# One entry per model, in this order:
# [model, test confusion matrix, test accuracy, train confusion matrix, train accuracy]
final_accuracy_scores=[]
#Iterate each model
for clf in models:
    #Fit the model on the training split
    clf.fit(X_train, y_train)
    #Predict each split exactly once and reuse the result for every metric
    test_pred = clf.predict(X_test)
    train_pred = clf.predict(X_train)
    test_cm = confusion_matrix(y_test, test_pred)
    test_acc = accuracy_score(y_test, test_pred)
    train_cm = confusion_matrix(y_train, train_pred)
    train_acc = accuracy_score(y_train, train_pred)

    print('Model used for predicting')
    print(clf)
    print("Confusion matrix of test dataset")
    print(test_cm)
    print('Accuracy Score of test dataset')
    print(test_acc)
    #Train-split confusion matrix and accuracy (printed without a heading)
    print(train_cm)
    print(train_acc)
    final_accuracy_scores.append([clf, test_cm, test_acc, train_cm, train_acc])
    #10-fold cross-validation scores on the training split
    print(cross_val_score(clf, X_train, y_train, cv=10))
    print('**************************************************************************************************')
    

<img src='ConfusionMatrix.png' />

In [None]:
# Report sensitivity and specificity of every evaluated model on the test split.
# scikit-learn's confusion_matrix puts true labels on rows and predicted labels
# on columns, so for the binary 0/1 target the matrix is [[TN, FP], [FN, TP]]
# and ravel() yields TN, FP, FN, TP. ACTION == 1 is the positive class.
for scores in final_accuracy_scores:
    # Index 1 holds the test-set confusion matrix; index 0 holds the model.
    cMatrix = scores[1]
    tn, fp, fn, tp = cMatrix.ravel()
    #Sensitivity = True Positive Rate = TP/(TP+FN)--(Condition positive)
    Sensitivity = tp / (tp + fn)
    #Specificity = True Negative Rate = TN/(FP+TN)-- Condition Negative
    Specificity = tn / (tn + fp)
    print(scores[0])
    print("Sensitivity of Model ", Sensitivity)
    print("Specificity of Model", Specificity)

# Test Data Submission Result

<p>Of all the models above, I found RandomForestClassifier to have the best prediction accuracy score, so I will be using RandomForestClassifier.</p>

In [None]:
# Preview the first two rows of the test dataset.
testDf.head(n=2)

In [None]:
# Test features: the test dataset without its 'id' column.
testx = testDf.drop(columns=['id'])

In [None]:
#Fit a BaggingClassifier with default settings on the full labelled data (x, y) for the final submission
#NOTE(review): the markdown cell above names a different classifier as the best model - confirm which one is intended
model = BaggingClassifier()
model.fit(x,y)


In [None]:
#Predict a class label for every row of the test features with the fitted model
#NOTE(review): predict() returns hard class labels; if the competition is scored on ranking (e.g. AUC), predict_proba may be preferable - confirm
testy=model.predict(testx)

In [None]:
#Wrap the predicted labels in a pandas Series
Action = pd.Series(testy)

In [None]:
#Build the submission table: each test-set id next to its predicted Action
results = pd.DataFrame({'id':testDf['id'],'Action':Action})

In [None]:
#Write the submission table to Submission.csv without the DataFrame index
results.to_csv("Submission.csv",index=False)

In [None]:
#Size of the submission table as a (rows, columns) tuple
results.shape

In [None]:
# Row-count difference between the submission table and the train dataset.
# NOTE(review): if the goal is to verify that every test row received a
# prediction, testDf would be the natural reference - confirm.
len(results) - len(trainDf)

In [None]:
from xgboost import XGBClassifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Reference the class through the name that is actually imported
# (`from catboost import CatBoostClassifier`); the bare module name `catboost`
# is not bound in the visible cells, so `catboost.CatBoostClassifier` raised
# NameError.
CatBoostClassifier