In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def one_hot_encoder(dataframe, col_name):
    """Return a new frame in which ``col_name`` is replaced by indicator columns.

    One indicator column per distinct value of ``dataframe[col_name]`` is built
    with ``pd.get_dummies`` and placed after the existing columns; the original
    ``col_name`` column is then dropped. ``dataframe`` itself is not modified.
    """
    indicators = pd.get_dummies(dataframe[col_name])
    widened = pd.concat([dataframe, indicators], axis=1)
    return widened.drop(columns=[col_name])

The above function returns a dataframe in which the given column has been dropped and replaced by one-hot-encoded columns, one per category.

In [None]:
def new_feature(og_name, new_name, thresh, train):
    """Add a 0/1 indicator column flagging values above a threshold.

    Parameters
    ----------
    og_name : str
        Name of the existing column whose values are compared.
    new_name : str
        Name of the indicator column to create or update.
    thresh : number
        Cut-off. Rows with ``train[og_name] > thresh`` get 1 and rows with
        ``train[og_name] <= thresh`` get 0.
    train : pandas.DataFrame
        Frame to modify. It is updated in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, with ``new_name`` set.

    Notes
    -----
    Rows where ``og_name`` is NaN satisfy neither comparison and are left
    untouched.
    """
    # Honour the caller-supplied threshold. The previous version ignored
    # `thresh` and always compared against a hard-coded 150000.
    above = train[og_name] > thresh
    at_or_below = train[og_name] <= thresh
    train.loc[above, new_name] = 1
    train.loc[at_or_below, new_name] = 0
    return train

In [None]:
def one_hot_multi_cat(df, cat_name_list, col_name):
    """Add one indicator column per requested category of ``col_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    cat_name_list : iterable
        Category values of ``df[col_name]`` to encode. Each one produces a
        column named ``str(category)``.
    col_name : str
        Name of the categorical column to encode. The column itself is kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the indicator columns added.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df``.

    Notes
    -----
    New columns are named only after the category value, so encoding two
    source columns that share a category label makes the later call overwrite
    the earlier indicator column.
    """
    # Fail loudly instead of swallowing every error and returning None, which
    # made callers silently rebind their frame to None.
    if col_name not in df.columns:
        raise KeyError(f"column {col_name!r} not found in dataframe")

    dummies = pd.get_dummies(df[col_name])
    for cat in cat_name_list:
        if cat in dummies.columns:
            df[str(cat)] = dummies[cat]
        else:
            # No row has this category, so its indicator is 0 everywhere.
            df[str(cat)] = 0
    return df

In [None]:
def most_freq_cats(df, n, col_name):
    """Return the ``n`` most frequent values of ``df[col_name]`` as a list.

    Values are ordered as ``value_counts()`` orders them, i.e. by descending
    count.
    """
    counts = df[col_name].value_counts()
    top_values = counts.index[:n]
    return top_values.tolist()

In [None]:
def bar_plotter(df, col_name, top_n=8, title=""):
    """Bar-plot the share of rows taken by the most frequent values of a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    col_name : str
        Column whose value frequencies are plotted; also used as x-axis label.
    top_n : int, optional
        Number of most frequent values to show (default 8).
    title : str, optional
        Plot title (default: empty).

    Notes
    -----
    Percentages are relative to the total number of rows, ``df.shape[0]``.
    The figure is displayed immediately with ``plt.show()``.
    """
    # Count once and keep only the bars that will be drawn.
    counts = df[col_name].value_counts().head(top_n)
    # Category values are converted to strings so matplotlib treats them as
    # discrete labels rather than positions on a numeric axis.
    labels = [str(value) for value in counts.index.tolist()]
    percentages = (counts / df.shape[0] * 100).tolist()

    plt.bar(labels, percentages, color='maroon', width=0.4)
    plt.xlabel(col_name)
    plt.ylabel("Percentage")
    plt.title(title)
    plt.show()

In [None]:
# Load the competition files (paths are relative to the notebook's working directory).
train = pd.read_csv("../input/amazon-employee-access-challenge/train.csv")
test = pd.read_csv("../input/amazon-employee-access-challenge/test.csv")

# Keep a handle on the labelled training frame for the exploratory cells;
# `train` is rebound to a different object at the end of this cell.
actual_train = train

# Split the labelled rows by target value for the per-class plots.
positive_df_mask = train["ACTION"] == 1
negative_df_mask = train["ACTION"] == 0
positive_df = train.loc[positive_df_mask]
negative_df = train.loc[negative_df_mask]

# Separate the target from the features.
y_train = train["ACTION"]
x_train = train.drop(columns=["ACTION"])

# Stack the train and test features so both receive the same encoding later;
# the first x_train.shape[0] rows are the training rows.
train = pd.concat([x_train, test])

In [None]:
# Number of distinct values in each column of the stacked train+test frame.
train.nunique()

In all of these columns, there are a large number of unique values. So right off the bat, we will drop the ids, since they are unique for everyone. We will also drop the "RESOURCE" column: its large number of categories leaves very few examples per category, which makes it impossible to generalize for any single one of them. The other columns will be dealt with one by one.

In [None]:
# Pairwise correlation matrix of the labelled training frame (DataFrame.corr defaults).
train_corr=actual_train.corr()
# Correlation of every column with the target, largest signed value first.
train_corr['ACTION'].sort_values(ascending=False)

ROLE_CODE, ROLE_ROLLUP_1 have the highest magnitude of correlation and we must focus on them a bit more.

In [None]:
# ROLE_ROLLUP_1 distribution: ACTION == 0 rows first, then ACTION == 1 rows.
for class_df in (negative_df, positive_df):
    bar_plotter(class_df, "ROLE_ROLLUP_1")

The number of categories in both "ROLE_ROLLUP_1" and "ROLE_ROLLUP_2" is reasonably small. So we can one-hot-encode the most common ones, as a large percentage of the categories will provide important information. 

In [None]:
# Add indicator columns for the 30 most frequent ROLE_ROLLUP_1 values of the stacked frame.
cats = most_freq_cats(df=train, n=30, col_name="ROLE_ROLLUP_1")
train = one_hot_multi_cat(df=train, cat_name_list=cats, col_name="ROLE_ROLLUP_1")

In [None]:
# RESOURCE distribution: ACTION == 0 rows first, then ACTION == 1 rows.
for class_df in (negative_df, positive_df):
    bar_plotter(class_df, "RESOURCE")

As stated earlier, the percentage for even the most frequent of the categories is quite small.

In [None]:
# ROLE_ROLLUP_2 distribution: ACTION == 0 rows first, then ACTION == 1 rows.
for class_df in (negative_df, positive_df):
    bar_plotter(class_df, "ROLE_ROLLUP_2")

There are more significant categories here, so, as stated earlier, we will one-hot-encode this column, keeping its 30 most frequent categories.

In [None]:
# Add indicator columns for the 30 most frequent ROLE_ROLLUP_2 values of the stacked frame.
cats = most_freq_cats(df=train, n=30, col_name="ROLE_ROLLUP_2")
train = one_hot_multi_cat(df=train, cat_name_list=cats, col_name="ROLE_ROLLUP_2")

In [None]:
# ROLE_DEPTNAME distribution: ACTION == 0 rows first, then ACTION == 1 rows.
for class_df in (negative_df, positive_df):
    bar_plotter(class_df, "ROLE_DEPTNAME")

Here we cannot claim that the categories will not have a significant number of examples, nor do we wish to one-hot-encode all of this column, as the number of unique values is still very large. So we will just pick the 20 most frequent categories in the dataset and one-hot-encode them.

In [None]:
# Add indicator columns for the 20 most frequent ROLE_DEPTNAME values of the stacked frame.
cats = most_freq_cats(df=train, n=20, col_name="ROLE_DEPTNAME")
train = one_hot_multi_cat(df=train, cat_name_list=cats, col_name="ROLE_DEPTNAME")

In [None]:
# ROLE_CODE distribution: ACTION == 0 rows first, then ACTION == 1 rows.
for class_df in (negative_df, positive_df):
    bar_plotter(class_df, "ROLE_CODE")

Again there seem to be a large number of categories, so we will pick the 20 most common and encode them, since ROLE_CODE has the highest correlation with the target.

In [None]:
# Add indicator columns for the 20 most frequent ROLE_CODE values of the stacked frame.
cats = most_freq_cats(df=train, n=20, col_name="ROLE_CODE")
train = one_hot_multi_cat(df=train, cat_name_list=cats, col_name="ROLE_CODE")

In [None]:
# catplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only leaves an empty figure behind. The intended
# 9 x 6 inch size is requested through height/aspect instead (6 * 1.5 = 9).
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally.
sns.catplot(x='ACTION', y='ROLE_CODE', data=actual_train, height=6, aspect=1.5)

On further analyzing ROLE_CODE we can see that a new feature can be created, which will be 1 if the "ROLE_CODE" is above a certain value

In [None]:
# Indicator for ROLE_CODE values above a cut-off. The column gets a descriptive
# name (matching the "MGR_thresh" convention) instead of "1", which said nothing
# about its content and could clash with a one-hot column created from a
# category whose label is 1.
train = new_feature("ROLE_CODE", "ROLE_CODE_thresh", 140000, train)

In [None]:
# ROLE_TITLE distribution: ACTION == 0 rows first, then ACTION == 1 rows.
for class_df in (negative_df, positive_df):
    bar_plotter(class_df, "ROLE_TITLE")

Both "ROLE_CODE" and "ROLE_TITLE" have the same number of unique values and their graphs look exactly the same. There is a one-to-one correspondence between the elements of these two columns. Hence one of them is extraneous and can be removed.

In [None]:
# ROLE_FAMILY distribution: ACTION == 0 rows first, then ACTION == 1 rows.
for class_df in (negative_df, positive_df):
    bar_plotter(class_df, "ROLE_FAMILY")

In [None]:
# Add indicator columns for the 20 most frequent ROLE_FAMILY values of the stacked frame.
cats = most_freq_cats(df=train, n=20, col_name="ROLE_FAMILY")
train = one_hot_multi_cat(df=train, cat_name_list=cats, col_name="ROLE_FAMILY")

In [None]:
# Remove the row identifier and the two columns the analysis above decided not to use.
train = train.drop(columns=["id", "RESOURCE", "ROLE_TITLE"])

Now let us look at the "MGR_ID" part

In [None]:
# catplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only leaves an empty figure behind. The intended
# 9 x 6 inch size is requested through height/aspect instead (6 * 1.5 = 9).
# x and y are passed by keyword because recent seaborn releases no longer
# accept them positionally.
sns.catplot(x='ACTION', y='MGR_ID', data=actual_train, height=6, aspect=1.5)

We observe that it is possible to create another category denoting if the "MGR_ID" is above a certain threshold.

In [None]:
# Add the "MGR_thresh" indicator derived from MGR_ID, passing 140000 as the threshold argument.
train = new_feature(og_name="MGR_ID", new_name="MGR_thresh", thresh=140000, train=train)

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted tree ensemble. The variable keeps the name `tree` because
# later cells refer to it.
tree = XGBRegressor()
# Strip the column labels and force a single float dtype. Without the explicit
# dtype, a frame mixing boolean indicator columns (newer pandas get_dummies
# output) with numeric columns converts to an object array, which XGBoost
# does not accept.
train = pd.DataFrame(np.asarray(train, dtype=np.float64))
# The first x_train.shape[0] rows of the stacked frame are the labelled rows.
tree.fit(train.iloc[:x_train.shape[0]], y_train)

We are using the XGBRegressor here. This was an empirical choice: many other classifiers were tried and this one consistently gave a better score, probably because it uses an ensemble.
A single decision tree Regressor gives a score of approx 0.81147.
A random forest with 1000 trees does not even cross 0.64, even with the same parameters. 
A possible explanation I have come up with is this. In the given dataset there are some features that are much more important than others. While in a Decision Tree they get significant importance, in a Random Forest it is possible that because of the randomness in feature selection, the more important features get drowned out by other less important and much more numerous features. 
However this may as much be a case of the way this particular data analysis has been done, and a different procedure may yield the usual results of a Random Forest exceeding a single Decision Tree in accuracy.
The two hyper-parameters that have been set were found empirically through multiple trials.

In [None]:
import pickle

filename = 'finalized_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open(...) used before was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(tree, model_file)

In [None]:
# Start from the sample submission so the file keeps its expected layout.
sub_df = pd.read_csv("../input/amazon-employee-access-challenge/sampleSubmission.csv")
# Score the test rows: everything after the first x_train.shape[0] rows of the stacked frame.
predictions = tree.predict(train.iloc[x_train.shape[0]:])
sub_df['Action'] = predictions
sub_df.to_csv("submission.csv", index=False)

Some components have been inspired by:
https://www.kaggle.com/kickitlikeshika/employee-access-eda-data-cleaning#EDA