In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from skopt.space import Real, Categorical, Integer
from scipy.stats import uniform, loguniform

In [None]:
# Competition CSVs as mounted inside a Kaggle kernel.
train_csv = '/kaggle/input/amazon-employee-access-challenge/train.csv'
test_csv = '/kaggle/input/amazon-employee-access-challenge/test.csv'

# `features` holds the labelled training rows; `test` holds the rows to score.
features = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Pull the submission ids out of the test frame (pop removes the column in
# place), then discard the two feature columns that are not used for modelling.
ids = test.pop('id')
test.drop(columns=['RESOURCE', 'ROLE_CODE'], inplace=True)

In [None]:
# Feature matrix: every training column except the target and the two
# columns that are excluded from modelling. `features` itself is left intact.
X = features.drop(columns=['ACTION', 'RESOURCE', "ROLE_CODE"])

# Target vector.
y = features['ACTION']

In [None]:
# One scatter plot per retained feature: feature value on the x axis,
# ACTION on the y axis. The target list is built once and reused.
action_values = y.tolist()
for feature_name in X.columns:
    plt.scatter(X[feature_name].tolist(), action_values)
    plt.xlabel(feature_name)
    plt.show()

In [None]:
# Pair-wise scatter matrix of every retained feature together with the target.
plot_database = pd.concat((X, y), axis='columns')
sb.pairplot(data=plot_database)

In [None]:
# One box plot per retained feature, each in its own figure, to eyeball the
# range of values every ID column takes.
# The series is passed as `x=` explicitly: what seaborn does with the first
# positional argument of `boxplot` differs between releases (x-vector in older
# ones, `data` in newer ones), so the keyword keeps the plot horizontal and
# warning-free regardless of the installed version.
for col in X.columns:
    plt.figure()
    sb.boxplot(x=features[col])
In [None]:
# One categorical plot per retained feature: ACTION on the x axis, the
# feature's values on the y axis.
# `x`/`y` are passed by keyword because newer seaborn releases accept only
# `data` positionally and raise a TypeError on `catplot('ACTION', col, ...)`.
# `catplot` is figure-level and creates its own figure, so no `plt.figure()`
# call is made here (it would only leave an empty figure behind).
for col in X.columns:
    sb.catplot(x='ACTION', y=col, data=features)

In [None]:
# Snapshot of the feature column labels as they are right now. The cells
# below add engineered `new_*` columns to X; `cols` is later used to select
# columns for the unique-value report and for ordinal encoding.
# NOTE: re-running this cell after the feature-engineering cell would also
# capture the `new_*` columns.
cols = X.columns

In [None]:
def _flag_inside(values, low, high):
    """Return an int array that is 0 where ``values <= low`` or
    ``values >= high`` and 1 everywhere else."""
    return np.where((values <= low) | (values >= high), 0, 1)


def _add_range_flags(frame):
    """Add the five binary ``new_*`` range-indicator columns to ``frame`` in place.

    Each flag is derived from a hard-coded ID range of one existing column.
    Keeping the thresholds in a single function guarantees that the training
    and test frames are always flagged with exactly the same rules.
    """
    # 1 for manager ids below 140000, 0 otherwise.
    frame['new_mgr'] = np.where(frame['MGR_ID'] >= 140000, 0, 1)

    # 0 inside the closed range [190000, 260000], 1 outside it.
    # NOTE(review): this polarity is the opposite of the three flags below
    # (which are 1 *inside* their range) - confirm this is intentional.
    title = frame['ROLE_TITLE']
    frame['new_role_title'] = np.where((title >= 190000) & (title <= 260000), 0, 1)

    # 1 strictly inside the open range, 0 on or outside its bounds.
    frame['new_roll1'] = _flag_inside(frame['ROLE_ROLLUP_1'], 80000, 135000)
    frame['new_roll2'] = _flag_inside(frame['ROLE_ROLLUP_2'], 110000, 140000)
    frame['new_roled'] = _flag_inside(frame['ROLE_DEPTNAME'], 110000, 150000)


# Apply the identical feature engineering to the train and test frames.
for _frame in (X, test):
    _add_range_flags(_frame)

In [None]:
# Stack the training features on top of the test features (row-wise) so that
# later steps can look at the values of both splits at once, then display it.
combined = pd.concat((X, test), axis='index')
combined

In [None]:
## Number of distinct values per column in `cols`, counted over train + test.
## Each line is printed as "<column> <count>".
for name in cols:
    print(name, combined[name].nunique())

## Conclusions after data analysis
### Features Removed
1) RESOURCE: This variable covered a large range of values and was therefore prone to error, so it was removed.<br>
2) ROLE_CODE: I ran a basic script to check the effect of each column on the result, and this column was found to be erroneous, hence it was removed.
### Features Added
Various columns were added to the dataset based on the observations from the various graphs. For each such field, another field was created assigning boolean values for the ranges where that field showed extremely few negative results. The observations were made from the box plots and the cat plots.
### Using Regression Models
After trying out various classification techniques and algorithms, I could only reach a score of 0.73, but after using regression models I could boost it up to 0.85. I think this was possible because, as we can see from the various graphs, no clear boundary between the positive and negative responses can be determined. Hence, in my view, a regression model is more suitable here.
### Categorisation of Data
After observing the small set of unique values for each field, it was quite clear that categorisation of the data would have a positive impact on the predictions made by the model. One-hot encoding was not feasible, as it would create a large number of fields and hence increase the training time enormously. Therefore I preferred ordinal encoding.

In [None]:
## Ordinal-encode the columns listed in `cols`.
## The encoder is fitted on train + test together (`combined`), then the same
## fitted mapping is applied to each split separately.
encoder = OrdinalEncoder()
combined[cols] = encoder.fit_transform(combined[cols])
for split in (X, test):
    split[cols] = encoder.transform(split[cols])
X

In [None]:
# ## Hyper-parameter tuning for XGBRegressor: randomized search over `params`,
# ## scored by ROC AUC on stratified K-fold splits.
# ## This whole cell is intentionally commented out (disabled). As written it
# ## would sample `param_comb` = 5 candidates and fit each on `folds` = 5 splits.
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0, 0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [4, 5, 6, 7]
#         }
# # base estimator whose hyper-parameters are searched

# xgb = XGBRegressor(n_estimators = 5000, learning_rate=0.01, random_state=0)
# folds = 5
# param_comb = 5

# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 0)

# random_search = RandomizedSearchCV(xgb, param_distributions=params, 
#                                    n_iter=param_comb, scoring='roc_auc', 
#                                    n_jobs=4,cv=skf.split(X,y), verbose=3, 
#                                    random_state=1001 )
# random_search.fit(X, y)

# print('\n Best hyperparameters:')
# print(random_search.best_params_)
# print('\n Best estimator:')
# print(random_search.best_estimator_)

In [None]:
# Gradient-boosted regressor fitted on the 0/1 ACTION label; its continuous
# output is what gets submitted as the score.
#
# `eval_metric` is configured on the estimator rather than passed to `fit()`:
# the `fit(eval_metric=...)` argument is deprecated in the XGBoost scikit-learn
# API and rejected by newer releases.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    random_state=0,
    eval_metric='error',
)

# An evaluation metric and `verbose` only have an effect when an `eval_set` is
# supplied, so the training data is passed as the evaluation set; progress is
# then logged every 100 boosting rounds. No early stopping is configured, so
# this monitoring does not change the fitted model.
model.fit(
    X, y,
    eval_set=[(X, y)],
    verbose=100,
)

In [None]:
# Score the test rows with the fitted model.
predictions = model.predict(test)

# Assemble the submission table (one row per test id) and write it out
# without the DataFrame index.
submission_columns = {'Id': ids, 'Action': predictions}
output = pd.DataFrame(data=submission_columns)
output.to_csv('submission.csv', index=False)