# Importing Libraries and Loading Data

In [None]:
!pip install pyforest

In [None]:
# Later cells use `pd`, `sns` and `plt` without importing them explicitly;
# this star-import is the only place those names can come from
# (presumably pyforest's lazy imports -- confirm before replacing it).
from pyforest import *
import warnings

# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices raised by the plotting and modelling libraries.
warnings.simplefilter("ignore")

In [None]:
# Read the training CSV (it contains the ACTION column used as the target
# further down) and preview its first rows.
traindf = pd.read_csv(
    filepath_or_buffer='../input/amazon-employee-access-challenge/train.csv',
)
traindf.head(n=5)

In [None]:
# Read the test CSV (it carries an `id` column that is dropped before
# prediction and reused for the submission file) and preview its first rows.
testdf = pd.read_csv(
    filepath_or_buffer='../input/amazon-employee-access-challenge/test.csv',
)
testdf.head(n=5)

In [None]:
# Report (rows, columns) for both frames, one per line.
for shape_label, frame in (('traindf.shape', traindf), ('testdf.shape', testdf)):
    print(shape_label, frame.shape)

In [None]:
# Number of distinct values per column. dropna=False keeps NaN counted as a
# value of its own, matching what len(column.unique()) would report.
traindf.nunique(dropna=False)

In [None]:
# How many rows each manager id accounts for, most frequent first.
traindf.loc[:, 'MGR_ID'].value_counts()

# Distribution of Features

## Boxplot

In [None]:
# Class balance of the ACTION column (used as the target below).
traindf.loc[:, 'ACTION'].value_counts()

In [None]:
# Draw one box plot per numeric column of the training frame.
#
# * The series is passed as `x=` explicitly: the meaning of the first
#   positional argument of sns.boxplot changed between seaborn releases
#   (x vs. data), so a positional call renders with a version-dependent
#   orientation.
# * Each figure gets a title; without it the plots cannot be told apart.
# * select_dtypes lists the numeric columns directly instead of computing
#   a full describe() summary just to read its column labels.
for column in traindf.select_dtypes(include='number').columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=traindf[column].dropna(), ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

In [None]:
# Annotated heatmap of the pairwise column correlations of the training frame.
#
# `linewidths` is heatmap's documented keyword for the width of the lines
# separating cells. The previous `linewidth` spelling was only forwarded to
# matplotlib through **kwargs, alongside heatmap's own `linewidths` default.
fig, ax = plt.subplots(figsize=(13, 7))
sns.heatmap(traindf.corr(), annot=True, cmap='viridis', linewidths=1, ax=ax);

# Model Building

In [None]:
# Separate the target (ACTION) from the predictors, and strip the `id`
# column from the test frame so it carries the predictors only.
X = traindf.drop(columns=['ACTION'])
y = traindf.loc[:, 'ACTION']
X_test = testdf.drop(columns=['id'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
%%time

params = {'loss_function':'Logloss',
          'eval_metric':'AUC',
          'verbose':200,
          'random_seed':1}
catmodel_1 = CatBoostClassifier(**params)
catmodel_1.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True);

In [None]:
# Positional index of every column in X: all predictors are declared
# categorical when this list is handed to CatBoost below.
categorical_features = [position for position, _ in enumerate(X.columns)]
print(categorical_features)

In [None]:
%%time

params = {'loss_function':'Logloss',
          'eval_metric':'AUC',
          'cat_features': categorical_features,
          'verbose':200,
          'random_seed':1}
catmodel_1 = CatBoostClassifier(**params)
catmodel_1.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True);

## Cross Validation

In [None]:
from catboost import cv

# Wrap the full training data (all columns flagged categorical) in a Pool so
# it can be handed to catboost.cv.
total_train_data = Pool(
    data=X,
    label=y,
    cat_features=categorical_features,
)

# cat_features is deliberately absent from params: it is carried by the Pool.
params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=200,
    random_seed=1,
)

# 4-fold, shuffled, stratified cross-validation; the result is kept in
# `scores` for inspection.
scores = cv(
    pool=total_train_data,
    params=params,
    fold_count=4,
    seed=1,
    shuffle=True,
    stratified=True,
)

In [None]:
# Feature importances of the fitted model. The prettified table is what the
# bar chart below reads its 'Feature Id' and 'Importances' columns from.
feature_imp = catmodel_1.get_feature_importance(
    prettified=True,
)
feature_imp

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart: one bar per feature, bar length = importance.
importance_fig, importance_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=feature_imp,
    x='Importances',
    y='Feature Id',
    ax=importance_ax,
)

In [None]:
# Per-class probabilities for every test row, displayed for inspection.
# NOTE: `predictions` is rebound to hard labels by the next cell.
predictions=catmodel_1.predict_proba(X_test)
predictions

In [None]:
# Predicted labels for every test row. This overwrites the probability
# array that the previous cell stored under the same name.
predictions=catmodel_1.predict(X_test)
print(predictions)

In [None]:
# Model score on the hold-out split, scaled to a percentage.
# NOTE(review): for a CatBoost classifier `score` should be plain accuracy,
# not the AUC used as eval_metric during training -- confirm before quoting it.
print(100 * catmodel_1.score(X_valid, y_valid))

In [None]:
# Build and write the submission file (columns: Id, Action).
#
# The model is trained and evaluated on AUC, which is a ranking metric, so
# the submission carries the predicted probability of the positive class
# instead of hard 0/1 labels. Thresholded labels discard the ranking
# information that AUC measures. Column 1 of predict_proba is the
# probability of class 1.
positive_class_proba = catmodel_1.predict_proba(X_test)[:, 1]

Predictive_Model = pd.DataFrame({
        "Id": testdf['id'],
        "Action": positive_class_proba})
Predictive_Model.to_csv('amazon_predict.csv', index=False)