In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import metrics

def _positive_class_scores(clf, X):
    """Return continuous scores for ranking samples, preferring probabilities.

    ROC AUC measures how well a model *ranks* samples, so it needs a
    continuous score. Hard 0/1 labels from ``predict`` collapse the curve to a
    single operating point and misstate the AUC.
    """
    if hasattr(clf, "predict_proba"):
        # Column 1 holds the probability of the second entry in clf.classes_,
        # i.e. the positive class in a binary problem.
        return clf.predict_proba(X)[:, 1]
    if hasattr(clf, "decision_function"):
        return clf.decision_function(X)
    # Last resort for estimators exposing neither method: original behaviour.
    return clf.predict(X)


def get_performance_measures(clf, X_train, X_test, y_train, y_test):
    """Print the ROC AUC of a fitted binary classifier on test and train data.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict_proba``, ``decision_function`` or ``predict``.
    X_train, X_test : array-like
        Feature matrices in the same format ``clf`` was fitted on.
    y_train, y_test : array-like
        True binary labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The scores, rounded to two decimals, are printed rather than returned.
    """
    test_scores = _positive_class_scores(clf, X_test)
    train_scores = _positive_class_scores(clf, X_train)

    print("ROC AUC (test):\n", round(metrics.roc_auc_score(y_test, test_scores), 2))

    print("ROC AUC (train):\n", round(metrics.roc_auc_score(y_train, train_scores), 2), "\n")

In [None]:
# Directory holding the course data set (trailing slash included).
path = '/kaggle/input/applied-ml-microcourse-ecommerce-recommendation/'

# Read the feature table into a DataFrame.
data = pd.read_csv(f'{path}olist_features.csv')

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
# check out this cheat sheet for more pandas functions:
#  https://www.dataquest.io/blog/pandas-cheat-sheet/
data.info()

In [None]:
# Preview the first rows of the feature table.
data.head()

In [None]:
# List every column name as a plain array, including the target column.
data.columns.values

In [None]:
# Number of rows for each value of the target, to check how balanced the classes are.
data.groupby('label_multi_items')['label_multi_items'].count()

Now we will create our training and test partitions

In [None]:
from sklearn.model_selection import train_test_split

# Split the frame into predictors and target; missing predictor values become 0.
X = data.drop(columns=['label_multi_items']).fillna(0)
y = data['label_multi_items']

# Hold out a third of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

## Logistic Regression Modelling

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings on the unscaled features.
clf_lr = LogisticRegression().fit(X_train, y_train)

get_performance_measures(
    clf_lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

The warning reminds us that we should scale the data when using linear models.  Let's try this and retrain the model

In [None]:
from sklearn import preprocessing

# Learn the scaling parameters (per-feature mean and standard deviation) from
# the training partition only, then apply that same transformation to the test
# partition. Scaling each partition with its own statistics would put train and
# test features on different scales and let test-set information shape the
# preprocessing.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Retrain the logistic regression on the standardised features.
clf_lr = LogisticRegression()
clf_lr.fit(X_train_scaled, y_train)

get_performance_measures(clf_lr, X_train_scaled, X_test_scaled, y_train, y_test)

## Random Forest Modelling


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples, random feature subsets).
# Fixing the seed, as the train/test split already does, makes the reported
# scores reproducible across "Restart & Run All".
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train, y_train)

get_performance_measures(clf_rf, X_train, X_test, y_train, y_test)

Strong overfitting to the training data (100%!).   Something is clearly amiss.  Let's change a hyperparameter and reduce the model's complexity

In [None]:
# Require at least 10 samples per leaf to limit how closely each tree can fit
# the training data. The seed is fixed so the scores are reproducible and the
# comparison with other runs is not blurred by random variation.
clf_rf = RandomForestClassifier(min_samples_leaf=10, random_state=42)
clf_rf.fit(X_train, y_train)

get_performance_measures(clf_rf, X_train, X_test, y_train, y_test)

In [None]:
import matplotlib.pyplot as plt

# Positions of the ten most important features, in ascending order of importance.
rf_importances = clf_rf.feature_importances_
sorted_idx = np.argsort(rf_importances)[-10:]

plt.barh(X_train.columns[sorted_idx], rf_importances[sorted_idx])
plt.xlabel("RF Feature Importance")

## XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default hyperparameters.
clf_xgb = XGBClassifier().fit(X_train, y_train)

get_performance_measures(
    clf_xgb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Same model with shallower trees and a larger minimum child weight.
xgb_params = {'min_child_weight': 5, 'max_depth': 2}
clf_xgb = XGBClassifier(**xgb_params).fit(X_train, y_train)

get_performance_measures(
    clf_xgb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
import matplotlib.pyplot as plt

# Positions of the ten most important features, in ascending order of importance.
xgb_importances = clf_xgb.feature_importances_
sorted_idx = np.argsort(xgb_importances)[-10:]

plt.barh(X_train.columns[sorted_idx], xgb_importances[sorted_idx])
plt.xlabel("XGBoost Feature Importance")

## Sense Check!

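Remember that *payment_value* from the *order_payments* file is recorded at the *order_id* level, so it describes the order as a whole rather than a single item.  Because our label (*label_multi_items*) is also a property of the order, this may be a case of feature leakage. 

We should try removing this feature (together with *payment_installments*, which the next cell also drops) and retraining our models.  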

In [None]:
# Reload the raw feature table so this cell does not depend on earlier mutations.
data = pd.read_csv('{}olist_features.csv'.format(path))

y = data['label_multi_items']

# Drop the target plus the two payment columns suspected of leaking the label,
# then replace missing values with 0. Chained calls build a fresh frame
# instead of mutating one in place, so re-running the cell is safe.
X = (
    data
    .drop(columns=['label_multi_items', 'payment_value', 'payment_installments'])
    .fillna(0)
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the scaler on the training partition only and reuse it for the test
# partition. Scaling each partition with its own statistics would put train and
# test features on different scales.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression is trained and evaluated on the standardised features.
clf_lr = LogisticRegression()
clf_lr.fit(X_train_scaled, y_train)
print("Logistic Regression Results:")
get_performance_measures(clf_lr, X_train_scaled, X_test_scaled, y_train, y_test)


# Tree-based models use the unscaled features. The seed is fixed so the
# random forest scores are reproducible.
clf_rf = RandomForestClassifier(min_samples_leaf=10, random_state=42)
clf_rf.fit(X_train, y_train)
print("Random Forest Results:")
# Evaluate the forest itself; the previous code passed clf_lr here, which
# reported the logistic regression scored on unscaled data under this heading.
get_performance_measures(clf_rf, X_train, X_test, y_train, y_test)


clf_xgb = XGBClassifier(min_child_weight=5, max_depth=2)
clf_xgb.fit(X_train, y_train)
print("XGBoost Results:")
get_performance_measures(clf_xgb, X_train, X_test, y_train, y_test)

In [None]:
# Positions of the fifteen most important features, in ascending order of importance.
xgb_importances = clf_xgb.feature_importances_
sorted_idx = np.argsort(xgb_importances)[-15:]

plt.barh(X_train.columns[sorted_idx], xgb_importances[sorted_idx])
plt.xlabel("Xgboost Feature Importance")