In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

# Category to predict in the second model; change it here.
target_category = 'computers_accessories'

# Show every column and allow wider lines when frames are displayed.
for option_name, option_value in (('display.max_columns', None), ('display.width', 125)):
    pd.set_option(option_name, option_value)

def profile_col(data, col):
    """Print the cardinality and the most frequent values of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to profile.
    col : str
        Name of the column in ``data``.

    Prints the number of distinct values next to the total row count,
    followed by at most 20 of the most frequent values with their counts.
    """
    print("\n{} has {:,} unique values (out of {:,} total)".format(
        col, data[col].nunique(), data.shape[0]))
    # head(20) is always positional. An integer slice such as [:20] may be
    # treated as a label-based slice when the counted values are floats.
    print("Value counts:\n{}".format(data[col].value_counts().head(20)))


In [None]:
# load data sets
path = '/kaggle/input/applied-ml-microcourse-ecommerce-recommendation/'


def _load_csv(name_template):
    """Read the CSV whose location is ``name_template`` formatted with ``path``."""
    return pd.read_csv(name_template.format(path))


order_items = _load_csv('{}olist_order_items_dataset.csv')
orders = _load_csv('{}olist_orders_dataset.csv')
customers = _load_csv('{}olist_customers_dataset.csv')
order_payments = _load_csv('{}olist_order_payments_dataset.csv')
sellers = _load_csv('{}olist_sellers_dataset.csv')
reviews = _load_csv('{}olist_order_reviews_dataset.csv')
products = _load_csv('{}olist_products_dataset.csv')
translation = _load_csv('{}product_category_name_translation.csv')

In [None]:
# Distinct orders versus total order-item rows.
order_id_column = order_items['order_id']
print("{:,} unique orders".format(order_id_column.nunique()))
print("{:,} rows (order items)".format(order_id_column.count()))
order_items.head()

In [None]:
# Number of distinct orders in the orders table, then a preview.
distinct_orders = orders['order_id'].nunique()
print("{:,} unique orders".format(distinct_orders))
orders.head()

In [None]:
# Compare the cardinality of the two id columns of the orders table.
for template, key in (("{:,} unique order_id values", 'order_id'),
                      ("{:,} unique customer_id values", 'customer_id')):
    print(template.format(orders[key].nunique()))

In [None]:
# customer_id versus customer_unique_id cardinality, then a preview.
customer_id_count = customers['customer_id'].nunique()
customer_unique_id_count = customers['customer_unique_id'].nunique()
print("{:,} unique customers".format(customer_id_count))
print("{:,} unique unique_customer_ids".format(customer_unique_id_count))
customers.head()

In [None]:
# Count of payment rows (non-null order_id), then a preview.
payment_line_count = order_payments['order_id'].count()
print("{:,} order payment lines".format(payment_line_count))
order_payments.head()

In [None]:
# How many sellers are there?
seller_count = sellers['seller_id'].nunique()
print('{:,} unique sellers'.format(seller_count))
sellers.head()

In [None]:
# How many products and product categories are there?
for template, key in (('{:,} unique products', 'product_id'),
                      ('{:,} unique product categories', 'product_category_name')):
    print(template.format(products[key].nunique()))
products.head()

In [None]:
# Preview the first five rows of the category translation table.
translation.head(n=5)

In [None]:
# Preview the first five rows of the reviews table.
reviews.head(n=5)

In [None]:
# Cardinality checks on the reviews table: ids, row count and comment texts.
review_stats = (
    ("{:,} unique order_id values", reviews['order_id'].nunique()),
    ("{:,} unique review_id values", reviews['review_id'].nunique()),
    ("{:,} rows", reviews['review_id'].count()),
    ("{:,} unique review comments", reviews['review_comment_message'].nunique()),
)
for template, value in review_stats:
    print(template.format(value))

In [None]:
# Merge the English category name onto products.
# Columns that come from `translation` are dropped first so this cell is safe
# to re-run: otherwise a second run would merge them again and produce
# `_x`/`_y` suffixed duplicates instead of the original column names.
# NOTE: this is an inner join, so products whose category has no match in
# `translation` are removed from `products`.
translated_columns = translation.columns.difference(['product_category_name'])
products = (
    products
    .drop(columns=translated_columns, errors='ignore')
    .merge(translation, on='product_category_name')
)

## Define our label to predict which orders have multiple items

The data for whether an order has multiple items or not does not yet exist at the order level, so we will have to create this from order_items.

In [None]:
# Label each order: 1 when its highest order_item_id exceeds 1, else 0.
multi_items = order_items.groupby('order_id', as_index=False)['order_item_id'].max()
multi_items['label_multi_items'] = multi_items['order_item_id'].gt(1).astype(int)

multi_items.head()

In [None]:
# Number of orders per label value.
orders_by_label = multi_items.groupby('label_multi_items')
orders_by_label['order_id'].count()

## Define the entity level for our feature set and build

In [None]:
# Build the feature set from the first item (order_item_id == 1) of each order.
is_first_item = order_items['order_item_id'].eq(1)
data = order_items.loc[is_first_item]

In [None]:
# include product category and physical attributes of the first item
product_columns = ['product_id', 'product_category_name_english', 'product_name_lenght',
                   'product_description_lenght', 'product_photos_qty', 'product_weight_g',
                   'product_length_cm', 'product_height_cm', 'product_width_cm']
data = data.merge(products[product_columns], on='product_id')

# include the label (whether the order has multiple items)
data = data.merge(multi_items[['order_id', 'label_multi_items']],
                  on='order_id')

# include order payments information
# order_payments may hold several rows per order (one per payment_sequential).
# Joining it as-is would repeat the order once per payment row, so keep only
# the lowest-numbered payment of each order before merging.
first_payment = (
    order_payments[['order_id', 'payment_sequential',
                    'payment_type', 'payment_installments',
                    'payment_value']]
    .sort_values(['order_id', 'payment_sequential'])
    .drop_duplicates('order_id', keep='first')
)
data = data.merge(first_payment, on='order_id', validate='many_to_one')

# merge customer with order (explicit key instead of relying on shared column names)
customer_order = customers[['customer_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']].merge(
    orders[['order_id', 'customer_id']], on='customer_id')

# include customer state and seller state (locations)
data = data.merge(customer_order,
                  on='order_id')
data = data.merge(sellers[['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state']],
                  on='seller_id')

# The feature set is meant to hold one row per order; fail loudly if any of
# the joins above fanned out.
assert data['order_id'].is_unique, "feature set has more than one row per order_id"

In [None]:
# Reviews? Attach the review score of each order to a copy of the feature set.
review_scores = reviews[['order_id', 'review_score']]
data_reviews = pd.merge(data, review_scores, on='order_id')

In [None]:
# Compare row/column counts before and after joining the reviews.
for template, frame in (("Data without reviews has shape {}", data),
                        ("Data with reviews has shape {}", data_reviews)):
    print(template.format(frame.shape))

Including reviews has introduced some duplicate rows (the row count changes after the merge). We could fix this, but there is a bigger question - would we have this data at model run time?

In [None]:
# Shipping limit dates of the first five rows, ordered by order_id.
first_rows = data_reviews.loc[:, ['order_id', 'shipping_limit_date']].head()
first_rows.sort_values(by='order_id')

In [None]:
# Review answer timestamps for the same sample of orders, ordered by order_id.
sample_order_ids = data_reviews['order_id'].head()
matching_reviews = reviews[reviews['order_id'].isin(sample_order_ids)]
matching_reviews[['order_id', 'review_answer_timestamp']].head().sort_values('order_id')

As you might expect, reviews are written after the order has been placed, so they would not be available at the moment the prediction is made. We will not include this data.

Lastly, we will drop any id and date fields that are not features.

In [None]:
# Drop id and date fields that are not used as features.
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second run no longer raises KeyError because
# the columns have already been removed.
data = data.drop(
    columns=['order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'payment_sequential'],
    errors='ignore',
)
data.head()

## Feature exploration and analysis

Let's start with the categorical variables first.

In [None]:
# Overall mean of the label column.
data.loc[:, 'label_multi_items'].mean()

In [None]:
# Mean label per value of each categorical column (top 10 values per column).
# 'payment_type' used to be listed twice, which displayed the same table
# twice; it now appears once and is kept last so `column` still ends the
# loop as 'payment_type'.
categorical_columns = ['product_category_name_english', 'payment_installments', 'payment_type']
for column in categorical_columns:
    display(data.groupby(column)['label_multi_items'].mean().to_frame().sort_values('label_multi_items', ascending=False).head(10))

In [None]:
# Full (untruncated) mean label per payment type. The column is named
# explicitly rather than read from the `column` variable left over by the
# preceding loop, so this cell does not depend on that loop's last iteration.
data.groupby('payment_type')['label_multi_items'].mean().to_frame().sort_values('label_multi_items', ascending=False)

Now for the numeric columns:

In [None]:
# Mean of every float64 column, split by label value.
is_float_column = (data.dtypes == 'float64').values
numeric_columns = data.columns[is_float_column].values
by_label = data.groupby('label_multi_items')
for column in numeric_columns:
    display(by_label[column].mean().to_frame())

In [None]:
import matplotlib.pyplot as plt

def plot_churn_hist(column, frame=None, bins=20):
    """Overlay histograms of ``column`` for label 0 and label 1 rows.

    Despite the name, the split is on ``label_multi_items`` (0 vs 1).

    Parameters
    ----------
    column : str
        Numeric column to plot.
    frame : pandas.DataFrame, optional
        Frame holding ``column`` and ``label_multi_items``. Defaults to the
        notebook-level ``data`` frame.
    bins : int, optional
        Number of bins shared by both histograms (default 20).

    Returns
    -------
    matplotlib.axes.Axes
        The axes the histograms were drawn on.
    """
    source = data if frame is None else frame
    single = source.loc[source['label_multi_items'] == 0, column].dropna()
    multi = source.loc[source['label_multi_items'] == 1, column].dropna()

    # Shared bin edges make the two distributions directly comparable; letting
    # each hist() call pick its own 20 bins gives bars of different widths.
    edges = np.histogram_bin_edges(
        np.concatenate([single.to_numpy(), multi.to_numpy()]), bins=bins)

    _, ax = plt.subplots()
    # Semi-transparent bars and a legend keep both groups visible and
    # identifiable where they overlap.
    ax.hist(single, bins=edges, alpha=0.6, label='label_multi_items = 0')
    ax.hist(multi, bins=edges, alpha=0.6, label='label_multi_items = 1')
    ax.set(title=column, xlabel=column, ylabel='rows')
    ax.legend()
    return ax

# Draw one label-split histogram per selected numeric column.
histogram_columns = ('payment_value', 'product_photos_qty', 'price',
                     'product_weight_g', 'product_description_lenght', 'freight_value')
for column in histogram_columns:
    plot_churn_hist(column)

## Transformations for modelling

For now, the only transformation we will apply is one-hot encoding.

In [None]:
# Columns carried into modelling: identifier, label, then the model inputs
# (same order as before).
id_and_label = ['customer_id', 'label_multi_items']
measured_inputs = ['payment_value', 'product_photos_qty', 'price', 'product_weight_g',
                   'product_description_lenght', 'freight_value']
descriptive_inputs = ['product_category_name_english', 'payment_type', 'payment_installments']
features = id_and_label + measured_inputs + descriptive_inputs

data.loc[:, features].head()

In [None]:
# One-hot encode the selected columns (dropping the first level of each),
# leaving out the customer identifier.
columns_to_encode = data[features].drop(columns='customer_id')
data_transformed = pd.get_dummies(columns_to_encode, drop_first=True)

In [None]:
# Compare shapes before and after encoding, then preview the result.
for template, frame in (('Original data has shape {}', data),
                        ('Transformed data has shape {}', data_transformed)):
    print(template.format(frame.shape))
data_transformed.head()