## Feature Selection for modeling Customer Purchase Propensity and Expected Order Value of the Purchase

At this stage, I have data for each month from 09/2016 to 10/2018, at month level. Because the last month (10/2018) contains all of the historical data, I will use that latest month to check feature-wise correlations.

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the 10-2018 snapshot, the month chosen above for the correlation check.
snapshot_csv = "../../Data/processed/customer_snapshots/10-2018/customer_unique_snapshot.csv"
df = pd.read_csv(snapshot_csv)

In [4]:
# Suffixes appended to each per-status metric below (presumably order statuses).
_status_suffixes = [
    "approved",
    "canceled",
    "created",
    "delivered",
    "invoiced",
    "processing",
    "shipped",
    "unavailable",
]

# Metrics that have one column per status suffix, in the order the columns are listed.
_per_status_metrics = [
    "num_orders",
    "tot_pymt_val",
    "num_rev",
    "num_products",
    "avg_order_size",
    "tot_order_price",
    "tot_order_freight_value",
    "tot_order_value",
]

# Candidate numerical features: 29 overall columns followed by the
# metric x status grid (8 metrics x 8 suffixes = 64 columns), 93 in total.
numerical_columns = [
    # payments
    "num_orders",
    "tot_pymt_sqntl",
    "avg_pymt_instllmnt",
    "tot_pymt_val",
    "tot_pymt_boleto",
    "tot_pymt_credit_card",
    "tot_pymt_debit_card",
    "tot_pymt_not_defined",
    "tot_pymt_voucher",
    # reviews
    "num_rev",
    "avg_rev_score",
    "avg_rev_title_length",
    "avg_rev_length",
    "days_since_lst_rev_creation",
    # order contents and value
    "num_products",
    "num_sellers",
    "avg_order_size",
    "tot_order_price",
    "avg_order_price",
    "tot_order_freight_value",
    "avg_order_freight_value",
    "tot_order_value",
    "avg_order_value",
    # recency (names kept exactly as they appear in the data, typos included)
    "days_since_last_shipped",
    "days_since_lst_order_purchased",
    "days_since_lst_order_approved",
    "days_since_lst_order_delivered_carrier",
    "days_since_lst_order_delivered_cust",
    "days_since_lst_shipping_llimit_date",
] + [
    # metric-major order: all suffixes of one metric before the next metric
    f"{metric}_{suffix}"
    for metric in _per_status_metrics
    for suffix in _status_suffixes
]

# Candidate categorical features; these are listed separately because the
# Pearson correlation filter is computed over numerical_columns only.
categorical_columns = [
    "customer_city",
    "customer_state",
    "customer_zip_code_prefix",
    "pref_prod_category",
    "pref_prod_category_english",
]

In [5]:
# Absolute Pearson coefficient above which a feature is treated as redundant.
threshold = 0.7

# Pairwise Pearson correlations between all candidate numerical features.
# NOTE(review): DataFrame.corr drops missing values pair by pair and no
# min_periods is set, so a pair with very few overlapping rows can still get a
# coefficient. The -1.000 in the saved output for avg_pymt_instllmnt vs
# avg_order_size_approved may be such a case; TODO confirm.
corr = df[numerical_columns].corr(method="pearson")

excluded = set()

# Pass 1: exclude every feature whose |r| with num_orders exceeds the threshold.
# NOTE(review): a NaN coefficient compares False here, so a column with an
# undefined correlation is kept rather than flagged.
for feature in numerical_columns:
    if feature != "num_orders":
        r_with_orders = corr.loc[feature, "num_orders"]
        if abs(r_with_orders) > threshold:
            print(f"{feature}  <->  num_orders : {r_with_orders:.3f}")
            excluded.add(feature)

cols = [name for name in corr.columns if name != "num_orders"]

# Pass 2: greedy pairwise pruning among the other features. Walking the
# columns in order, the earlier feature of a correlated pair is kept and the
# later one is excluded; features already excluded are skipped on both sides.
for position, kept in enumerate(cols):
    if kept not in excluded:
        for candidate in cols[position + 1:]:
            if candidate not in excluded:
                r_pair = corr.loc[kept, candidate]
                if abs(r_pair) > threshold:
                    print(f"{kept}  <->  {candidate} : {r_pair:.3f}")
                    excluded.add(candidate)

print("\nExcluded features:")
print(sorted(excluded))

num_rev  <->  num_orders : 0.866
num_sellers  <->  num_orders : 0.812
num_orders_delivered  <->  num_orders : 0.761
avg_pymt_instllmnt  <->  avg_order_size_approved : -1.000
tot_pymt_val  <->  tot_pymt_credit_card : 0.845
tot_pymt_val  <->  tot_order_price : 0.981
tot_pymt_val  <->  avg_order_price : 0.962
tot_pymt_val  <->  tot_order_value : 0.985
tot_pymt_val  <->  avg_order_value : 0.966
tot_pymt_val  <->  tot_pymt_val_delivered : 0.960
tot_pymt_val  <->  tot_order_price_delivered : 0.957
tot_pymt_val  <->  tot_order_value_delivered : 0.960
days_since_lst_rev_creation  <->  days_since_last_shipped : 0.998
days_since_lst_rev_creation  <->  days_since_lst_order_purchased : 0.999
days_since_lst_rev_creation  <->  days_since_lst_order_approved : 0.999
days_since_lst_rev_creation  <->  days_since_lst_order_delivered_carrier : 0.999
days_since_lst_rev_creation  <->  days_since_lst_order_delivered_cust : 0.999
days_since_lst_rev_creation  <->  days_since_lst_shipping_llimit_date : 0.998
nu

In [8]:
# Number of features the correlation filter marked for exclusion.
len(excluded)

56

In [9]:
# num_orders is never added to `excluded`, so it is counted among the remaining features.
n_remaining = len(numerical_columns) - len(excluded)
print("Remaining Numerical Features: ", n_remaining)

Remaining Numerical Features:  37


In [10]:
# List the surviving numerical features in their original column order.
remaining_features = [feat for feat in numerical_columns if feat not in excluded]
for feat in remaining_features:
    print(feat)

num_orders
tot_pymt_sqntl
avg_pymt_instllmnt
tot_pymt_val
tot_pymt_boleto
tot_pymt_debit_card
tot_pymt_not_defined
tot_pymt_voucher
avg_rev_score
avg_rev_title_length
avg_rev_length
days_since_lst_rev_creation
num_products
avg_order_size
tot_order_freight_value
num_orders_approved
num_orders_canceled
num_orders_created
num_orders_invoiced
num_orders_processing
num_orders_shipped
num_orders_unavailable
tot_pymt_val_canceled
tot_pymt_val_invoiced
tot_pymt_val_processing
tot_pymt_val_shipped
tot_pymt_val_unavailable
num_rev_delivered
num_products_created
num_products_unavailable
avg_order_size_created
avg_order_size_unavailable
tot_order_price_created
tot_order_price_unavailable
tot_order_freight_value_canceled
tot_order_freight_value_created
tot_order_value_created


In [22]:
# Show the candidate categorical features (not part of the correlation filter).
categorical_columns

['customer_city',
 'customer_state',
 'customer_zip_code_prefix',
 'pref_prod_category',
 'pref_prod_category_english']

### Key Observations:

- 37 numerical features are selected by correlation-based filtering (features with an absolute Pearson correlation above 0.7 are excluded).
- 2 of the 5 categorical features are selected: `customer_zip_code_prefix` and `pref_prod_category_english`.