In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from pycaret.classification import *
from shared_utilities import helpers
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
# from pandas_profiling import ProfileReport
from sklearn.decomposition import PCA

In [3]:
# Connection / scoring settings handed to the dataset helper.
CLUSTER = "prod-app"
DATABASE = "stlrcanucks"
LKUPCLIENTID = "7"  # stored as a string, not an int
SCORING_YEAR = 2022

# Product name -> integer code.
PRODUCT_MAPPING = {
    "Mini Pack": 0,
    "Quarter": 1,
    "Half Season": 2,
    "Full Season": 3,
}

# Product name -> integer code, with 'Individual' and 'Group' ahead of the
# four names used in PRODUCT_MAPPING.
PRODUCT_CURRENT_MAPPING = {
    "Individual": 0,
    "Group": 1,
    "Mini Pack": 2,
    "Quarter": 3,
    "Half Season": 4,
    "Full Season": 5,
}

In [4]:
# Load the product-propensity dataset for the configured cluster, database,
# client id and scoring year through the shared helper.
# NOTE(review): the meaning of type_flag=0 is defined inside the helper and is
# not visible in this notebook — confirm before changing it.
dataset = helpers.get_product_propensity_dataset(
    cluster=CLUSTER,
    database=DATABASE,
    lkupclientid=LKUPCLIENTID,
    scoring_year=SCORING_YEAR,
    type_flag=0
)

# Display (rows, columns) of the loaded frame as the cell output.
dataset.shape

Authorized as AROASQ4JELIXYLYV6P4UV:pmorrison@stellaralgo.com


(699126, 27)

In [None]:
# Column names of interest.
# NOTE(review): presumably the model inputs — confirm where `features` is consumed.
features = [
    "atp_last", "attended_last", "distance", "events_last",
    "spend_current", "sends", "tenure", "opens",
    "product", "volume_current",
]

In [None]:
# Work on a copy; `dataset` itself stays exactly as it was loaded.
df = dataset.copy()

In [None]:
# Drop every row whose `product` is one of the values below; missing products
# (None / NaN) and the literal string "None" are excluded as well.
anti_product_plans = [
    "Individual",
    "Group",
    "None",
    None,
    np.nan,
]
df = df.loc[~df["product"].isin(anti_product_plans)].reset_index(drop=True)
df.shape

## Helper Functions

In [None]:
def subset_by_iqr(df, column, whisker_bottom=1.5, whisker_top=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - whisker_bottom * IQR`` and ``Q3 + whisker_top * IQR``
    (both ends inclusive), where Q1 and Q3 are the 0.25 and 0.75 quantiles of
    ``df[column]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter.
        column: Name of the column the fences are computed on.
        whisker_bottom: IQR multiplier for the lower fence.
        whisker_top: IQR multiplier for the upper fence.

    Returns:
        The rows of ``df`` inside the fences, with their original index labels.
        Rows where ``column`` is NaN are not selected.
    """
    values = df[column]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    lower_fence = lower_quartile - whisker_bottom * spread
    upper_fence = upper_quartile + whisker_top * spread

    # Series.between includes both end points by default.
    return df.loc[values.between(lower_fence, upper_fence)]

## Main Functions

In [None]:
def analyze_dataset_pca(df):
    """Project ``df`` onto three principal components and show a 3-D scatter.

    Fits a 3-component PCA on the columns of ``df``, stores the component
    scores on ``df`` as ``pca-one``, ``pca-two`` and ``pca-three`` (the frame
    is modified in place), and draws a 3-D scatter plot of those scores
    coloured by ``df["product_encoded"]``.

    Args:
        df: DataFrame whose columns can all be consumed by
            ``PCA.fit_transform`` and which contains a ``product_encoded``
            column.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    pca_columns = ["pca-one", "pca-two", "pca-three"]

    # Leave out component columns written by an earlier call, so re-running on
    # the same frame fits on the same inputs instead of on its own output.
    pca_input = df.drop(columns=pca_columns, errors="ignore")

    # NOTE(review): ``product_encoded`` is part of the PCA input as well as the
    # colour variable below — confirm that is intended.
    pca = PCA(n_components=3)
    pca_result = pca.fit_transform(pca_input)

    df["pca-one"] = pca_result[:, 0]
    df["pca-two"] = pca_result[:, 1]
    df["pca-three"] = pca_result[:, 2]

    # Passing keyword arguments to gca() was deprecated in Matplotlib 3.4 and
    # later removed, so the 3-D axes are requested through add_subplot instead.
    fig = plt.figure(figsize=(12, 12))
    ax = fig.add_subplot(111, projection="3d")
    chart = ax.scatter(
        xs=df["pca-one"],
        ys=df["pca-two"],
        zs=df["pca-three"],
        c=df["product_encoded"],
        cmap="tab10",
    )
    ax.set_xlabel("pca-one")
    ax.set_ylabel("pca-two")
    ax.set_zlabel("pca-three")

    # One legend entry per distinct colour value in the scatter.
    ax.legend(*chart.legend_elements())
    plt.show()

def dataset_pre_process(df):
    df = df.fillna(method="backfill")
    df = subset_by_iqr(df, 'atp_last', 1.0)
    df = subset_by_iqr(df, 'spend_current', 1.5, 1.0)
    df = subset_by_iqr(df, 'distance', 1.5, 1.5)
    df = subset_by_iqr(df, 'opens', 1.5, 1.0)
    df = subset_by_iqr(df, 'volume_current', 1.5, 1.0)
    df.shape