General Imports

In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from pycaret.classification import *
from shared_utilities import helpers
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
# from pandas_profiling import ProfileReport
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [202]:
def print_scores(y_test, predictions):
    print(f"MAE: {mean_absolute_error(y_test, predictions)}")
    print(f"F1: {f1_score(y_test, predictions, average=None)}")
    print(f"F1: {f1_score(y_test, predictions, average='weighted')}")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")

In [3]:
CLUSTER = "prod-app"
DATABASE = "stlrcanucks"
LKUPCLIENTID = "7"
SCORING_YEAR = 2022
PRODUCT_MAPPING = {'Mini Pack': 0, 'Quarter': 1, 'Half Season': 2, 'Full Season': 3}
PRODUCT_CURRENT_MAPPING = {'Individual': 0, 'Group': 1, 'Mini Pack': 2, 'Quarter': 3, 'Half Season': 4, 'Full Season': 5}


In [4]:
dataset = helpers.get_product_propensity_dataset(
    cluster=CLUSTER,
    database=DATABASE,
    lkupclientid=LKUPCLIENTID,
    scoring_year=SCORING_YEAR,
    type_flag=0
)

dataset.shape

Authorized as AROASQ4JELIXYLYV6P4UV:pmorrison@stellaralgo.com


(699126, 27)

In [210]:
df = dataset.copy()

In [211]:
features = [
    "atp_last",
    "attended_last",
    "distance",
    "events_last",
    "spend_current",
    "sends",
    "tenure",
    "opens",
    "product",
    "product_current",
    "volume_current"
]

df = df[features]

In [212]:
product_plans = [
   "Full Season",
   "Half Season",
   "Quarter",
   "Mini Pack"
]

In [213]:
df = df[df["product"].isin(product_plans)].reset_index(drop=True)
df.shape

(15924, 11)

In [214]:
df = df.replace("None", np.nan)
df = df.fillna(value=np.nan)

In [215]:
df["product"].value_counts()

Full Season    10186
Quarter         3526
Half Season     1778
Mini Pack        434
Name: product, dtype: int64

In [216]:
df.describe()

Unnamed: 0,atp_last,attended_last,distance,events_last,spend_current,sends,tenure,opens,volume_current
count,13323.0,13323.0,15924.0,13323.0,15924.0,15607.0,15924.0,15607.0,15924.0
mean,113.425457,26.010733,66.315119,30.412745,11475.43,46.370667,2.969731,67.569552,98.396697
std,63.077591,16.651856,319.442262,18.038207,64790.36,24.689792,1.623274,79.397185,717.966317
min,0.0,0.0,0.39,0.0,0.0,0.0,1.0,0.0,0.0
25%,66.575,10.0,5.27,11.0,2956.8,34.0,2.0,17.0,29.0
50%,120.23,33.0,12.6,44.0,8491.6,46.0,3.0,52.0,88.0
75%,152.25,41.0,33.6,45.0,13972.85,62.0,4.0,92.0,90.0
max,517.99,45.0,4491.64,45.0,4684388.0,141.0,6.0,1466.0,52140.0


In [217]:
df.isna().sum()

atp_last           2601
attended_last      2601
distance              0
events_last        2601
spend_current         0
sends               317
tenure                0
opens               317
product               0
product_current    6245
volume_current        0
dtype: int64

In [218]:
X = df.drop("product", axis=1)
df["product_encoded"] = LabelEncoder().fit_transform(df["product"])
y = df["product_encoded"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, 
                                                random_state=1121218)

In [219]:
df[["product_encoded", "product"]].value_counts()

product_encoded  product    
0                Full Season    10186
3                Quarter         3526
1                Half Season     1778
2                Mini Pack        434
dtype: int64

In [206]:
rf_test = RandomForestClassifier()


df_test = df.copy()
df_test = df_test.dropna()

X = df_test.drop("product", axis=1)
X.drop("product_encoded", inplace=True, axis=1)
df_test["product_encoded"] = LabelEncoder().fit_transform(df_test["product"])
y = df_test["product_encoded"]

X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X, y, test_size=.3, 
                                                random_state=1121218)

rf_test.fit(X_train_t, y_train_t)
score = rf_test.score(X_test_t, y_test_t)
print(f"Accuracy: {score}")
pred = rf_test.predict(X_test_t)
print_scores(pred, y_test_t)

Accuracy: 0.8867684478371501
MAE: 0.20381679389312976
F1: [0.95234464 0.68930818 0.43093923 0.82295482]
F1: 0.8887457974053122
Accuracy: 0.8867684478371501


In [220]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

There are 9 numerical features: 

['atp_last', 'attended_last', 'distance', 'events_last', 'spend_current', 'sends', 'tenure', 'opens', 'volume_current']


In [221]:
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

# if "product" not in categorical_features:
#     categorical_features.append("product")

There are 1 categorical features: 

['product_current']


In [222]:
# ordinal_features = X_train.select_dtypes(exclude='number').columns.tolist()
# print(f'There are {len(ordinal_features)} ordinal features:', '\n')
# print(ordinal_features)

# Pipeline

## Defining the Pipeline

Pipeline Imports https://towardsdatascience.com/how-to-use-sklearn-pipelines-for-ridiculously-neat-code-a61ab66ca90d

In [223]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV

In [224]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# ordinal_pipeline = Pipeline(steps=[
#     ('impute', SimpleImputer(strategy='most_frequent')),
#     ('ordinal', OrdinalEncoder(handle_unknown='ignore', spare=False))
# ])

In [225]:
full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features),
    # ('ordinal', ordinal_pipeline, ordinal_features)
])

### Test Pipeline by Building a Model

In [226]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# model = SVC(decision_function_shape='ovo')

model = RandomForestClassifier()

model_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', model)
])

In [227]:
_ = model_pipeline.fit(X_train, y_train)

In [228]:
pred = model_pipeline.predict(X_test)

In [229]:
print_scores(y_test, pred)

MAE: 0.2111762243616576
F1: [0.95077678 0.71203156 0.42056075 0.82693177]
F1: 0.8829914785582781
Accuracy: 0.8848890749267476


In [230]:
model_pipeline.score(X_test, y_test)

0.8848890749267476

## Now we can use the model pipeline elsewhere

In [231]:
model_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocess', 'model', 'preprocess__n_jobs', 'preprocess__remainder', 'preprocess__sparse_threshold', 'preprocess__transformer_weights', 'preprocess__transformers', 'preprocess__verbose', 'preprocess__number', 'preprocess__category', 'preprocess__number__memory', 'preprocess__number__steps', 'preprocess__number__verbose', 'preprocess__number__impute', 'preprocess__number__scale', 'preprocess__number__impute__add_indicator', 'preprocess__number__impute__copy', 'preprocess__number__impute__fill_value', 'preprocess__number__impute__missing_values', 'preprocess__number__impute__strategy', 'preprocess__number__impute__verbose', 'preprocess__number__scale__copy', 'preprocess__number__scale__feature_range', 'preprocess__category__memory', 'preprocess__category__steps', 'preprocess__category__verbose', 'preprocess__category__impute', 'preprocess__category__one-hot', 'preprocess__category__impute__add_indicator', 'preprocess__category__impute__copy', 'pr

In [232]:
param_dict = {'model__n_estimators': np.arange(10, 150, 10)}

search = GridSearchCV(model_pipeline, param_dict, 
                      cv=10, 
                      scoring='accuracy')

_ = search.fit(X_train, y_train)

In [233]:
print('Best score:', abs(search.best_score_))
print('Best alpha:', search.best_params_)

Best score: 0.8769963207767428
Best alpha: {'model__n_estimators': 70}
