# Score Distribution Example
* StellarAlgo Data Science
* Ryan Kazmerik & Nakisa Rad
* Dec 20, 2021

In [1]:
import getpass
import pyodbc
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from pycaret.classification import *

# Suppress every warning (all categories, all modules) for the rest of the session
# so the cell outputs stay uncluttered.
# NOTE(review): this blanket filter also hides warnings that may flag real problems.
warnings.filterwarnings('ignore')

### Let's connect to MSSQL and run a stored proc to get our dataset:

In [2]:
# Connection settings for the SQL Server database.
SERVER = '52.44.171.130'
DATABASE = 'datascience'
USERNAME = 'dsAdminWrite'

# Prompt for the password interactively so it is never stored in the notebook.
PASSWORD = getpass.getpass(prompt='Enter your password')

# Build the ODBC connection string in one piece and open the connection.
# The doubled braces render as the literal {ODBC Driver 17 for SQL Server}.
CNXN = pyodbc.connect(
    f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={SERVER};DATABASE={DATABASE};UID={USERNAME};PWD={PASSWORD}'
)

Enter your password ····················


In [3]:
lkupclientid = 5 # blazers

# Call the retention-scoring stored procedure. The client id is sent as a bound
# parameter (the `?` placeholder) instead of being formatted into the SQL text.
storedProc = "Exec [stlrTrailBlazers].[ds].[getRetentionScoringModelData] ?"

# pd.read_sql manages its own cursor, so no separate cursor needs to be opened here.
df = pd.read_sql(storedProc, CNXN, params=[lkupclientid])

# apply some data transformations
df["year"] = pd.to_numeric(df["year"])

# End the transaction that the query opened on this connection.
CNXN.commit()

df.shape

(27487, 56)

### We should specify the features used in our model:

In [4]:
# choose the features for the stellar base retention model
features = [
    "dimCustomerMasterId",
    "attendancePercent",
    "distToVenue",
    "isNextYear_Buyer",
    "productGrouping",
    "recency",
    "source_tenure",
    "totalSpent",
    "year"
]

# Take a real copy of the selected columns. A plain `df_dataset = df` only aliases
# the frame, and assigning a column on a slice of it is a chained assignment.
df_dataset = df[features].copy()

# keep only the seasons up to and including 2019 for training / evaluation
df_dataset["year"] = pd.to_numeric(df_dataset["year"])
df_dataset = df_dataset.loc[df_dataset["year"] <= 2019]

# 85% of the rows (fixed seed) go to modeling; the remaining rows are held back as unseen data.
# The eval rows are selected by index label, so the indexes are reset only afterwards.
df_train = df_dataset.sample(frac=0.85, random_state=786)
df_eval = df_dataset.drop(df_train.index)

df_train = df_train.reset_index(drop=True)
df_eval = df_eval.reset_index(drop=True)

# print out the number of records for training and eval
print(f'Data for Modeling: {df_train.shape}')
print(f'Unseen Data For Predictions: {df_eval.shape}', end="\n\n")

Data for Modeling: (18607, 9)
Unseen Data For Predictions: (3284, 9)



### Now we can model the data using a binary classification prediction for the `isNextYear_Buyer` field to see how likely a customer is to re-purchase.

In [5]:
# Configure the PyCaret classification experiment on the modeling frame.
# - target: the binary isNextYear_Buyer column.
# - train_size: 85% of df_train is used for training; the rest is PyCaret's hold-out.
# - ignore_features: identifier / grouping / season columns are kept out of the model.
# - numeric_features: the five remaining inputs are declared numeric explicitly.
# - session_id: fixes the random seed so the split, folds and score grids are
#   reproducible on re-run (same seed as the sampling step, 786).
# The trailing semicolon suppresses the cell's output.
setup(
    data= df_train, 
    target="isNextYear_Buyer", 
    train_size = 0.85,
    data_split_shuffle=True,
    session_id=786,
    ignore_features=["dimCustomerMasterId","productGrouping","year"],
    silent=True,
    verbose=False,
    numeric_features=[
        "attendancePercent",
        "distToVenue",
        "recency",
        "source_tenure",
        "totalSpent"
    ]
);

### Let's compare all models to evaluate performance. The output prints a score grid that shows average Accuracy, AUC, Recall, Precision, F1 and Kappa across the folds (10 by default) along with training time.

In [6]:
# Cross-validate only logistic regression ("lr") and XGBoost ("xgboost") on 10 folds;
# the value returned by compare_models is what the next cell passes to create_model.
model_matrix = compare_models(include=["lr", "xgboost"], fold=10)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.767,0.8315,0.8366,0.8069,0.8214,0.4865,0.4873,0.373
lr,Logistic Regression,0.7366,0.8009,0.8595,0.7608,0.807,0.3971,0.4048,0.287


In [7]:
# Train and cross-validate the estimator that compare_models returned
# (stored in `model_matrix`); the per-fold metric grid is displayed as output.
best_model = create_model(model_matrix)
# NOTE(review): per the PyCaret docs, finalize_model refits the estimator on the
# full dataset including the hold-out split — confirm for the installed version.
final_model = finalize_model(best_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7642,0.8272,0.8323,0.8061,0.819,0.4811,0.4816
1,0.7566,0.8254,0.8304,0.7981,0.8139,0.4627,0.4635
2,0.7661,0.841,0.8245,0.8132,0.8188,0.4891,0.4892
3,0.7503,0.8149,0.8146,0.7996,0.807,0.4535,0.4537
4,0.7617,0.8212,0.8402,0.7985,0.8188,0.4714,0.4727
5,0.7818,0.8346,0.849,0.8175,0.8329,0.5188,0.5195
6,0.7596,0.8266,0.8233,0.8058,0.8145,0.4735,0.4737
7,0.771,0.8426,0.8411,0.8091,0.8248,0.4948,0.4956
8,0.7818,0.8384,0.8638,0.8087,0.8353,0.513,0.5155
9,0.7767,0.843,0.847,0.8125,0.8294,0.5068,0.5077


### Let's load in our 2021 season data and get retention scores using the model:

In [8]:
# Keep the rows for seasons 2021 and later and replace every missing value with 0.
# NOTE(review): fillna(0) is applied to all columns, not only the model features —
# confirm that is intended for non-numeric columns such as dates.
df_inference = df[df["year"].ge(2021)].fillna(0)
df_inference.shape

(5596, 56)

In [9]:
# Score the inference rows with the finalized model. With raw_score=True the
# result carries per-class score columns (Score_0, Score_1) alongside the
# predicted Label, as shown in the preview below.
new_predictions = predict_model(final_model, data=df_inference, raw_score=True)
new_predictions.head()

Unnamed: 0,lkupClientId,dimCustomerMasterId,customerNumber,year,productGrouping,totalSpent,recentDate,attendancePercent,renewedBeforeDays,isBuyer,...,isnextyear_buyer,isnextyear_samepkg_buyer,pkgupgrade_status,auto_renewal,credits_after_refund,is_Lockdown,isNextYear_Buyer,Label,Score_0,Score_1
21891,5,298267187,211139709,2021,Half Season,104.0,1970-01-01,0.0,251,True,...,0,0,0,0,0.0,1,0,0,0.9859,0.0141
21892,5,298495769,201019586,2021,Full Season,7482.0,2021-11-23,1.090909,119,True,...,0,0,0,0,0.0,1,0,1,0.42,0.58
21893,5,299088720,207050629,2021,Quarter Season,1500.0,2021-10-27,1.0,99,True,...,0,0,0,0,0.0,1,0,0,0.6829,0.3171
21894,5,299091531,921909,2021,Full Season,6501.6,2021-11-23,0.590909,270,True,...,0,0,0,0,0.0,1,0,1,0.0946,0.9054
21895,5,299111838,925839,2021,Full Season,6908.38,2021-11-23,0.818182,265,True,...,0,0,0,0,0.0,1,0,1,0.1246,0.8754


In [48]:
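# Optional filter: uncomment the next line to restrict the score distribution to "Full Season" packages only.
#new_predictions = new_predictions.loc[new_predictions["productGrouping"] == "Full Season"]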

In [None]:
# Histogram of the positive-class score (Score_1) for the rows whose predicted
# Label is 1, using 30 bins over the fixed 0-1 range.
new_predictions.loc[new_predictions["Label"] == 1, ["Score_1"]].hist(
    bins=30,
    range=[0, 1],
    figsize=(10, 5),
)

### Let's compare this distribution to the scores in SageMaker:

In [None]:
# Load the SageMaker inference scores (JSON Lines: one record per line).
# The comparison histogram in the following cell reads `df_sagemaker_scores`,
# so this load must run for the notebook to work on a fresh kernel.
# Expects 'inference_data.json' in the notebook's working directory.
df_sagemaker_scores = pd.read_json('inference_data.json', lines=True)
df_sagemaker_scores.head()

In [None]:
# Histogram of isNextYear_Buyer for the SageMaker rows belonging to client id 55.
# NOTE(review): the data above was pulled with lkupclientid = 5 — confirm that 55
# is the intended client id for this comparison.
df_sagemaker_scores.loc[df_sagemaker_scores["lkupclientid"] == 55, ["isNextYear_Buyer"]].hist()