In [5]:
import getpass
import pyodbc
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from pycaret.classification import *

warnings.filterwarnings('ignore')

In [6]:
# connect to SQL Server.
SERVER = '52.44.171.130' 
DATABASE = 'datascience' 
USERNAME = 'dsAdminWrite' 
PASSWORD = getpass.getpass(prompt='Enter your password')
CNXN = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+SERVER+';DATABASE='+DATABASE+';UID='+USERNAME+';PWD='+ PASSWORD)

Enter your password ····················


In [18]:
lkupclientid = 36 # flames

cursor = CNXN.cursor()

storedProc = (
    f"""Exec [stlrFlames].[ds].[getRetentionScoringModelData] {lkupclientid}"""
)

df = pd.read_sql(storedProc, CNXN)

df["lkupclientid"] = 36
df["clientcode"] = 'flames'
    
df = df.fillna(0)

CNXN.commit()
cursor.close()

df.shape

(38823, 57)

In [19]:
# choose the features for the stellar base retention model
features = [
    "dimCustomerMasterId",
    "attendancePercent",
    "clientcode",
    "distToVenue",
    "inperson_contact",
    "isNextYear_Buyer",
    "lkupclientid",
    "missed_games_1",
    "missed_games_2",
    "missed_games_over_2",
    "productGrouping",
    "recentDate",
    "recency",
    "source_tenure",
    "totalSpent",
    "year"
]

# choose the features & train year & test year
df = df[features]
df["year"] = pd.to_numeric(df["year"])

df = df.loc[df["year"] <= 2019]
    
# create training & eval dataset
df_train = df.sample(frac=0.75, random_state=786).reset_index(drop=True)
df_eval = df.drop(df_train.index).reset_index(drop=True)

# print out the number of records for training and eval
print('Data for Modeling: ' + str(df_train.shape))
print('Unseen Data For Predictions: ' + str(df_eval.shape), end="\n\n")

Data for Modeling: (25211, 16)
Unseen Data For Predictions: (8404, 16)



In [21]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25211 entries, 0 to 25210
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dimCustomerMasterId  25211 non-null  int64  
 1   attendancePercent    25211 non-null  float64
 2   clientcode           25211 non-null  object 
 3   distToVenue          25211 non-null  float64
 4   inperson_contact     25211 non-null  int64  
 5   isNextYear_Buyer     25211 non-null  int64  
 6   lkupclientid         25211 non-null  int64  
 7   missed_games_1       25211 non-null  int64  
 8   missed_games_2       25211 non-null  int64  
 9   missed_games_over_2  25211 non-null  int64  
 10  productGrouping      25211 non-null  object 
 11  recentDate           25211 non-null  object 
 12  recency              25211 non-null  int64  
 13  source_tenure        25211 non-null  int64  
 14  totalSpent           25211 non-null  float64
 15  year                 25211 non-null 

In [22]:
setup(
    data= df_train, 
    target="isNextYear_Buyer", 
    train_size = 0.85,
    data_split_shuffle=True,
    ignore_features=["dimCustomerMasterId","productGrouping","year"],
    silent=True,
    verbose=False,
    date_features=["recentDate"],
    numeric_features=[
        "attendancePercent",
        "distToVenue",
        "missed_games_1",
        "missed_games_2",
        "missed_games_over_2",
        "recency",
        "source_tenure",
        "totalSpent"
    ]
);

In [23]:
model_matrix = compare_models(
    fold=10,
    include=["lr"]
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8623,0.8887,0.9121,0.8927,0.9022,0.6697,0.6705,0.418


In [24]:
model_matrix

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=346, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
best_model = create_model(model_matrix)
final_model = finalize_model(best_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8516,0.8915,0.8975,0.8904,0.8939,0.647,0.6471
1,0.8563,0.8898,0.9143,0.8835,0.8986,0.6521,0.6532
2,0.8838,0.8964,0.9303,0.9055,0.9177,0.7201,0.7209
3,0.8493,0.8883,0.8929,0.8911,0.892,0.6426,0.6426
4,0.8978,0.9129,0.9438,0.9126,0.9279,0.7525,0.7537
5,0.8651,0.8897,0.921,0.8895,0.905,0.6731,0.6743
6,0.8395,0.8748,0.8802,0.8885,0.8843,0.6222,0.6222
7,0.8791,0.8829,0.9357,0.8956,0.9152,0.7052,0.7072
8,0.8437,0.8742,0.8929,0.884,0.8884,0.6274,0.6274
9,0.8571,0.8865,0.9123,0.8861,0.899,0.6552,0.656


In [30]:
eval_data = predict_model(final_model, data=df_eval, raw_score=True)
eval_data.head()

Unnamed: 0,dimCustomerMasterId,attendancePercent,clientcode,distToVenue,inperson_contact,isNextYear_Buyer,lkupclientid,missed_games_1,missed_games_2,missed_games_over_2,productGrouping,recentDate,recency,source_tenure,totalSpent,year,Label,Score_0,Score_1
0,1567894,0.875,flames,11.74,0,1,36,4,1,0,Full Season,2019-04-06,0,5840,6287.22,2018,1,0.0476,0.9524
1,1569913,0.95,flames,1.65,0,1,36,4,1,0,Full Season,2019-04-06,0,7300,36409.52,2018,1,0.0503,0.9497
2,1570013,0.883333,flames,1.65,0,1,36,3,3,2,Full Season,2019-04-06,0,10220,26602.58,2018,1,0.0224,0.9776
3,1571348,1.0125,flames,10.39,0,1,36,1,0,0,Full Season,2019-04-06,0,6205,12572.68,2018,1,0.043,0.957
4,1572901,0.9125,flames,23.42,0,1,36,4,1,0,Full Season,2019-04-06,0,3285,2231.16,2018,1,0.0548,0.9452


In [32]:
accuracy = pycaret.utils.check_metric(
    eval_data["isNextYear_Buyer"], eval_data["Label"], metric="Accuracy"
)
precision = pycaret.utils.check_metric(
    eval_data["isNextYear_Buyer"], eval_data["Label"], metric="Precision"
)
recall = pycaret.utils.check_metric(
    eval_data["isNextYear_Buyer"], eval_data["Label"], metric="Recall"
)
f1 = pycaret.utils.check_metric(
    eval_data["isNextYear_Buyer"], eval_data["Label"], metric="F1"
)
auc = pycaret.utils.check_metric(
    eval_data["isNextYear_Buyer"], eval_data["Label"], metric="AUC"
)

# calculate model metrics
results = {
    "binary_classification_metrics": {
        "Accuracy": {"value": accuracy},
        "Precision": {"value": precision},
        "Recall": {"value": recall},
        "F1": {"value": f1},
        "AUC": {"value": auc},
    }
}

results

{'binary_classification_metrics': {'Accuracy': {'value': 0.9267},
  'Precision': {'value': 0.7841},
  'Recall': {'value': 0.8861},
  'F1': {'value': 0.832},
  'AUC': {'value': 0.9116}}}

In [38]:
lkupclientid = 36 # flames

cursor = CNXN.cursor()

storedProc = (
    f"""Exec [stlrFlames].[ds].[getRetentionScoringModelData] {lkupclientid}"""
)

df = pd.read_sql(storedProc, CNXN)

df["lkupclientid"] = 36
df["clientcode"] = 'flames'
    
df = df.fillna(0)

CNXN.commit()
cursor.close()

df.shape

(38823, 57)

In [39]:
# choose the features for the stellar base retention model
features = [
    "dimCustomerMasterId",
    "attendancePercent",
    "clientcode",
    "distToVenue",
    "inperson_contact",
    "isNextYear_Buyer",
    "lkupclientid",
    "missed_games_1",
    "missed_games_2",
    "missed_games_over_2",
    "productGrouping",
    "recentDate",
    "recency",
    "source_tenure",
    "totalSpent",
    "year"
]

# choose the features
df = df[features]

In [40]:
df_inference = df

df_inference["year"] = pd.to_numeric(df_inference["year"])
df_inference = df_inference.loc[df_inference["year"] >= 2021]

df_inference.shape

(5208, 16)

In [44]:
df_scores = predict_model(final_model, data=df_inference, raw_score=True)
df_scores.head()

Unnamed: 0,dimCustomerMasterId,attendancePercent,clientcode,distToVenue,inperson_contact,isNextYear_Buyer,lkupclientid,missed_games_1,missed_games_2,missed_games_over_2,productGrouping,recentDate,recency,source_tenure,totalSpent,year,Label,Score_0,Score_1
33615,35,0.55,flames,216.77,0,0,36,5,0,0,Full Season,2021-11-29,1,1825,39160.56,2021,0,0.5811,0.4189
33616,90,1.1,flames,23.58,0,0,36,0,0,0,Full Season,2021-12-11,0,3650,12341.96,2021,1,0.3228,0.6772
33617,117,0.7,flames,13.23,0,0,36,0,0,1,Half Season,2021-11-29,0,730,6837.92,2021,1,0.4182,0.5818
33618,180,1.0,flames,14.69,0,0,36,0,0,0,Full Season,2021-12-09,0,13870,4170.7,2021,1,0.2161,0.7839
33619,198,0.9,flames,7.03,0,0,36,2,0,0,Half Season,2021-12-11,0,4745,4170.7,2021,1,0.3369,0.6631


In [47]:
customer = df_scores.loc[df_scores['dimCustomerMasterId'] == 2092]
customer

Unnamed: 0,dimCustomerMasterId,attendancePercent,clientcode,distToVenue,inperson_contact,isNextYear_Buyer,lkupclientid,missed_games_1,missed_games_2,missed_games_over_2,productGrouping,recentDate,recency,source_tenure,totalSpent,year,Label,Score_0,Score_1
33624,2092,0.55,flames,8.27,0,0,36,2,0,1,Full Season,2021-12-11,0,5110,4170.7,2021,1,0.3135,0.6865
