# MILB-League-Based-Retention-Model

* Stellar Algo
* Nakiska Rad & Ryan Kazmerik
* August 17, 2021

In [2]:
import getpass
import psycopg2
import pandas as pd
import warnings

from pycaret.classification import *

warnings.filterwarnings('ignore')

## Let's get our dataset from RedShift. We will select the full season product only and seasons before 2020, since 2020 was a strange year due to covid, and 2021 does not have any repurchasing data available

In [3]:
# connecting to Redshift
conn = psycopg2.connect(
    dbname = 'datascience',
    host = 'sagemaker.cbpdnejrkweo.us-east-1.redshift.amazonaws.com',
    port = 5439,
    user = 'xerris',
    password='ThisIsDataScience20!!',
    sslmode='require'
)

product_grouping = "Full Season" 
max_season_year = 2019

cur = conn.cursor()

sample_query = f'''
    SELECT 
        r.dimcustomermasterid,
        recency,
        attendancePercent,
        totalSpent,
        distToVenue,
        source_tenure,
        renewedBeforeDays,
        missed_games_1,
        missed_games_2,
        missed_games_over_2,
        isnextyear_buyer
    FROM 
        ds.retentionscoring r 
    WHERE 
        lkupclientid in(9,11,12,15,17,19,20,21,24,25,26,27,28,30,34,40,43,44,46,47,48,49,51,59) 
    AND productgrouping in({"'"+ str(product_grouping) + "'"}) 
    AND year < {max_season_year};
'''

b = cur.execute(sample_query)
p = cur.fetchall()

df = pd.DataFrame(p)

new_columns = ['dimcustomermasterid','recency','attendancePercent','totalSpent','distToVenue','source_tenure','renewedBeforeDays','missed_games_1','missed_games_2','missed_games_over_2','isnextyear_buyer']

df = pd.DataFrame(p,columns=new_columns)

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18319 entries, 0 to 18318
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dimcustomermasterid  18319 non-null  object
 1   recency              18319 non-null  int64 
 2   attendancePercent    18319 non-null  object
 3   totalSpent           18319 non-null  object
 4   distToVenue          18319 non-null  object
 5   source_tenure        18319 non-null  int64 
 6   renewedBeforeDays    18319 non-null  int64 
 7   missed_games_1       18319 non-null  int64 
 8   missed_games_2       18319 non-null  int64 
 9   missed_games_over_2  18319 non-null  int64 
 10  isnextyear_buyer     18319 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 1.5+ MB


Unnamed: 0,dimcustomermasterid,recency,attendancePercent,totalSpent,distToVenue,source_tenure,renewedBeforeDays,missed_games_1,missed_games_2,missed_games_over_2,isnextyear_buyer
0,272818801,0,0.255814,5848.0,0.87,1460,209,0,0,3,1
1,272826561,9,0.144366,2485.0,78.2,1095,212,0,0,5,1
2,272836614,7,0.0819672,976.0,5.74,559,196,0,0,6,0
3,272856776,0,0.211806,2520.0,0.87,414,55,0,0,6,0
4,272877256,0,0.374016,2222.5,5.63,149,5,4,0,8,0


## We need to hold back some samples from the dataset for evalution, so let's create the evaluation dataset

In [4]:
data = df.sample(frac=0.95, random_state=786)
data_unseen = df.drop(data.index)

data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (17403, 11)
Unseen Data For Predictions: (916, 11)


## Now we can model the data using PyCaret, with a binary classification prediction for the isnextyear_buyer field to see how likely a customer is to re-purchase

In [5]:
model = setup(data, target='isnextyear_buyer', train_size = 0.8)

Unnamed: 0,Description,Value
0,session_id,685
1,Target,isnextyear_buyer
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(17403, 11)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
model_matrix = compare_models(fold=3, exclude=['qda'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8071,0.8415,0.9223,0.8273,0.8722,0.4832,0.4953,1.8567
gbc,Gradient Boosting Classifier,0.8008,0.8217,0.9477,0.807,0.8717,0.439,0.4691,28.41
rf,Random Forest Classifier,0.7933,0.8055,0.9525,0.7975,0.8681,0.4078,0.4445,7.5533
ada,Ada Boost Classifier,0.7877,0.7992,0.9312,0.803,0.8623,0.4098,0.4321,10.76
et,Extra Trees Classifier,0.7839,0.7857,0.9398,0.7949,0.8613,0.3872,0.4166,10.5467
ridge,Ridge Classifier,0.7748,0.0,0.8773,0.8199,0.8476,0.4182,0.4218,7.11
dt,Decision Tree Classifier,0.7591,0.6879,0.8542,0.8168,0.8351,0.3891,0.3905,3.83
knn,K Neighbors Classifier,0.7505,0.7371,0.8857,0.7903,0.8352,0.3279,0.3373,7.83
lda,Linear Discriminant Analysis,0.7182,0.7203,0.7676,0.8256,0.7949,0.3456,0.3489,206.0633
lr,Logistic Regression,0.714,0.4702,1.0,0.714,0.8331,0.0,0.0,5.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=True).T
  overwrite_a=True).T
