# Inter Miami - PROD Retention Score - all teams
* StellarAlgo Data Science
* Ryan Kazmerik & Nakisa Rad
* June 20, 2022

## Hypothesis


## Experiment
This section details our experiment including querying data, data transformations, feature selection and modelling.

In [11]:
import getpass
import os
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import pyodbc
from pycaret.classification import *

### Let's connect to MSSQL and run a stored proc to get our dataset:

In [12]:
# Connection settings for the SQL Server instance that hosts the data science database.
SERVER = '34.206.73.189'
DATABASE = 'datascience'
USERNAME = 'dsAdminWrite'

# The password is prompted for interactively so it never lands in the notebook source.
PASSWORD = getpass.getpass(prompt='Enter your password')

# The first literal is deliberately NOT an f-string: the braces around the
# driver name are part of the ODBC connection-string syntax.
CNXN = pyodbc.connect(
    "DRIVER={ODBC Driver 17 for SQL Server};"
    f"SERVER={SERVER};DATABASE={DATABASE};UID={USERNAME};PWD={PASSWORD}"
)

Enter your password ··················


In [3]:
# Stored procedure that returns the retention training data for all teams.
storedProc = "Exec [stlrMLS].[ds].[getRetentionScoringModelData_all_teams] "

cursor = CNXN.cursor()

# Run the procedure and make sure the season column is numeric.
df = pd.read_sql(storedProc, CNXN).assign(
    year=lambda frame: pd.to_numeric(frame["year"])
)

# End the transaction opened on the connection and release the cursor.
CNXN.commit()
cursor.close()

df.shape

(59989, 52)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59989 entries, 0 to 59988
Data columns (total 52 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dimCustomerMasterId     59989 non-null  int64  
 1   year                    59989 non-null  int64  
 2   productGrouping         59989 non-null  object 
 3   totalSpent              59989 non-null  float64
 4   recentDate              59989 non-null  object 
 5   attendancePercent       59989 non-null  float64
 6   renewedBeforeDays       59968 non-null  float64
 7   isBuyer                 59989 non-null  object 
 8   source_tenure           59968 non-null  float64
 9   tenure                  59968 non-null  float64
 10  distToVenue             59989 non-null  float64
 11  totalGames              59989 non-null  int64  
 12  recency                 59989 non-null  int64  
 13  missed_games_1          59989 non-null  int64  
 14  missed_games_2          59989 non-null

### We should specify the features used in our model:

In [5]:
# choose the features for the stellar base retention model
features = [
    "dimCustomerMasterId",
    "year",
    "productGrouping",
    "totalSpent",
    "recentDate",
    "attendancePercent",
    "renewedBeforeDays",
    "source_tenure",
    "tenure",
    "distToVenue",
    "recency",
    "missed_games_1",
    "missed_games_2",
    "missed_games_over_2",
    "isNextYear_Buyer",
]

# Take a real copy of the selected columns. Plain assignment (`df_dataset = df`)
# only aliases the frame, and writing to a column of `df[features]` raises
# SettingWithCopyWarning; `.copy()` gives an independent frame and leaves `df`
# untouched so this cell can be re-run safely.
df_dataset = df[features].copy()
df_dataset["year"] = pd.to_numeric(df_dataset["year"])

# keep seasons up to and including 2021 for training / evaluation
df_dataset = df_dataset.loc[df_dataset["year"] <= 2021]

# 85% of the rows go to modelling; the remainder is held back as unseen data.
# df_eval must be derived before the indexes are reset, because the drop
# relies on the original row labels of the sampled rows.
df_train = df_dataset.sample(frac=0.85, random_state=786)
df_eval = df_dataset.drop(df_train.index)

df_train = df_train.reset_index(drop=True)
df_eval = df_eval.reset_index(drop=True)

# print out the number of records for training and eval
print('Data for Modeling: ' + str(df_train.shape))
print('Unseen Data For Predictions: ' + str(df_eval.shape), end="\n\n")

Data for Modeling: (43284, 15)
Unseen Data For Predictions: (7638, 15)



### Now we can model the data as a binary classification problem on the `isNextYear_Buyer` field to see how likely a customer is to re-purchase.

In [6]:
# Configure the pycaret classification experiment on the training frame.
#
# session_id pins pycaret's random seed (same value as the random_state used
# for the train/eval sample) so the internal train/hold-out split and the
# cross-validation folds are reproducible after a kernel restart.
#
# NOTE(review): "recentDate" is neither ignored nor declared numeric, so
# pycaret will infer its type - confirm it is meant to be a model input.
setup(
    data=df_train,
    target="isNextYear_Buyer",
    train_size=0.85,
    data_split_shuffle=True,
    session_id=786,
    # identifiers / grouping columns that must not be used as predictors
    ignore_features=["dimCustomerMasterId", "productGrouping", "year"],
    silent=True,
    verbose=False,
    numeric_features=[
        "totalSpent",
        "attendancePercent",
        "renewedBeforeDays",
        "source_tenure",
        "tenure",
        "distToVenue",
        "recency",
        "missed_games_1",
        "missed_games_2",
        "missed_games_over_2",
    ],
);

In [7]:
# Cross-validate the three candidate estimators with 10 folds and rank them.
candidate_models = ["lightgbm", "lr", "xgboost"]
model_matrix = compare_models(include=candidate_models, fold=10)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7789,0.8268,0.8971,0.7861,0.8379,0.495,0.5059,0.378
xgboost,Extreme Gradient Boosting,0.7765,0.8241,0.8868,0.7887,0.8349,0.4929,0.5011,1.689
lr,Logistic Regression,0.7352,0.7572,0.888,0.7453,0.8103,0.3826,0.3995,1.002


In [8]:
# Fit one model per candidate estimator, in the same order as before:
# LightGBM, logistic regression, then XGBoost.
lightgbm_model, lr_model, xgboost_model = (
    create_model(estimator_id) for estimator_id in ('lightgbm', 'lr', 'xgboost')
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7799,0.8273,0.8852,0.7932,0.8367,0.5023,0.5095
1,0.7668,0.8214,0.8788,0.782,0.8276,0.4711,0.4788
2,0.7752,0.825,0.8856,0.7878,0.8338,0.4901,0.4982
3,0.7728,0.8194,0.8873,0.7842,0.8326,0.4831,0.4921
4,0.7782,0.8251,0.8835,0.7923,0.8354,0.4985,0.5055
5,0.785,0.8311,0.8959,0.7934,0.8415,0.5111,0.5203
6,0.7711,0.8153,0.8788,0.7869,0.8303,0.4822,0.4892
7,0.7744,0.8249,0.8882,0.7857,0.8338,0.4868,0.4957
8,0.7869,0.8309,0.8963,0.7952,0.8428,0.5159,0.5248
9,0.7749,0.8202,0.8882,0.7863,0.8341,0.4882,0.497


### Let's load the scoring data (2022 season onward) and get retention scores using the model:

In [None]:
# Stored procedure that returns the data to be scored.
storedProc_scoring = "Exec [stlrMLS].[ds].[getRetentionScoringModelData_staging2] "

cursor = CNXN.cursor()

# Run the procedure and make sure the season column is numeric.
df_scoring = pd.read_sql(storedProc_scoring, CNXN).assign(
    year=lambda frame: pd.to_numeric(frame["year"])
)

# End the transaction opened on the connection and release the cursor.
CNXN.commit()
cursor.close()

df_scoring.shape

In [None]:
# Keep only rows from the 2022 season onward and replace missing values with 0.
is_scoring_season = df_scoring["year"] >= 2022
df_inference = df_scoring.loc[is_scoring_season].fillna(0)
df_inference.shape

In [None]:
# Score the inference rows with each fitted model; raw_score=True asks pycaret
# for per-class score columns in addition to the predicted label.
lightgbm_predictions, lr_predictions, xgboost_predictions = (
    predict_model(fitted_model, data=df_inference, raw_score=True)
    for fitted_model in (lightgbm_model, lr_model, xgboost_model)
)

In [None]:
# Count of predicted labels per model.
label_reports = (
    ("lightgbm", lightgbm_predictions),
    ("log reg", lr_predictions),
    ("xgboost", xgboost_predictions),
)
for report_name, scored in label_reports:
    print(f"{report_name}:\n {scored.Label.value_counts()}")

In [None]:
# Number of fans per score quartile (Score_1 column), one report per model.
score_bins = [0, 0.25, 0.5, 0.75, 1]
bin_reports = (
    ("lightgbm", lightgbm_predictions),
    ("lr", lr_predictions),
    ("xgboost", xgboost_predictions),
)
for report_name, scored in bin_reports:
    print(f"{report_name}:\n {scored.Score_1.value_counts(bins=score_bins)}")

In [None]:
# Histogram of the Score_1 column for each model, side by side.
model_predictions = [lightgbm_predictions, lr_predictions, xgboost_predictions]
model_names = ["lightgbm", "lr", "xgboost"]

# The figure size has to be set BEFORE the figure is created; setting it after
# plotting only affects later figures, so the cell looked right only when it
# was run a second time.
plt.rcParams["figure.figsize"] = (20, 4)

fig, axes = plt.subplots(1, 3)
for ax, model_name, df_predictions in zip(axes, model_names, model_predictions):
    ax.hist(df_predictions["Score_1"], bins=20, edgecolor='black')
    # name the model in the title so the three panels can be told apart
    ax.set_title(f"Retention Scores ({model_name})")
    ax.set_ylabel("Num Fans")
    ax.set_xlabel("SA Buyer Score")

plt.show()

In [None]:
# feature-importance plot for the LightGBM model
plot_model(lightgbm_model, plot='feature')

In [None]:
# feature-importance plot for the logistic regression model
plot_model(lr_model, plot='feature')

In [None]:
# feature-importance plot for the XGBoost model
plot_model(xgboost_model, plot='feature')

In [None]:
# confusion matrix for the LightGBM model
plot_model(lightgbm_model, plot='confusion_matrix')

In [None]:
# confusion matrix for the logistic regression model
plot_model(lr_model, plot='confusion_matrix')

In [None]:
# confusion matrix for the XGBoost model
plot_model(xgboost_model, plot='confusion_matrix')

In [None]:
# Preview only the first rows: the full frame holds customer-level records and
# should not be dumped into the saved notebook output.
lightgbm_predictions.head()

In [None]:

# Write the LightGBM scores to a date-stamped CSV under Results/.
current_date = datetime.today().strftime('%Y-%m-%d')

# Create the output folder if needed; to_csv does not create missing directories.
os.makedirs("Results", exist_ok=True)

# NOTE(review): "Maimi" in the file name looks like a typo for "Miami"; it is
# left unchanged here in case downstream consumers rely on the current name.
lightgbm_predictions.to_csv(f"Results/InterMaimi_Scores_{current_date}.csv", index=False)

## Observations
Here you can document some ideas on the results from above