# MILB League Based Retention Model (QA)

* Stellar Algo
* Nakiska Rad & Ryan Kazmerik
* August 31, 2021

In [5]:
import getpass
import pyodbc
import pandas as pd

from pycaret.classification import *
from pycaret.utils import check_metric
from shared_utilities import helpers

## Let's get our dataset from MSSQL. We will select the full season product only and seasons before 2020, since 2020 was a strange year due to covid, and 2021 does not have any repurchasing data available

In [6]:
CNXN = helpers.get_mssql_connection("QA")

NoCredentialsError: Unable to locate credentials

In [3]:
cursor = CNXN.cursor()

product = "Full Season"
train_season_year = 2019

query =  f"""
    SELECT 
        r.dimcustomermasterid,
        recency,
        attendancePercent,
        totalSpent,
        distToVenue,
        source_tenure,
        renewedBeforeDays,
        missed_games_1,
        missed_games_2,
        missed_games_over_2,
        isnextyear_buyer
    FROM 
        ds.retentionscoring r 
    WHERE 
        lkupclientid in(9,11,12,15,17,19,20,21,24,25,26,27,28,30,34,40,43,44,46,47,48,49,51,59) 
    AND 
        productgrouping = {"'"+ str(product) + "'"} 
    AND 
        year < {train_season_year};
    """
    
df = pd.read_sql(query, CNXN)
    
CNXN.commit()
cursor.close()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16752 entries, 0 to 16751
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dimcustomermasterid  16752 non-null  object 
 1   recency              16752 non-null  int64  
 2   attendancePercent    16752 non-null  object 
 3   totalSpent           16752 non-null  object 
 4   distToVenue          16752 non-null  object 
 5   source_tenure        16752 non-null  int64  
 6   renewedBeforeDays    16752 non-null  int64  
 7   missed_games_1       16752 non-null  int64  
 8   missed_games_2       16752 non-null  int64  
 9   missed_games_over_2  16752 non-null  int64  
 10  isnextyear_buyer     16533 non-null  float64
dtypes: float64(1), int64(6), object(4)
memory usage: 1.4+ MB


## We have some missing values for our target variable (isnextyear_buyer) so let's fill in those missing values

In [4]:
df['isnextyear_buyer'] = df['isnextyear_buyer'].fillna(0.0)
df['isnextyear_buyer'] = df['isnextyear_buyer'].round(0).astype(int)
                            
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16752 entries, 0 to 16751
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dimcustomermasterid  16752 non-null  object
 1   recency              16752 non-null  int64 
 2   attendancePercent    16752 non-null  object
 3   totalSpent           16752 non-null  object
 4   distToVenue          16752 non-null  object
 5   source_tenure        16752 non-null  int64 
 6   renewedBeforeDays    16752 non-null  int64 
 7   missed_games_1       16752 non-null  int64 
 8   missed_games_2       16752 non-null  int64 
 9   missed_games_over_2  16752 non-null  int64 
 10  isnextyear_buyer     16752 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 1.4+ MB


Unnamed: 0,dimcustomermasterid,recency,attendancePercent,totalSpent,distToVenue,source_tenure,renewedBeforeDays,missed_games_1,missed_games_2,missed_games_over_2,isnextyear_buyer
0,303661788,0,0.580986,2485.0,26.49,1460,8,8,8,3,1
1,303663474,1,0.527778,4032.0,23.92,1460,198,0,2,7,1
2,303659570,0,0.548611,1641.6,24.07,1460,185,14,4,3,1
3,303658983,3,0.313869,1561.8,18.21,1460,197,2,2,3,1
4,303659912,0,0.280992,1379.4,5.16,1460,202,0,0,4,1


## Let's check the balance of our target variable to understand what performance metric will be the best measure of model performance

In [5]:
df['isnextyear_buyer'].value_counts()

1    11288
0     5464
Name: isnextyear_buyer, dtype: int64

## We need to hold back some samples from the dataset for evalution, so let's create the training and evaluation dataset with a hold-back of 10%.

In [6]:
df_train = df.sample(frac=0.90, random_state=786).reset_index(drop=True)
df_eval = df.drop(df_train.index).reset_index(drop=True)

print('Data for Modeling: ' + str(df_train.shape))
print('Unseen Data For Predictions: ' + str(df_eval.shape))

Data for Modeling: (15077, 11)
Unseen Data For Predictions: (1675, 11)


## Now we can model the data using PyCaret, with a binary classification prediction for the isnextyear_buyer field to see how likely a customer is to re-purchase.

In [7]:
model = setup(
    data= df_train, 
    target='isnextyear_buyer', 
    train_size = 0.80,
    data_split_shuffle=True,
    silent=True,
    ignore_features=['dimcustomermasterid'],
    numeric_features=[
        "attendancePercent",
        "distToVenue",
        "missed_games_1",
        "missed_games_2",
        "missed_games_over_2",
        "recency",
        "renewedBeforeDays",
        "source_tenure",
        "totalSpent"
    ]
)

Unnamed: 0,Description,Value
0,session_id,5921
1,Target,isnextyear_buyer
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(15077, 11)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


### Let's compare all models to evaluate performance. This function trains all models in the model library and scores them using k-fold cross validation for metric evaluation. The output prints a score grid that shows average Accuracy, AUC, Recall, Precision, F1 and Kappa accross the folds (10 by default) along with training time.

In [11]:
model_matrix = compare_models(
    fold=10,
    include=["ada","dt","gbc","et","knn","lightgbm","lr","rf","ridge","xgboost"]
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
5,Light Gradient Boosting Machine,0.778,0.8068,0.9198,0.7886,0.8491,0.4389,0.4581,0.03
10,Extreme Gradient Boosting,0.7712,0.7978,0.898,0.7927,0.842,0.4332,0.4446,0.431
2,Gradient Boosting Classifier,0.7698,0.7996,0.9348,0.7735,0.8465,0.4027,0.4332,0.15
7,Random Forest Classifier,0.7669,0.7935,0.9084,0.7831,0.8411,0.413,0.4295,0.263
9,NGBClassifier,0.7621,0.782,0.9441,0.7623,0.8435,0.3707,0.4104,2.035
3,Extra Trees Classifier,0.7578,0.7687,0.9185,0.7695,0.8374,0.3772,0.4009,0.193
0,Ada Boost Classifier,0.7541,0.7765,0.9202,0.7652,0.8356,0.3641,0.3896,0.171
4,K Neighbors Classifier,0.7471,0.7466,0.8728,0.7807,0.8241,0.379,0.3868,0.035
1,Decision Tree Classifier,0.7076,0.6689,0.7766,0.7898,0.7829,0.3347,0.3353,0.015
6,Logistic Regression,0.6801,0.66,0.9917,0.6818,0.808,0.0179,0.0576,0.027


[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5252 val_loss=0.0000 scale=2.0000 norm=3.7612
[iter 200] loss=0.5043 val_loss=0.0000 scale=1.0000 norm=1.8844
[iter 300] loss=0.4959 val_loss=0.0000 scale=1.0000 norm=1.8893
[iter 400] loss=0.4916 val_loss=0.0000 scale=1.0000 norm=1.8925
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5223 val_loss=0.0000 scale=2.0000 norm=3.7558
[iter 200] loss=0.5022 val_loss=0.0000 scale=1.0000 norm=1.8838
[iter 300] loss=0.4943 val_loss=0.0000 scale=2.0000 norm=3.7811
[iter 400] loss=0.4888 val_loss=0.0000 scale=1.0000 norm=1.8933
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5249 val_loss=0.0000 scale=2.0000 norm=3.7622
[iter 200] loss=0.5031 val_loss=0.0000 scale=1.0000 norm=1.8837
[iter 300] loss=0.4949 val_loss=0.0000 scale=1.0000 norm=1.8877
[iter 400] loss=0.4902 val_loss=0.0000 scale=1.0000 norm=1.8917
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0

  overwrite_a=True).T


[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5228 val_loss=0.0000 scale=2.0000 norm=3.7594
[iter 200] loss=0.5029 val_loss=0.0000 scale=1.0000 norm=1.8820
[iter 300] loss=0.4959 val_loss=0.0000 scale=1.0000 norm=1.8887
[iter 400] loss=0.4917 val_loss=0.0000 scale=1.0000 norm=1.8920
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5249 val_loss=0.0000 scale=2.0000 norm=3.7622
[iter 200] loss=0.5031 val_loss=0.0000 scale=1.0000 norm=1.8837
[iter 300] loss=0.4949 val_loss=0.0000 scale=1.0000 norm=1.8877
[iter 400] loss=0.4902 val_loss=0.0000 scale=1.0000 norm=1.8917
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5252 val_loss=0.0000 scale=2.0000 norm=3.7612
[iter 200] loss=0.5043 val_loss=0.0000 scale=1.0000 norm=1.8844
[iter 300] loss=0.4959 val_loss=0.0000 scale=1.0000 norm=1.8893
[iter 400] loss=0.4916 val_loss=0.0000 scale=1.0000 norm=1.8925


  overwrite_a=True).T


[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5234 val_loss=0.0000 scale=2.0000 norm=3.7562
[iter 200] loss=0.5050 val_loss=0.0000 scale=1.0000 norm=1.8803
[iter 300] loss=0.4980 val_loss=0.0000 scale=1.0000 norm=1.8832
[iter 400] loss=0.4942 val_loss=0.0000 scale=1.0000 norm=1.8870
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5241 val_loss=0.0000 scale=2.0000 norm=3.7578
[iter 200] loss=0.5040 val_loss=0.0000 scale=1.0000 norm=1.8845
[iter 300] loss=0.4958 val_loss=0.0000 scale=2.0000 norm=3.7805
[iter 400] loss=0.4920 val_loss=0.0000 scale=1.0000 norm=1.8938
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5234 val_loss=0.0000 scale=2.0000 norm=3.7562
[iter 200] loss=0.5050 val_loss=0.0000 scale=1.0000 norm=1.8803
[iter 300] loss=0.4980 val_loss=0.0000 scale=1.0000 norm=1.8832
[iter 400] loss=0.4942 val_loss=0.0000 scale=1.0000 norm=1.8870
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0

  overwrite_a=True).T


[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5249 val_loss=0.0000 scale=2.0000 norm=3.7622
[iter 200] loss=0.5031 val_loss=0.0000 scale=1.0000 norm=1.8837
[iter 300] loss=0.4949 val_loss=0.0000 scale=1.0000 norm=1.8877
[iter 400] loss=0.4902 val_loss=0.0000 scale=1.0000 norm=1.8917
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5234 val_loss=0.0000 scale=2.0000 norm=3.7562
[iter 200] loss=0.5050 val_loss=0.0000 scale=1.0000 norm=1.8803
[iter 300] loss=0.4980 val_loss=0.0000 scale=1.0000 norm=1.8832
[iter 400] loss=0.4942 val_loss=0.0000 scale=1.0000 norm=1.8870
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5223 val_loss=0.0000 scale=2.0000 norm=3.7558
[iter 200] loss=0.5022 val_loss=0.0000 scale=1.0000 norm=1.8838
[iter 300] loss=0.4943 val_loss=0.0000 scale=2.0000 norm=3.7811
[iter 400] loss=0.4888 val_loss=0.0000 scale=1.0000 norm=1.8933
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0



[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5240 val_loss=0.0000 scale=2.0000 norm=3.7602
[iter 200] loss=0.5044 val_loss=0.0000 scale=1.0000 norm=1.8815
[iter 300] loss=0.4976 val_loss=0.0000 scale=1.0000 norm=1.8865
[iter 400] loss=0.4935 val_loss=0.0000 scale=1.0000 norm=1.8914
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5240 val_loss=0.0000 scale=2.0000 norm=3.7602
[iter 200] loss=0.5044 val_loss=0.0000 scale=1.0000 norm=1.8815
[iter 300] loss=0.4976 val_loss=0.0000 scale=1.0000 norm=1.8865
[iter 400] loss=0.4935 val_loss=0.0000 scale=1.0000 norm=1.8914
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5240 val_loss=0.0000 scale=2.0000 norm=3.7602
[iter 200] loss=0.5044 val_loss=0.0000 scale=1.0000 norm=1.8815
[iter 300] loss=0.4976 val_loss=0.0000 scale=1.0000 norm=1.8865
[iter 400] loss=0.4935 val_loss=0.0000 scale=1.0000 norm=1.8914
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0

  overwrite_a=True).T
  overwrite_a=True).T


[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5254 val_loss=0.0000 scale=2.0000 norm=3.7598
[iter 200] loss=0.5049 val_loss=0.0000 scale=1.0000 norm=1.8842
[iter 300] loss=0.4957 val_loss=0.0000 scale=2.0000 norm=3.7778
[iter 400] loss=0.4901 val_loss=0.0000 scale=1.0000 norm=1.8949
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5216 val_loss=0.0000 scale=2.0000 norm=3.7493
[iter 200] loss=0.5008 val_loss=0.0000 scale=2.0000 norm=3.7547
[iter 300] loss=0.4922 val_loss=0.0000 scale=1.0000 norm=1.8835
[iter 400] loss=0.4884 val_loss=0.0000 scale=2.0000 norm=3.7750
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5234 val_loss=0.0000 scale=2.0000 norm=3.7562
[iter 200] loss=0.5050 val_loss=0.0000 scale=1.0000 norm=1.8803
[iter 300] loss=0.4980 val_loss=0.0000 scale=1.0000 norm=1.8832
[iter 400] loss=0.4942 val_loss=0.0000 scale=1.0000 norm=1.8870


  overwrite_a=True).T
  overwrite_a=True).T


[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5228 val_loss=0.0000 scale=2.0000 norm=3.7594
[iter 200] loss=0.5029 val_loss=0.0000 scale=1.0000 norm=1.8820
[iter 300] loss=0.4959 val_loss=0.0000 scale=1.0000 norm=1.8887
[iter 400] loss=0.4917 val_loss=0.0000 scale=1.0000 norm=1.8920
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5246 val_loss=0.0000 scale=2.0000 norm=3.7632
[iter 200] loss=0.5040 val_loss=0.0000 scale=1.0000 norm=1.8848
[iter 300] loss=0.4968 val_loss=0.0000 scale=1.0000 norm=1.8899
[iter 400] loss=0.4920 val_loss=0.0000 scale=1.0000 norm=1.8930
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5228 val_loss=0.0000 scale=2.0000 norm=3.7594
[iter 200] loss=0.5029 val_loss=0.0000 scale=1.0000 norm=1.8820
[iter 300] loss=0.4959 val_loss=0.0000 scale=1.0000 norm=1.8887
[iter 400] loss=0.4917 val_loss=0.0000 scale=1.0000 norm=1.8920


  overwrite_a=True).T


[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5254 val_loss=0.0000 scale=2.0000 norm=3.7598
[iter 200] loss=0.5049 val_loss=0.0000 scale=1.0000 norm=1.8842
[iter 300] loss=0.4957 val_loss=0.0000 scale=2.0000 norm=3.7778
[iter 400] loss=0.4901 val_loss=0.0000 scale=1.0000 norm=1.8949
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5216 val_loss=0.0000 scale=2.0000 norm=3.7493
[iter 200] loss=0.5008 val_loss=0.0000 scale=2.0000 norm=3.7547
[iter 300] loss=0.4922 val_loss=0.0000 scale=1.0000 norm=1.8835
[iter 400] loss=0.4884 val_loss=0.0000 scale=2.0000 norm=3.7750
[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5228 val_loss=0.0000 scale=2.0000 norm=3.7594
[iter 200] loss=0.5029 val_loss=0.0000 scale=1.0000 norm=1.8820
[iter 300] loss=0.4959 val_loss=0.0000 scale=1.0000 norm=1.8887
[iter 400] loss=0.4917 val_loss=0.0000 scale=1.0000 norm=1.8920
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0

  overwrite_a=True).T


[iter 0] loss=0.6277 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5240 val_loss=0.0000 scale=2.0000 norm=3.7602
[iter 200] loss=0.5044 val_loss=0.0000 scale=1.0000 norm=1.8815
[iter 300] loss=0.4976 val_loss=0.0000 scale=1.0000 norm=1.8865
[iter 400] loss=0.4935 val_loss=0.0000 scale=1.0000 norm=1.8914
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5246 val_loss=0.0000 scale=2.0000 norm=3.7632
[iter 200] loss=0.5040 val_loss=0.0000 scale=1.0000 norm=1.8848
[iter 300] loss=0.4968 val_loss=0.0000 scale=1.0000 norm=1.8899
[iter 400] loss=0.4920 val_loss=0.0000 scale=1.0000 norm=1.8930
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.5246 val_loss=0.0000 scale=2.0000 norm=3.7632
[iter 200] loss=0.5040 val_loss=0.0000 scale=1.0000 norm=1.8848
[iter 300] loss=0.4968 val_loss=0.0000 scale=1.0000 norm=1.8899
[iter 400] loss=0.4920 val_loss=0.0000 scale=1.0000 norm=1.8930
[iter 0] loss=0.6278 val_loss=0.0000 scale=2.0

  overwrite_a=True).T


## Now we can create the best model based on performance and train and evaluate the model.

In [12]:
best_model = create_model(model_matrix)
best_model

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7655,0.7966,0.9231,0.7746,0.8423,0.3985,0.4227
1,0.7736,0.7718,0.9182,0.785,0.8464,0.4263,0.4457
2,0.7819,0.8169,0.9243,0.7902,0.852,0.4473,0.4677
3,0.7703,0.8091,0.9243,0.7788,0.8453,0.4117,0.4353
4,0.7786,0.8082,0.9096,0.7942,0.848,0.4472,0.4614
5,0.7952,0.827,0.9243,0.8036,0.8597,0.4871,0.5038
6,0.7645,0.7987,0.9072,0.7813,0.8395,0.4063,0.4227
7,0.7745,0.8045,0.9108,0.7892,0.8456,0.435,0.4507
8,0.7836,0.817,0.9193,0.7941,0.8521,0.4566,0.4741
9,0.7927,0.818,0.9364,0.7946,0.8597,0.4729,0.497


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=5921, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Now let's tune the model. In order to tune hyperparameters, the tune_model function is used. This function automatically tunes the hyperparameters of a model using Random Grid Search on a pre-defined search space. The output prints a score grid that shows MAE, MSE, RMSE, R2, RMSLE and MAPE by fold.

In [None]:
tuned_model = tune_model(best_model)
tuned_model

## Let's plot the feature importance. The plot_model() function can be used to analyze the  feature Importance etc. This function takes a trained model object and returns a plot based on the test / hold-out set.

In [None]:
plot_model(tuned_model, plot='feature')
#evaluate_model(tuned_model)

## Let's finalize the model for deployment. The finalize_model() function fits the model onto the complete dataset including the test/evaluation sample (10% in this case). The purpose of this function is to train the model on the complete dataset before it is deployed in production.

In [None]:
final_model = finalize_model(tuned_model)

print(final_model)

## Now we can predict on the unseen data and review some evaluation metrics to understand the model performance

In [None]:
unseen_predictions = predict_model(final_model, data=df_eval)
unseen_predictions.head()

In [None]:
accuracy = check_metric(unseen_predictions['isnextyear_buyer'], unseen_predictions['Label'], metric = 'Accuracy')
precision = check_metric(unseen_predictions['isnextyear_buyer'], unseen_predictions['Label'], metric = 'Precision')
recall = check_metric(unseen_predictions['isnextyear_buyer'], unseen_predictions['Label'], metric = 'Recall')
f1 = check_metric(unseen_predictions['isnextyear_buyer'], unseen_predictions['Label'], metric = 'F1')
auc = check_metric(unseen_predictions['isnextyear_buyer'], unseen_predictions['Label'], metric = 'AUC')

print("> MODEL PERFORMANCE CALCULATED", end="\n\n")
print(" * ACCURACY=", accuracy)
print(" * PRECISION=", precision)
print(" * RECALL=", recall)
print(" * F1= ", f1)
print(" * AUC= ", auc)

## We can now save the final model. PyCaret's inbuilt function save_model() allows you to save the model along with entire transformation pipeline for later use.

In [None]:
save_model(final_model,'../models/QA-MILB-League-Based-Retention-Model')

## Now that we have an exported model we can use it to score new data. To load a saved model at a future date in the same or an alternative environment, we would use PyCaret's load_model() function and then easily apply the saved model on new unseen data for prediction.

In [None]:
saved_final_model = load_model('../models/QA-MILB-League-Based-Retention-Model')

new_prediction = predict_model(saved_final_model, data=df_eval)
new_prediction.head()