# Yankees - Event Propensity - Next Event Buyer Scoring
* Ryan Kazmerik, Nakisa Rad, Joey Lai, Shawn Sutherland, Matt Bahler, Pat Faith
* Feb 22, 2022

In [None]:
import boto3
import json
import pandas as pd
import os

from pycaret.classification import *

# Make the named AWS profile the default boto3 session, so the boto3.client /
# boto3.resource calls later in this notebook pick it up without passing it explicitly.
boto3.setup_default_session(profile_name='Stellaralgo-DataScienceAdmin')

In [None]:
def get_file_list(bucket, prefix="hold/ml_data/game2022"):
    """Return the keys of every object in ``bucket`` whose key starts with ``prefix``.

    Args:
        bucket: Object exposing ``objects.filter(Prefix=...)`` that yields items
            with a ``key`` attribute (the main cell passes a boto3 S3 ``Bucket``
            resource).
        prefix: Key prefix to list. Defaults to the 2022 game extracts.

    Returns:
        list: Object keys, in the order the bucket listing yields them.
    """
    # Use the bucket that was passed in rather than a notebook-level global, so
    # the function does not depend on which cells have already been executed.
    return [object_summary.key for object_summary in bucket.objects.filter(Prefix=prefix)]

In [None]:
def generate_dataframe(date, num):
    """Fetch one game extract from S3 and load it as a DataFrame.

    The object read is ``hold/ml_data/game{date}000{num}_part_00`` in the
    ``stellar-redshift-etl`` bucket. It is parsed as CSV, and a ``"game date"``
    column holding ``date`` on every row is added before the frame is returned.

    Args:
        date: Date fragment of the object key; also stored in ``"game date"``.
        num: Trailing fragment placed after the literal ``000`` in the key.

    Returns:
        pandas.DataFrame: The parsed extract plus the ``"game date"`` column.
    """
    client = boto3.client('s3')
    object_key = f"hold/ml_data/game{date}000{num}_part_00"

    response = client.get_object(Bucket="stellar-redshift-etl", Key=object_key)

    # The response body is a file-like stream, so read_csv consumes it directly.
    games = pd.read_csv(response['Body'])
    games["game date"] = date
    return games

In [None]:
def get_scores(df_prev_game):
    """Score one game's rows with the saved event-propensity model.

    Missing values are replaced with 0 in every column except
    ``distanceToVenue``, whose gaps are passed to the model pipeline untouched.
    The fill is done on a new frame, so the caller's ``df_prev_game`` is not
    modified.

    Args:
        df_prev_game: DataFrame of rows to score.

    Returns:
        The frame produced by ``predict_model(..., raw_score=True)``
        (presumably the input columns plus per-class score columns such as
        ``Score_1`` -- confirm against the pycaret version in use).
    """
    # One DataFrame-level fillna with a per-column map replaces the previous
    # `df[col].fillna(0, inplace=True)` loop. That chained in-place form acts on
    # a column selection, which pandas deprecates and which does nothing once
    # copy-on-write is enabled, leaving NaNs in the model input.
    fill_values = {col: 0 for col in df_prev_game.columns if col != 'distanceToVenue'}
    df_filled = df_prev_game.fillna(value=fill_values)

    saved_model = load_model('./models/MLB Yankees - Event Propensity (22Feb2022)')

    df_inference = predict_model(saved_model, data=df_filled, raw_score=True)

    return df_inference

In [None]:
def get_top_scores(df_inference):
    """Collapse scored rows to one best ``Score_1`` per customer, highest first.

    Args:
        df_inference: DataFrame with ``dimCustomerMasterId`` and ``Score_1``
            columns.

    Returns:
        list[dict]: One ``{'id': ..., 'score': ...}`` entry per distinct
        ``dimCustomerMasterId``, ordered by descending score. Customers with
        equal scores keep the order in which they first appear in the input.
    """
    best_by_customer = {}
    for row in df_inference.to_dict('records'):
        customer_id = row['dimCustomerMasterId']
        # Every customer starts from a baseline of 0, so only a strictly
        # greater score replaces the stored value.
        current_best = best_by_customer.setdefault(customer_id, 0)
        if row['Score_1'] > current_best:
            best_by_customer[customer_id] = row['Score_1']

    # sorted() is stable, so ties keep dict insertion order.
    return sorted(
        ({'id': customer_id, 'score': best} for customer_id, best in best_by_customer.items()),
        key=lambda entry: -entry["score"],
    )

In [None]:
def export_scores(top_scores, date, num, parent_dir="./results"):
    """Write the per-customer top scores for one extract to a CSV file.

    The file is written to
    ``{parent_dir}/{date}/{date}000{num}_part_00-results.csv``. The DataFrame
    index is included as the first CSV column.

    Args:
        top_scores: Records to write (the main cell passes the list of
            ``{'id', 'score'}`` dicts from ``get_top_scores``).
        date: Date fragment used for both the sub-directory and the file name.
        num: Trailing fragment placed after the literal ``000`` in the file name.
        parent_dir: Root output directory. Defaults to ``./results``.
    """
    df_max_scores = pd.DataFrame(top_scores)

    path = os.path.join(parent_dir, date)

    # exist_ok avoids the check-then-create race and makes re-runs a no-op.
    os.makedirs(path, exist_ok=True)

    df_max_scores.to_csv(os.path.join(path, f"{date}000{num}_part_00-results.csv"))

### Main Function

In [None]:
# Main driver: list the 2022 game extracts in the ETL bucket, score each one
# with the saved model and write one results CSV per extract.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('stellar-redshift-etl')

file_list = get_file_list(my_bucket)

for filename in file_list:

    # Keys are expected to look like "hold/ml_data/game<YYYY-MM-DD>000<d>_part_00":
    # the date is the 10 characters after the 17-character prefix, and <d> is
    # the single character following the literal "000".
    date = filename[17:27]
    num = filename[30:31]

    # generate_dataframe() rebuilds the object key from (date, num). If that
    # rebuilt key is not the key that was listed, the fixed-offset slicing has
    # mis-parsed it and a different object would be scored under this name, so
    # skip the key and report it instead.
    expected_key = f"hold/ml_data/game{date}000{num}_part_00"
    if filename != expected_key:
        print(f"Skipping unrecognised key: {filename}", end="\n\n")
        continue

    df_prev_game = generate_dataframe(date, num)
    df_inference = get_scores(df_prev_game)
    top_scores = get_top_scores(df_inference)

    export_scores(top_scores, date, num)
    print(f"Scores generated successfully: results/{date}/{date}000{num}_part_00", end="\n\n")

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-07/2022-04-070000_part_00

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-07/2022-04-070001_part_00

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-07/2022-04-070002_part_00

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-07/2022-04-070003_part_00

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-09/2022-04-090000_part_00

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-09/2022-04-090001_part_00

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-09/2022-04-090002_part_00

Transformation Pipeline and Model Successfully Loaded
Scores generated successfully: results/2022-04-09/

KeyboardInterrupt: 