# Setup

In [104]:
# basic modules
import os
import time
import random as rn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# plotting style: apply a matplotlib stylesheet, then seaborn's 'white' axes style on top
# NOTE(review): the 'seaborn-v0_8-*' stylesheet names appear to require matplotlib >= 3.6 — confirm against the pinned version
plt.style.use('seaborn-v0_8-notebook')
sns.set_style('white')
# alternative seaborn axes style, left here for quick switching:
#sns.set_style('darkgrid')

# pandas display settings: allow wider/longer frames and show floats with 3 decimals
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.precision", 3)

# preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# warnings
import warnings
# NOTE(review): with no category/module argument this ignores every warning for the
# rest of the session, including deprecation notices from pandas/sklearn — consider
# narrowing the filter (e.g. by category) once the noisy warning is identified.
warnings.filterwarnings("ignore")

# user defined functions
import utility_functions as utl

# Synthetic Data

In [105]:
from datetime import timedelta, datetime

# --- Parameters for the synthetic schedule ---------------------------------
# Fixed seed so TOTAL_POINTS is identical on every Restart & Run All.
SYNTHETIC_SEED = 42
teams = ["Team A", "Team B"]
seasons = ["Season 1", "Season 2"]
start_date = datetime(2022, 1, 1)
# Despite its name, this is the number of games generated for each ordered
# (home, away) pairing within a season: 30 // (2 teams * 2 seasons) == 7,
# giving 2 seasons * 2 pairings * 7 games == 28 rows in total.
games_per_team_per_season = 30 // (len(teams) * len(seasons))

# Every ordered (home, away) pairing of two different teams.
matchups = [(home, away) for home in teams for away in teams if home != away]

# Column-oriented container; the key order fixes the DataFrame column order.
data = {
    column: []
    for column in (
        "SEASON_ID",
        "GAME_ID",
        "GAME_DATE",
        "HOME_TEAM_NAME",
        "AWAY_TEAM_NAME",
        "HOME_STAT1",
        "AWAY_STAT1",
        "HOME_STAT2",
        "AWAY_STAT2",
        "HOME_STAT3",
        "AWAY_STAT3",
    )
}

# --- Generate one row per game ---------------------------------------------
# game_id runs continuously across seasons and pairings (1, 2, 3, ...).
game_id = 1
for season in seasons:
    for team_home, team_away in matchups:
        for _ in range(games_per_team_per_season):
            # One calendar day per game, counted from start_date.
            game_date = start_date + timedelta(days=game_id - 1)
            row = {
                "SEASON_ID": season,
                "GAME_ID": f"Game{game_id}",
                "GAME_DATE": game_date.strftime("%Y-%m-%d"),
                "HOME_TEAM_NAME": team_home,
                "AWAY_TEAM_NAME": team_away,
            }
            # STATk is a deterministic ramp: home = game_id * (k + 1), away = home + 1.
            for k in (1, 2, 3):
                row[f"HOME_STAT{k}"] = game_id * (k + 1)
                row[f"AWAY_STAT{k}"] = game_id * (k + 1) + 1
            for column, value in row.items():
                data[column].append(value)
            game_id += 1

# Create DataFrame
df = pd.DataFrame(data)

# TOTAL_POINTS: integers drawn uniformly from [150, 250] (the upper bound 251 is
# exclusive), using a seeded generator so the column is reproducible.
synthetic_rng = np.random.default_rng(SYNTHETIC_SEED)
df['TOTAL_POINTS'] = synthetic_rng.integers(150, 251, size=len(df))

print(df.shape)
# Show a preview rather than dumping the whole frame.
df.head(10)

(28, 12)


Unnamed: 0,SEASON_ID,GAME_ID,GAME_DATE,HOME_TEAM_NAME,AWAY_TEAM_NAME,HOME_STAT1,AWAY_STAT1,HOME_STAT2,AWAY_STAT2,HOME_STAT3,AWAY_STAT3,TOTAL_POINTS
0,Season 1,Game1,2022-01-01,Team A,Team B,2,3,3,4,4,5,161
1,Season 1,Game2,2022-01-02,Team A,Team B,4,5,6,7,8,9,238
2,Season 1,Game3,2022-01-03,Team A,Team B,6,7,9,10,12,13,230
3,Season 1,Game4,2022-01-04,Team A,Team B,8,9,12,13,16,17,248
4,Season 1,Game5,2022-01-05,Team A,Team B,10,11,15,16,20,21,176
5,Season 1,Game6,2022-01-06,Team A,Team B,12,13,18,19,24,25,211
6,Season 1,Game7,2022-01-07,Team A,Team B,14,15,21,22,28,29,227
7,Season 1,Game8,2022-01-08,Team B,Team A,16,17,24,25,32,33,190
8,Season 1,Game9,2022-01-09,Team B,Team A,18,19,27,28,36,37,174
9,Season 1,Game10,2022-01-10,Team B,Team A,20,21,30,31,40,41,212


# Prepare Data

In [106]:
# Column naming the team whose stats are being selected; switch to
# 'AWAY_TEAM_NAME' to select the away-side stat columns instead.
team_col = 'HOME_TEAM_NAME'

# Derive the stat-column prefix from team_col. The previous version tested the
# literal 'HOME_TEAM_NAME' against itself, so the 'AWAY_' branch could never run.
prefix = 'HOME_' if team_col.startswith('HOME_') else 'AWAY_'

stats_cols = [
    'HOME_STAT1', 'AWAY_STAT1',
    'HOME_STAT2', 'AWAY_STAT2',
    'HOME_STAT3', 'AWAY_STAT3',
]

# filter the stats columns based on the prefix
filtered_stats_cols = [col for col in stats_cols if col.startswith(prefix)]

In [107]:
# Display the stat columns that survived the prefix filter.
filtered_stats_cols

['HOME_STAT1', 'HOME_STAT2', 'HOME_STAT3']

In [108]:
# Order rows by home team, then season, then game date, and key each row by
# (GAME_ID, GAME_DATE) so those two columns move out of the data columns.
sort_cols = ['HOME_TEAM_NAME', 'SEASON_ID', 'GAME_DATE']
index_cols = ['GAME_ID', 'GAME_DATE']
df_sorted = (
    df.sort_values(sort_cols)
      .set_index(index_cols)
)

In [109]:
# Grouping keys: the home team alone, and the home team within a season.
group_cols1 = 'HOME_TEAM_NAME'
group_cols2 = [group_cols1, 'SEASON_ID']

In [110]:
# Display the sorted frame, now indexed by (GAME_ID, GAME_DATE).
df_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,SEASON_ID,HOME_TEAM_NAME,AWAY_TEAM_NAME,HOME_STAT1,AWAY_STAT1,HOME_STAT2,AWAY_STAT2,HOME_STAT3,AWAY_STAT3,TOTAL_POINTS
GAME_ID,GAME_DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Game1,2022-01-01,Season 1,Team A,Team B,2,3,3,4,4,5,161
Game2,2022-01-02,Season 1,Team A,Team B,4,5,6,7,8,9,238
Game3,2022-01-03,Season 1,Team A,Team B,6,7,9,10,12,13,230
Game4,2022-01-04,Season 1,Team A,Team B,8,9,12,13,16,17,248
Game5,2022-01-05,Season 1,Team A,Team B,10,11,15,16,20,21,176
Game6,2022-01-06,Season 1,Team A,Team B,12,13,18,19,24,25,211
Game7,2022-01-07,Season 1,Team A,Team B,14,15,21,22,28,29,227
Game15,2022-01-15,Season 2,Team A,Team B,30,31,45,46,60,61,177
Game16,2022-01-16,Season 2,Team A,Team B,32,33,48,49,64,65,163
Game17,2022-01-17,Season 2,Team A,Team B,34,35,51,52,68,69,176


# Experiment 1: lagged 3-game rolling mean, full window required (min_periods=3)

In [115]:
# Step 1: 3-row rolling mean of the selected stats within each (team, season)
# group; min_periods=3 leaves NaN until a full window of rows is available.
full_window_means = (
    df_sorted
    .groupby(group_cols2)[filtered_stats_cols]
    .rolling(window=3, min_periods=3)
    .mean()
)

# Step 2: shift by one row inside each group so a row carries the mean that
# ended on the previous row of its group, never a value that includes itself.
roll_stats = full_window_means.groupby(group_cols2).shift(1)
roll_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HOME_STAT1,HOME_STAT2,HOME_STAT3
HOME_TEAM_NAME,SEASON_ID,GAME_ID,GAME_DATE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Team A,Season 1,Game1,2022-01-01,,,
Team A,Season 1,Game2,2022-01-02,,,
Team A,Season 1,Game3,2022-01-03,,,
Team A,Season 1,Game4,2022-01-04,4.0,6.0,8.0
Team A,Season 1,Game5,2022-01-05,6.0,9.0,12.0
Team A,Season 1,Game6,2022-01-06,8.0,12.0,16.0
Team A,Season 1,Game7,2022-01-07,10.0,15.0,20.0
Team A,Season 2,Game15,2022-01-15,,,
Team A,Season 2,Game16,2022-01-16,,,
Team A,Season 2,Game17,2022-01-17,,,


# Experiment 2: lagged 3-game rolling mean, partial windows allowed (min_periods=1)

In [116]:
# Same pipeline as a full-window rolling mean, except min_periods=1 lets the
# mean be computed from however many rows (1 to 3) the group has so far.
grouped_stats = df_sorted.groupby(group_cols2)[filtered_stats_cols]
partial_window_means = grouped_stats.rolling(window=3, min_periods=1).mean()

# Lag by one row within each group; only the first row of a group stays NaN.
roll_stats = partial_window_means.groupby(group_cols2).shift(1)
roll_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HOME_STAT1,HOME_STAT2,HOME_STAT3
HOME_TEAM_NAME,SEASON_ID,GAME_ID,GAME_DATE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Team A,Season 1,Game1,2022-01-01,,,
Team A,Season 1,Game2,2022-01-02,2.0,3.0,4.0
Team A,Season 1,Game3,2022-01-03,3.0,4.5,6.0
Team A,Season 1,Game4,2022-01-04,4.0,6.0,8.0
Team A,Season 1,Game5,2022-01-05,6.0,9.0,12.0
Team A,Season 1,Game6,2022-01-06,8.0,12.0,16.0
Team A,Season 1,Game7,2022-01-07,10.0,15.0,20.0
Team A,Season 2,Game15,2022-01-15,,,
Team A,Season 2,Game16,2022-01-16,30.0,45.0,60.0
Team A,Season 2,Game17,2022-01-17,31.0,46.5,62.0
