## Transferring Model

Transferring the predictive NBA matchup model from a standalone Python program to a Jupyter notebook.

In [None]:
# Full list of import statements used throughout the notebook - machine learning methods, API calls, data visualization, etc.
from nba_api.stats.endpoints import leaguegamefinder, boxscoretraditionalv2
import pandas as pd
import time
import json
from nba_api.stats.static import players
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Step 1: Data Collection and Initial Processing

In this section, we fetch historical game and player data using the `nba_api` library and save it to CSV files to avoid redundant API calls in future sessions. 

Each season is fetched in its own thread, all at the same time. Fetching sequentially with a one-second delay per request would mean roughly 1,280 games per season x 4 seasons = 5,120 requests, or about 85 minutes of fetching.

In [None]:
from nba_api.stats.endpoints import leaguegamefinder, boxscoretraditionalv2
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor
import os

# Retrieve the game rows that LeagueGameFinder reports for one season
def fetch_season_games(season):
    """Return the first LeagueGameFinder data frame for ``season``.

    ``season`` is passed through as ``season_nullable``. The function
    pauses for one second after the request before returning.
    """
    print(f"Fetching games for season {season}")
    finder = leaguegamefinder.LeagueGameFinder(season_nullable=season)
    result_frames = finder.get_data_frames()
    season_games = result_frames[0]

    # Pause between requests to stay clear of rate limiting
    time.sleep(1)
    return season_games

# Function to fetch player stats for each game in a given season's DataFrame
def fetch_player_stats_for_games(season, games_df):
    """Fetch the traditional box score of every distinct game in ``games_df``.

    Args:
        season: Season label, used only in progress and error messages.
        games_df: DataFrame with a ``GAME_ID`` column.

    Returns:
        One DataFrame with the first box-score frame of every game that
        could be fetched, or an empty DataFrame if none could.
    """
    player_stats = []

    # The same GAME_ID can occur on several rows of games_df (the captured
    # run output shows IDs being requested twice), so request each ID once.
    for game_id in games_df['GAME_ID'].drop_duplicates():
        print(f"Fetching player stats for Game ID {game_id} in season {season}")

        try:
            boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
            player_stats.append(boxscore.get_data_frames()[0])
        except Exception as e:
            # Best effort: report the failed game and keep going.
            print(f"Error fetching data for game {game_id} in season {season}: {e}")

        time.sleep(1)  # Respectful delay

    return pd.concat(player_stats, ignore_index=True) if player_stats else pd.DataFrame()

# Fetch one season's games and player stats and cache both as CSV files
def process_season_data(season):
    """Download and save the data for ``season`` unless both CSVs already exist.

    Writes ``games_data_<season>.csv`` and ``player_stats_data_<season>.csv``
    to the current working directory. Returns ``None``.
    """
    games_csv = f"games_data_{season}.csv"
    player_stats_csv = f"player_stats_data_{season}.csv"

    # Both files present means an earlier run finished this season.
    already_cached = all(os.path.exists(path) for path in (games_csv, player_stats_csv))
    if already_cached:
        print(f"Data for season {season} already exists. Skipping fetch.")
        return

    # Games first: the game list drives the per-game box-score requests.
    season_games = fetch_season_games(season)
    season_games.to_csv(games_csv, index=False)

    season_player_stats = fetch_player_stats_for_games(season, season_games)
    season_player_stats.to_csv(player_stats_csv, index=False)

    print(f"Data for season {season} saved to {games_csv} and {player_stats_csv}")

# Main function to fetch data in parallel for multiple seasons
def fetch_data_for_multiple_seasons(start_season='2020-21', end_season='2023-24'):
    seasons = [f"{year}-{str(year + 1)[-2:]}" for year in range(int(start_season.split('-')[0]), int(end_season.split('-')[0]) + 1)]
    
    # Use ThreadPoolExecutor to fetch each season's data in parallel
    with ThreadPoolExecutor(max_workers=len(seasons)) as executor:
        executor.map(process_season_data, seasons)

# Run the data fetching with the default range (2020-21 through 2023-24).
# Seasons whose two CSV files already exist are skipped by process_season_data.
fetch_data_for_multiple_seasons()

Fetching games for season 2020-21Fetching games for season 2021-22
Fetching games for season 2022-23

Fetching games for season 2023-24
Fetching player stats for Game ID 0042000406 in season 2020-21
Fetching player stats for Game ID 0042300405 in season 2023-24
Fetching player stats for Game ID 0042200405 in season 2022-23
Fetching player stats for Game ID 0042100406 in season 2021-22
Fetching player stats for Game ID 0042000406 in season 2020-21
Fetching player stats for Game ID 0042000405 in season 2020-21
Fetching player stats for Game ID 0042200405 in season 2022-23
Fetching player stats for Game ID 0042100406 in season 2021-22
Fetching player stats for Game ID 0042300405 in season 2023-24
Fetching player stats for Game ID 0042000405 in season 2020-21
Fetching player stats for Game ID 0042100405 in season 2021-22
Fetching player stats for Game ID 0042200404 in season 2022-23
Fetching player stats for Game ID 0042300404 in season 2023-24
Fetching player stats for Game ID 0042000404 

## Step 2: Load Preprocessed Data

We load the data from saved CSV files. This allows us to proceed directly to model training without querying the API each time.

In [None]:
def load_combined_data(start_season='2020-21', end_season='2023-24'):
    """Load the cached per-season CSV files and concatenate them.

    Args:
        start_season: First season label, e.g. ``'2020-21'``.
        end_season: Last season label (inclusive).

    Returns:
        A ``(games_df, player_stats_df)`` tuple. Seasons whose file is
        missing are skipped; a side with no files at all is an empty
        DataFrame.
    """
    # Generate a list of seasons
    seasons = [f"{year}-{str(year + 1)[-2:]}" for year in range(int(start_season.split('-')[0]), int(end_season.split('-')[0]) + 1)]

    # Game IDs such as 0042000406 start with zeros; reading them as strings
    # keeps those zeros, which integer parsing would drop.
    id_dtypes = {'GAME_ID': str}

    # Load each season's games and player stats and concatenate
    all_games = []
    all_player_stats = []

    for season in seasons:
        games_csv = f"games_data_{season}.csv"
        player_stats_csv = f"player_stats_data_{season}.csv"

        if os.path.exists(games_csv):
            all_games.append(pd.read_csv(games_csv, dtype=id_dtypes))

        if os.path.exists(player_stats_csv):
            all_player_stats.append(pd.read_csv(player_stats_csv, dtype=id_dtypes))

    # Combine all seasons' data
    combined_games_df = pd.concat(all_games, ignore_index=True) if all_games else pd.DataFrame()
    combined_player_stats_df = pd.concat(all_player_stats, ignore_index=True) if all_player_stats else pd.DataFrame()

    return combined_games_df, combined_player_stats_df

# Load combined data when needed.
# Uses the default season range; a side with no CSV files on disk comes
# back as an empty DataFrame.
games_df, player_stats_df = load_combined_data()
print("Combined games and player stats data loaded.")


## Step 3: Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: For better visualization aesthetics
sns.set(style='whitegrid')

# Seasons cached by the data-collection step
season_labels = ['2020-21', '2021-22', '2022-23', '2023-24']

# Load game statistics for every season. Previously only 2020-21 reached
# df_matchups and the other three seasons overwrote one another.
df_matchups = pd.concat(
    [pd.read_csv(f'games_data_{label}.csv', dtype={'GAME_ID': str}) for label in season_labels],
    ignore_index=True,
)

# Load player statistics for every season.
# NOTE(review): this cell used to read the games_data_* files into
# df_players; the player_stats_data_* files written in Step 1 look like the
# intended source - confirm.
df_players = pd.concat(
    [pd.read_csv(f'player_stats_data_{label}.csv', dtype={'GAME_ID': str}) for label in season_labels],
    ignore_index=True,
)

# Load team statistics
df_teams = pd.read_csv('team_stats_2023_24_nba_api.csv')

# Optionally, load game data if available
df_games = pd.read_csv('nba_games.csv')  # Replace with your actual game data file

## Step 4: Data Cleaning

In [None]:
# Check missing values
print(df_players[['Points', 'Rebounds', 'Assists']].isnull().sum())

# Option 1: Impute missing values with the mean or median.
# Assign the result back: calling fillna(inplace=True) on a selected column
# is chained assignment, which pandas deprecates and which does not update
# the frame under copy-on-write.
df_players['Points'] = df_players['Points'].fillna(df_players['Points'].mean())
df_players['Rebounds'] = df_players['Rebounds'].fillna(df_players['Rebounds'].median())

# Option 2: Drop rows with missing values (if minimal)
df_players.dropna(subset=['Assists'], inplace=True)

# Verify that missing values are handled
print(df_players[['Points', 'Rebounds', 'Assists']].isnull().sum())

# Converting Data Types

# Convert 'Season' to string if not already
df_players['Season'] = df_players['Season'].astype(str)
df_teams['Season'] = df_teams['Season'].astype(str)

# Convert date columns to datetime
if 'Date' in df_games.columns:
    df_games['Date'] = pd.to_datetime(df_games['Date'])

# Remove Duplicates

# Remove duplicate player records
df_players.drop_duplicates(inplace=True)

# Remove duplicate team records
df_teams.drop_duplicates(inplace=True)

# Remove duplicate game records
if 'Game_ID' in df_games.columns:
    df_games.drop_duplicates(subset=['Game_ID'], inplace=True)

## Step 5: Data Visualization

In [None]:
# Compute correlation matrix for team statistics.
# df_teams['Season'] was cast to str during cleaning, so restrict the
# calculation to numeric columns; recent pandas raises on string columns.
corr_matrix = df_teams.corr(numeric_only=True)

# Plot heatmap
plt.figure(figsize=(12,10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Team Statistics')
plt.show()

## Feature Engineering

1. Calculating win-loss ratios for home and away teams.
2. Feature scaling

In [None]:
# Assuming df_games has 'Team', 'Opponent', 'Win' (binary: 1 if Team won, 0 otherwise)

# Calculate overall win-loss records: wins and games played per team
team_records = df_games.groupby('Team').agg({'Win': 'sum', 'Game_ID': 'count'}).rename(columns={'Game_ID': 'Total_Games'})
team_records['Win_Loss_Ratio'] = team_records['Win'] / team_records['Total_Games']
# NOTE(review): the ratio is computed over every row of df_games, including
# the game each row describes - confirm this is acceptable for a feature.

# Merge back to df_games for home and away teams.
# The first merge adds 'Win_Loss_Ratio' for 'Team'; the second adds the same
# column for 'Opponent', and the suffixes rename the pair to
# 'Win_Loss_Ratio_Home' / 'Win_Loss_Ratio_Away'.
df_games = df_games.merge(team_records[['Win_Loss_Ratio']], left_on='Team', right_index=True, how='left')
df_games = df_games.merge(team_records[['Win_Loss_Ratio']], left_on='Opponent', right_index=True, how='left', suffixes=('_Home', '_Away'))

# Display updated DataFrame. 'Win' is not duplicated by the merges, so there
# are no 'Win_Home' / 'Win_Away' columns to select.
print(df_games[['Team', 'Opponent', 'Win', 'Win_Loss_Ratio_Home', 'Win_Loss_Ratio_Away']].head())

In [None]:
from sklearn.preprocessing import StandardScaler

# Identify numerical feature columns
# NOTE(review): only the two Win_Loss_Ratio_* columns are created by the
# visible cells; the rating and matchup columns are presumably added
# elsewhere - confirm they exist in df_games before running.
numerical_features = ['OFF_RATING_Home', 'DEF_RATING_Home', 'OFF_RATING_Away', 'DEF_RATING_Away',
                      'Home_Off_vs_Away_Def', 'Away_Off_vs_Home_Def', 'Win_Loss_Ratio_Home', 'Win_Loss_Ratio_Away']

# Initialize scaler (reused later to transform new game data)
scaler = StandardScaler()

# Fit and transform the numerical features, overwriting them in df_games.
# The scaler is fit on all rows of df_games, before any train/test split.
df_games[numerical_features] = scaler.fit_transform(df_games[numerical_features])

# Display scaled features
print(df_games[numerical_features].head())

In [None]:
# Feature engineering helpers for player props
def get_player_id_name_mapping():
    """Return a DataFrame with the ``id`` and ``full_name`` of every player
    listed by ``players.get_players()``."""
    wanted_columns = ['id', 'full_name']
    return pd.DataFrame(players.get_players())[wanted_columns]

def add_player_names(player_stats_df):
    """Attach a ``full_name`` column to ``player_stats_df``.

    Left-joins the id/name mapping on ``PLAYER_ID`` and drops the helper
    ``id`` column again. Returns a new DataFrame.
    """
    id_to_name = get_player_id_name_mapping()
    with_names = player_stats_df.merge(
        id_to_name,
        left_on='PLAYER_ID',
        right_on='id',
        how='left',
    )
    return with_names.drop(columns=['id'])

def clean_missing_tov(player_stats_df):
    """Fill missing ``TOV`` values with the column median.

    The column is replaced on the frame that was passed in, and that same
    frame is returned.
    """
    median_tov = player_stats_df['TOV'].median()
    filled_tov = player_stats_df['TOV'].fillna(median_tov)
    player_stats_df['TOV'] = filled_tov
    return player_stats_df

def _minutes_as_float(minutes):
    """Return a MIN column as float minutes, parsing "MM:SS" text if needed."""
    if pd.api.types.is_numeric_dtype(minutes):
        return minutes.astype(float)

    # Text minutes: split "MM:SS" once; unparseable pieces become NaN.
    parts = minutes.astype(str).str.split(':', n=1, expand=True)
    whole_minutes = pd.to_numeric(parts[0], errors='coerce')
    if parts.shape[1] > 1:
        seconds = pd.to_numeric(parts[1], errors='coerce').fillna(0)
    else:
        seconds = 0
    return whole_minutes + seconds / 60


def calculate_recent_averages(player_stats_df, n=5):
    """
    Calculate recent averages (e.g., last n games) for each player in points, rebounds and assists,
    plus per-minute rates for the same statistics.

    Args:
        player_stats_df: DataFrame with PLAYER_ID, GAME_DATE, PTS, REB, AST and MIN columns.
        n: Window size, in games, of the rolling average.

    Returns:
        A new DataFrame sorted by PLAYER_ID and GAME_DATE with six added columns:
        ``<STAT>_avg_last_n`` and ``<STAT>_per_min`` for PTS, REB and AST. The input is not modified.

    The rolling window ends at the current row, so each average includes that row's own game.
    Per-minute rates are NaN when minutes are zero, missing or unparseable (instead of inf).
    """
    stats = ('PTS', 'REB', 'AST')

    # Sort data by player and game date; sort_values returns a new frame
    player_stats_df = player_stats_df.sort_values(['PLAYER_ID', 'GAME_DATE'])

    # Calculate rolling averages over the last n games for each player (excluding TOV)
    for stat in stats:
        player_stats_df[f'{stat}_avg_last_n'] = player_stats_df.groupby('PLAYER_ID')[stat].transform(
            lambda games: games.rolling(n, min_periods=1).mean()
        )

    # Add per-minute statistics (excluding TOV). Non-positive minutes are
    # masked to NaN so the division cannot produce inf.
    minutes = _minutes_as_float(player_stats_df['MIN'])
    minutes = minutes.where(minutes > 0)
    for stat in stats:
        player_stats_df[f'{stat}_per_min'] = player_stats_df[stat] / minutes

    return player_stats_df


def integrate_team_and_player_data(team_data, player_data):
    """Left-join per-team player averages onto ``team_data``.

    The six engineered player columns are averaged per (GAME_ID, TEAM_ID)
    and merged onto ``team_data`` with those two keys; team rows with no
    matching players get NaN in the new columns.
    """
    join_keys = ['GAME_ID', 'TEAM_ID']
    averaged_columns = [
        'PTS_avg_last_n',
        'REB_avg_last_n',
        'AST_avg_last_n',
        'PTS_per_min',
        'REB_per_min',
        'AST_per_min',
    ]

    # Collapse the player rows to one row per team per game
    per_team_means = player_data.groupby(join_keys)[averaged_columns].mean().reset_index()

    return team_data.merge(per_team_means, on=join_keys, how='left')

# Fetch box scores for the given games and apply the player feature helpers
def get_player_stats_for_games(games_df):
    """Fetch per-player box scores for ``games_df`` and engineer features.

    Each distinct GAME_ID is requested once. Names, TOV cleaning and the
    rolling averages are applied to the combined frame, because a rolling
    average over a player's recent games needs more than one game's rows.

    Args:
        games_df: DataFrame with a ``GAME_ID`` column and, ideally, a
            ``GAME_DATE`` column.

    Returns:
        The combined, feature-enriched player DataFrame, or an empty
        DataFrame if no box score could be fetched.
    """
    box_scores = []
    for game_id in games_df['GAME_ID'].drop_duplicates():
        try:
            boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
            box_scores.append(boxscore.get_data_frames()[0])
        except Exception as e:
            # Best effort: report the failed game and continue.
            print(f"Error fetching data for game {game_id}: {e}")
        time.sleep(1)  # Respectful delay between requests

    # pd.concat raises on an empty list, so return early instead.
    if not box_scores:
        return pd.DataFrame()

    player_stats_df = pd.concat(box_scores, ignore_index=True)
    player_stats_df = add_player_names(player_stats_df)
    player_stats_df = clean_missing_tov(player_stats_df)

    # calculate_recent_averages sorts by GAME_DATE; take it from games_df
    # when the box scores do not carry it.
    # NOTE(review): assumes the box-score GAME_ID values are strings - confirm.
    if 'GAME_DATE' not in player_stats_df.columns and 'GAME_DATE' in games_df.columns:
        game_dates = (
            games_df[['GAME_ID', 'GAME_DATE']]
            .drop_duplicates(subset='GAME_ID')
            .astype({'GAME_ID': str})
        )
        player_stats_df = player_stats_df.merge(game_dates, on='GAME_ID', how='left')

    return calculate_recent_averages(player_stats_df)

## Step 6: ML Implementation
1. Insert #11 from 11/7 notes on streamlining ML process

In [None]:
# Define target variable: 'Win' (1 if Home team wins, 0 otherwise)
y = df_games['Win']

# Define feature variables by excluding non-predictive columns
# Assuming 'Win' is the target and 'Game_ID', 'Date' are non-predictive.
# The cleaning step treats 'Date' and 'Game_ID' as optional columns, so a
# missing one must not abort the cell.
X = df_games.drop(columns=['Win', 'Game_ID', 'Date'], errors='ignore')

# One-hot encode the team identifiers. The estimators need numeric input,
# and the new-game example in Step 8 expects columns such as 'Team_Lakers'
# and 'Opponent_Bulls'.
# NOTE(review): any other non-numeric column left in X still needs encoding
# or dropping - confirm against the real df_games schema.
team_columns = [name for name in ('Team', 'Opponent') if name in X.columns]
X = pd.get_dummies(X, columns=team_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Split data: 80% training, 20% testing
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the splits
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Training Labels Shape: {y_train.shape}")
print(f"Testing Labels Shape: {y_test.shape}")

In [None]:
# Logistic Regression Implementation

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the model with a raised iteration cap (max_iter=1000)
logreg = LogisticRegression(max_iter=1000)

# Train the model
logreg.fit(X_train, y_train)

# Make predictions on the held-out test split
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Random forest training helper
def train_random_forest(X_train, y_train):
    """Fit and return a 100-tree ``RandomForestClassifier`` (``random_state=42``)."""
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    return forest.fit(X_train, y_train)

# Train Random Forest Model
rf_model = train_random_forest(X_train, y_train)

# Make predictions with the fitted forest. RFE is the feature-selection
# class imported above, not a fitted model, so it cannot be used here.
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\nRandom Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# Get feature importances from Random Forest
importances = rf_model.feature_importances_
feature_names = X.columns

# Create a Series for visualization, most important feature first
feat_importances = pd.Series(importances, index=feature_names)
feat_importances = feat_importances.sort_values(ascending=False)

# Plot feature importances
top_importances = feat_importances.head(20)
plt.figure(figsize=(12,8))
sns.barplot(x=top_importances, y=top_importances.index)
plt.title('Top 20 Feature Importances from Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()

In [None]:
# Evaluation Metrics

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall, ROC-AUC, the confusion matrix and
    the classification report of ``model`` on the given test data.

    ``model`` must provide ``predict`` and ``predict_proba``. Returns ``None``.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 holds the probability of the positive class (Win)
    win_probability = model.predict_proba(X_test)[:, 1]

    # All four scores are computed before anything is printed
    scores = {
        "Accuracy": accuracy_score(y_test, predicted_labels),
        "Precision": precision_score(y_test, predicted_labels),
        "Recall": recall_score(y_test, predicted_labels),
        "ROC-AUC": roc_auc_score(y_test, win_probability),
    }

    for label, value in scores.items():
        print(f"{label}: {value:.2f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predicted_labels))
    print("Classification Report:")
    print(classification_report(y_test, predicted_labels))

# Evaluate the Random Forest Model
print("Random Forest Performance:")
evaluate_model(rf_model, X_test, y_test)


# Integration of a Precision-Recall Curve
from sklearn.metrics import precision_recall_curve

# Probability of the positive class from the forest. The probabilities that
# evaluate_model computes are local to that function, so compute them here.
y_prob = rf_model.predict_proba(X_test)[:, 1]

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Plot Precision-Recall Curve
plt.figure(figsize=(8,6))
plt.plot(recall, precision, label='Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='upper right')
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred_rf)

# Plot confusion matrix. The class labels come from the fitted forest
# (rf_model); RFE is an unrelated feature-selection class with no classes_.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Random Forest Confusion Matrix')
plt.show()

## Step 7: Hyperparameter Tuning
1. RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define parameter distribution
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None] + list(range(10, 31, 10)),
    'min_samples_split': randint(2, 11)
}

# Initialize RandomizedSearchCV.
# The estimator must be an estimator instance whose parameters match
# param_dist; a fresh random forest is used (previously the RFE class).
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=50, cv=5, n_jobs=-1, verbose=2, scoring='accuracy',
                                   random_state=42)

# Perform random search (5-fold cross-validation on the training split)
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters from Random Search:", random_search.best_params_)

# Best estimator
best_rf_random = random_search.best_estimator_

# Make predictions
y_pred_best_rf_random = best_rf_random.predict(X_test)

# Evaluate
print("Random Search Best Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_rf_random))
print("\nRandom Search Best Random Forest Classification Report:\n", classification_report(y_test, y_pred_best_rf_random))

In [None]:
import optuna
from sklearn.metrics import accuracy_score

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of a random forest.

    Candidates are scored on the training split only. Scoring them on
    X_test would tune the hyperparameters to the test set and make the
    later test accuracy an optimistic estimate.

    Args:
        trial: The Optuna trial that supplies the hyperparameter suggestions.

    Returns:
        Mean cross-validated accuracy, which the study maximizes.
    """
    # Local import keeps this cell self-contained
    from sklearn.model_selection import cross_val_score

    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)

    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   random_state=42)

    # Same fold count and metric as the RandomizedSearchCV cell
    fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return fold_scores.mean()

# Create a study object; the objective returns an accuracy, so maximize it
study = optuna.create_study(direction='maximize')

# Optimize (50 trials of the objective defined above)
study.optimize(objective, n_trials=50)

# Best parameters
print("Best Parameters:", study.best_params_)

# Train final model with best parameters.
# The keys of best_params_ are the names passed to trial.suggest_int, which
# match RandomForestClassifier keyword arguments.
best_rf_optuna = RandomForestClassifier(**study.best_params_, random_state=42)
best_rf_optuna.fit(X_train, y_train)

# Evaluate
y_pred_best_optuna = best_rf_optuna.predict(X_test)
print("Optimized Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_optuna))

## Step 8: Trained Model + Future Predictions

In [None]:
import joblib

# Save the model to a file (the best estimator found by the randomized search)
joblib.dump(best_rf_random, 'best_random_forest_model.joblib')

# Later, load the model
loaded_model = joblib.load('best_random_forest_model.joblib')

# Make predictions with the loaded model to confirm it survived the round trip
y_pred_loaded = loaded_model.predict(X_test)
print("Loaded Model Accuracy:", accuracy_score(y_test, y_pred_loaded))

In [None]:
# Preparing new game data

# Synthetic, example data
new_game = {
    'OFF_RATING_Home': 110.5,
    'DEF_RATING_Home': 105.3,
    'OFF_RATING_Away': 108.2,
    'DEF_RATING_Away': 106.7,
    'Home_Off_vs_Away_Def': 110.5 - 106.7,
    'Away_Off_vs_Home_Def': 108.2 - 105.3,
    'Win_Loss_Ratio_Home': 0.6,
    'Win_Loss_Ratio_Away': 0.55,
    # Include encoded team features
    'Team_Lakers': 1,  # Example for 'Team' one-hot encoding
    'Opponent_Bulls': 1  # Example for 'Opponent' one-hot encoding
    # Add other necessary features
}

# Convert to a one-row DataFrame
new_game_df = pd.DataFrame([new_game])

# Scale numerical features using the same scaler
# (transform only, so the statistics fitted on df_games are reused)
new_game_df[numerical_features] = scaler.transform(new_game_df[numerical_features])

# Align the new game features with the training features
# Ensure that all feature columns are present
missing_cols = set(X.columns) - set(new_game_df.columns)
for col in missing_cols:
    new_game_df[col] = 0  # or appropriate default value

# Reorder columns to match training data
# (this also drops any key of new_game that is not a column of X)
new_game_df = new_game_df[X.columns]

# Display the prepared new game data
print(new_game_df.head())


In [None]:
# Predict the outcome
prediction = loaded_model.predict(new_game_df)
# Column 1 of predict_proba is the probability of the second class in
# loaded_model.classes_ (presumably class 1, a home win - confirm)
prediction_proba = loaded_model.predict_proba(new_game_df)[:,1]

# Interpret the prediction
if prediction[0] == 1:
    print("Home Team is predicted to win.")
else:
    print("Away Team is predicted to win.")

print(f"Prediction Probability: {prediction_proba[0]:.2f}")

## Step 9: Deployment to website

In [None]:
# Step 1d: Merging game data and player data.
def integrate_game_and_player_data(games_df, player_stats_df):
    """Build one row per game from team-level and player-level statistics.

    Args:
        games_df: DataFrame with ``GAME_ID`` and ``TEAM_ID`` columns.
        player_stats_df: DataFrame with ``GAME_ID`` and ``TEAM_ID`` columns
            and possibly several rows per team per game.

    Returns:
        DataFrame with a ``GAME_ID`` column and one ``<stat>_<TEAM_ID>``
        column per numeric statistic and team. Player columns whose name
        clashes with a games column carry the ``_player`` suffix.
    """
    # Merge player statistics with the game data
    merged_df = games_df.merge(player_stats_df, on=['GAME_ID', 'TEAM_ID'], how='inner', suffixes=('', '_player'))

    # Only numeric statistics can be averaged; the keys are not values.
    value_columns = [
        name for name in merged_df.select_dtypes(include='number').columns
        if name not in ('GAME_ID', 'TEAM_ID')
    ]

    # The merge yields one row per player, so (GAME_ID, TEAM_ID) repeats and
    # a plain pivot raises on the duplicates. pivot_table averages the
    # repeated rows into one value per team per game.
    game_data = merged_df.pivot_table(index='GAME_ID', columns='TEAM_ID', values=value_columns, aggfunc='mean')

    # Flatten multi-level columns and rename for clarity
    game_data.columns = ['_'.join(map(str, col)).strip() for col in game_data.columns.values]
    game_data.reset_index(inplace=True)

    return game_data

# Integrate game-level data with player statistics
# (games_df and player_stats_df are the frames loaded in Step 2)
final_game_data = integrate_game_and_player_data(games_df, player_stats_df)

In [None]:
# Step 2: Prepare the DataFrame
# load_combined_data() returns a (games, player stats) tuple; only the
# game rows are used here.
games, _ = load_combined_data()

# Derive the binary target when it is not already present.
# NOTE(review): assumes the games frame carries the 'WL' column
# ('W' / 'L') returned by LeagueGameFinder - confirm against the CSV schema.
if 'WIN' not in games.columns:
    games = games.dropna(subset=['WL'])  # rows without a recorded result
    games['WIN'] = (games['WL'] == 'W').astype(int)

# NOTE(review): PTS and PLUS_MINUS are end-of-game values for the same game
# whose result is predicted - confirm they are legitimate features.
X = games[['PTS', 'FG_PCT', 'FT_PCT', 'REB', 'AST', 'TOV', 'PLUS_MINUS']]
y = games['WIN']

In [None]:
# Step 3: Set up a Pipeline with Imputation and Scaling
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())                 # Scale features
])

# Apply the pipeline to transform the feature data
# NOTE(review): the imputer and scaler are fit on every row of X, before the
# train/test split in the next cell - confirm this is intended.
X_preprocessed = pipeline.fit_transform(X)

In [None]:
# Step 4: Split the Data
# Rebinds X_train / X_test / y_train / y_test, replacing the split made
# earlier in the notebook from the df_games features.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [None]:
# Step 7: Predicting Winners for Upcoming Games
def predict_upcoming_games(game_data_list, model, pipeline):
    """
    Predict the winners of upcoming games.

    game_data_list: List of dictionaries, each with keys 'PTS', 'FG_PCT', 'FT_PCT', 'REB', 'AST', 'TOV', 'PLUS_MINUS'.
    model: Trained model exposing predict and predict_proba.
    pipeline: Fitted preprocessing pipeline exposing transform.

    Returns a list with one dictionary per game holding the 1-based game
    number, the "Win"/"Loss" label and the probability taken from column 1
    of predict_proba.
    """
    # Build a frame from the raw dictionaries and preprocess it
    features = pipeline.transform(pd.DataFrame(game_data_list))

    labels = model.predict(features)
    win_probabilities = model.predict_proba(features)[:, 1]  # Probability of Win

    return [
        {
            "Game": game_number,
            "Prediction": "Win" if label == 1 else "Loss",
            "Win Probability": probability,
        }
        for game_number, (label, probability) in enumerate(zip(labels, win_probabilities), start=1)
    ]

# Example usage for predicting upcoming games
# Each dictionary carries the seven feature columns selected in Step 2.
upcoming_games = [
    {
        'PTS': 112,
        'FG_PCT': 0.47,
        'FT_PCT': 0.76,
        'REB': 43,
        'AST': 24,
        'TOV': 13,
        'PLUS_MINUS': 4
    },
    {
        'PTS': 108,
        'FG_PCT': 0.45,
        'FT_PCT': 0.74,
        'REB': 40,
        'AST': 22,
        'TOV': 15,
        'PLUS_MINUS': -3
    }
]

# Predict winners for upcoming games
# NOTE(review): rf_model is the forest fit earlier on the df_games feature
# matrix; no visible cell refits it on the seven-column X_train produced
# above - confirm the model and the pipeline share the same features.
predictions = predict_upcoming_games(upcoming_games, rf_model, pipeline)
print("Predictions for Upcoming Games:")
for prediction in predictions:
    print(prediction)

In [None]:
# Step 11: Integrate Game Data with Aggregated Player Data
def integrate_game_and_player_data(games_df, player_stats_df):
    """Build one row per game from team-level and player-level statistics.

    Args:
        games_df: DataFrame with ``GAME_ID`` and ``TEAM_ID`` columns.
        player_stats_df: DataFrame with ``GAME_ID`` and ``TEAM_ID`` columns
            and possibly several rows per team per game.

    Returns:
        DataFrame with a ``GAME_ID`` column and one ``<stat>_<TEAM_ID>``
        column per numeric statistic and team. Player columns whose name
        clashes with a games column carry the ``_player`` suffix.
    """
    # Merge player statistics with the game data
    merged_df = games_df.merge(player_stats_df, on=['GAME_ID', 'TEAM_ID'], how='inner', suffixes=('', '_player'))

    # Only numeric statistics can be averaged; the keys are not values.
    value_columns = [
        name for name in merged_df.select_dtypes(include='number').columns
        if name not in ('GAME_ID', 'TEAM_ID')
    ]

    # The merge yields one row per player, so (GAME_ID, TEAM_ID) repeats and
    # a plain pivot raises on the duplicates. pivot_table averages the
    # repeated rows into one value per team per game.
    game_data = merged_df.pivot_table(index='GAME_ID', columns='TEAM_ID', values=value_columns, aggfunc='mean')

    # Flatten multi-level columns and rename for clarity
    game_data.columns = ['_'.join(map(str, col)).strip() for col in game_data.columns.values]
    game_data.reset_index(inplace=True)

    return game_data

In [None]:
# Step 12: Train the Model with Historical Data
def train_model_with_historical_data():
    """Fetch historical data, train a random forest on it and return the model.

    Pulls games via ``get_historical_game_data`` and player box scores via
    ``get_player_stats_for_games``, merges them into one table, holds out
    20% of the rows for testing, and prints accuracy and a classification
    report for the held-out rows.
    """
    # Games plus individual player data since the Bubble SZN
    historical_games = get_historical_game_data()
    historical_player_stats = get_player_stats_for_games(historical_games)

    # One table combining game-level and player-level statistics
    dataset = integrate_game_and_player_data(historical_games, historical_player_stats)

    # Features are everything except the target column.
    # Replace 'WIN' with the actual target column name.
    # NOTE(review): confirm the integrated table really has a 'WIN' column.
    features = dataset.drop(columns=['WIN'])
    target = dataset['WIN']

    # Hold out 20% of the rows for evaluation
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Fit the classifier
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(features_train, target_train)

    # Report held-out performance
    held_out_predictions = forest.predict(features_test)
    print("Accuracy:", accuracy_score(target_test, held_out_predictions))
    print(classification_report(target_test, held_out_predictions))

    return forest

# Train the model with historical data
# The guard keeps this from running when the code is imported as a module;
# the trained forest is kept in `model`.
if __name__ == "__main__":
    print("Training model with historical data...")
    model = train_model_with_historical_data()