In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    LabelEncoder,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [12]:
# Load the three cleaned tables (pitchers, batters, plate appearances) in one statement.
pitchers, batters, plate_apps = (
    pd.read_csv("datasets/clean/pitchers.csv"),
    pd.read_csv("datasets/clean/batters.csv"),
    pd.read_csv("datasets/clean/plate_apps.csv"),
)

In [None]:
def preprocess_data(data, group_cols):
    """Standardise numeric columns and one-hot encode the others, group by group.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Columns picked up by ``select_dtypes(include='number')``
        are standardised; every other column is one-hot encoded with the first
        level dropped. Columns named in ``group_cols`` are used for grouping
        only and are not transformed.
    group_cols : list of str
        Column names to group by. The transformers are re-fitted on each
        group, so the scaling statistics are computed per group.

    Returns
    -------
    pandas.DataFrame
        Transformed features: the numeric columns first, followed by the
        one-hot columns. Both column sets are in sorted order because
        ``Index.difference`` sorts its result. With ``group_keys=True`` the
        group keys are prepended to the original index.
    """
    numeric_cols = data.select_dtypes(include='number').columns.difference(group_cols)
    categorical_cols = data.select_dtypes(exclude='number').columns.difference(group_cols)

    transformers = [('num', StandardScaler(), numeric_cols)]

    # Add categorical encoding only if there is something to encode.
    if len(categorical_cols) > 0:
        # Learn the category levels once on the full frame. Re-discovering them
        # inside each group would let both the set of output columns and the
        # level removed by drop='first' vary from group to group, so the
        # concatenated result would have misaligned, NaN-padded columns.
        categories = OneHotEncoder().fit(data[categorical_cols]).categories_
        transformers.append(
            ('cat', OneHotEncoder(categories=categories, drop='first'), categorical_cols)
        )

    # sparse_threshold=0 forces a dense ndarray. OneHotEncoder emits sparse
    # output, and with the default threshold the stacked result can be a scipy
    # sparse matrix, which cannot be wrapped in a DataFrame with these columns.
    preprocessor = ColumnTransformer(transformers, sparse_threshold=0)

    def preprocess_group(group):
        """Fit the transformers on one group and return the result as a DataFrame."""
        # `group` no longer contains group_cols (include_groups=False below).
        transformed_data = preprocessor.fit_transform(group)

        # Output order follows the transformer list: numeric, then categorical.
        column_names = list(numeric_cols)
        if len(categorical_cols) > 0:
            encoder = preprocessor.named_transformers_['cat']
            column_names += list(encoder.get_feature_names_out(categorical_cols))

        return pd.DataFrame(transformed_data, index=group.index, columns=column_names)

    # Group by the caller-supplied columns and preprocess each group independently.
    df_processed = data.groupby(group_cols, group_keys=True).apply(
        preprocess_group, include_groups=False
    )

    return df_processed

In [None]:
# Node features for modeling: index each table by key_retro, then preprocess per year.
pitchers_processed = preprocess_data(
    data=pitchers.set_index("key_retro"),
    group_cols=["year"],
)
batters_processed = preprocess_data(
    data=batters.set_index("key_retro"),
    group_cols=["year"],
)

In [51]:
def preprocess_edge_data(data):
    """Encode and scale the edge (plate-appearance) features.

    ``inning``, ``lp`` and ``outs_pre`` are ordinal-encoded, ``nump`` is
    standardised and ``num_times_faced_in_game`` is min-max scaled. Every
    other column of ``data`` is passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the five columns named above.

    Returns
    -------
    pandas.DataFrame
        The five transformed columns (in the order listed above) followed by
        the remaining columns in their original order, indexed like ``data``.
    """
    ordinal_cols = ["inning", "lp", "outs_pre"]
    standard_cols = ["nump"]
    minmax_cols = ["num_times_faced_in_game"]

    transformers = [
        ("ordinal", OrdinalEncoder(), ordinal_cols),
        ("num_std", StandardScaler(), standard_cols),
        ("num_minmax", MinMaxScaler(), minmax_cols),
    ]

    preprocessor = ColumnTransformer(transformers, remainder="passthrough")
    transformed_data = preprocessor.fit_transform(data)

    # ColumnTransformer emits the transformer outputs in list order and then
    # appends the passthrough columns in their input order.
    transformed_columns = ordinal_cols + standard_cols + minmax_cols
    # Read the remainder from the argument itself, not from a notebook-level
    # frame, so the labels are right for whatever frame is passed in.
    passthrough_columns = [
        col for col in data.columns if col not in transformed_columns
    ]

    transformed_df = pd.DataFrame(
        transformed_data,
        columns=transformed_columns + passthrough_columns,
        index=data.index,  # keep row labels aligned with the input
    )

    # Non-numeric passthrough columns may force the stacked array to object
    # dtype; infer_objects restores proper dtypes where possible.
    return transformed_df.infer_objects()

In [53]:
# Edge features for modeling, built from the plate-appearance table.
plate_apps_processed = preprocess_edge_data(data=plate_apps)