# Generate Bat Outcome
 - Given that the batter has made contact, and given the characteristics of the pitch, what is the batting outcome?
 - I.e., what is the `launch_speed_angle`, and what is the `hit_location`?

## Potential Difficulties:
 - How to factor in bunting?

In [26]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.features import build_features as f
from src.data import data_utils as du
from importlib import reload
# Re-import build_features so edits made to that module are picked up
# without restarting the kernel.
reload(f)

# Numeric batter identifiers (presumably MLB/Statcast player IDs — TODO confirm).
vladdy = 665489
soto = 665742
schneider = 676914
biggio = 624415
# The batter whose data is loaded below; change this to analyse another player.
batter = biggio
# Load the hit-outcome dataset for the selected batter, already split into
# train/test features (X_*) and targets (y_*). The column schema and the
# contents of `encoders` are defined in build_features (not shown here).
X_train, y_train, X_test, y_test, encoders = f.get_hit_outcome_dataset(batter, split=True)

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from bisect import bisect
from copy import deepcopy
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot
from sdv.metadata import SingleTableMetadata
from scipy import stats
import time

#X_train = X_train[['release_speed', 'distance_factor']]
#X_test = X_test[['release_speed', 'distance_factor']]

def fit_regressors(X, y):
    """Fit one grid-searched decision-tree regressor per target column.

    Args:
        X: feature matrix passed unchanged to every grid search.
        y: DataFrame of targets; each column is fitted independently.

    Returns:
        dict mapping each column name of ``y`` to a copy of the best
        ``DecisionTreeRegressor`` found by 5-fold cross-validation on R^2.
    """
    # The search space is identical for every target column.
    search_space = {
        'max_depth': [3, 5],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
    }

    def _best_tree_for(target):
        # Run a fresh grid search for a single target and keep its winner.
        search = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid=search_space,
            cv=5,
            scoring='r2',
        )
        search.fit(X, target)
        return deepcopy(search.best_estimator_)

    return {name: _best_tree_for(y[name]) for name in y.columns}


def fit_regressors2(X, y):
    """Fit one grid-searched ridge regressor per target column.

    Args:
        X: feature matrix passed unchanged to every grid search.
        y: DataFrame of targets; each column is fitted independently.

    Returns:
        dict mapping each column name of ``y`` to a copy of the best
        ``Ridge`` model found by 5-fold cross-validation on R^2 over the
        ``alpha`` grid below.
    """
    # Ridge is not imported by the notebook's import cells, so calling this
    # function used to raise NameError. Import it here so the function is
    # self-contained.
    from sklearn.linear_model import Ridge

    # Regularisation strengths to try; shared by every target column.
    param_grid = {
        'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
    }

    regressors = {}
    for col in y.columns:
        grid_search = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=5, scoring='r2')
        grid_search.fit(X, y[col])
        regressors[col] = deepcopy(grid_search.best_estimator_)
    return regressors

def regressor_diagnostics(regressors, X, y):
    """Print the R^2 and mean squared error of every per-column regressor.

    Args:
        regressors: mapping from target column name to a fitted model
            exposing ``predict``.
        X: features to predict from.
        y: DataFrame of true targets; one report line is printed per column.
    """
    for col in y.columns:
        predicted = regressors[col].predict(X)
        actual = y[col]
        r2, mse = r2_score(actual, predicted), mean_squared_error(actual, predicted)
        print(f'{col}: r2 {r2:.2f}, mse {mse}')
    
def get_density(X):
    """Estimate the empirical PDF and CDF of a one-dimensional sample.

    Args:
        X: sample values. A single-column DataFrame, a Series, or any
            array-like is accepted; the input is flattened to 1-D.

    Returns:
        Tuple ``(x, pdf, cdf)`` where ``x`` holds 1,001 equally spaced bin
        edges from the sample minimum to the sample maximum, ``pdf`` holds
        the 1,000 histogram densities, and ``cdf`` is the running integral
        of ``pdf`` (one value per bin, evaluated at the bin's right edge).

    Raises:
        ValueError: if ``X`` contains no values.
    """
    # Flatten up front: the previous `min(X.values)[0]` only worked for a
    # single-column DataFrame and failed on Series / plain arrays.
    values = np.asarray(X, dtype=float).ravel()
    if values.size == 0:
        raise ValueError("get_density requires at least one value")

    edges = np.linspace(values.min(), values.max(), 1_001)
    pdf, x = np.histogram(values, edges, density=True)
    # Bins are equally wide, so the integral is cumsum(pdf) * bin width.
    cdf = np.cumsum(pdf) * (x[1] - x[0])
    return x, pdf, cdf
    
def generate_sample(quantile_func, n=100_000, rng=None):
    """Draw ``n`` samples by inverse-transform sampling.

    Args:
        quantile_func: callable mapping an array of probabilities in
            [0, 1) to values of the target distribution.
        n: number of samples to draw.
        rng: optional ``numpy.random.Generator`` used for the uniform
            draws, which makes the sample reproducible. When omitted, the
            global ``np.random`` state is used, as before.

    Returns:
        Whatever ``quantile_func`` returns for the ``n`` uniform draws.
    """
    if rng is None:
        uniform_draws = np.random.uniform(size=n)
    else:
        uniform_draws = rng.uniform(size=n)
    return quantile_func(uniform_draws)
    
def get_quantile_func(X):
    """Build an approximate quantile (inverse-CDF) function for a sample.

    Args:
        X: sample values, forwarded to ``get_density``.

    Returns:
        A callable that maps a probability, or an array of probabilities,
        to the left edge of the histogram bin in which the empirical CDF
        first exceeds that probability. Probabilities above the last CDF
        value map to the final bin edge (the sample maximum).
    """
    x, _, cdf = get_density(X)

    def quantile(val):
        # side='right' reproduces bisect.bisect (bisect_right) for every
        # element at once, and unlike np.vectorize it also accepts empty
        # input. The largest possible index is len(cdf), which is still a
        # valid position in x because x has one more entry than cdf.
        idx = np.searchsorted(cdf, val, side='right')
        return np.asarray(x[idx])

    return quantile

def compute_resids(regressors, X_train, y_train):
    """Return the per-column residuals (actual minus predicted) as a DataFrame.

    Args:
        regressors: mapping from target column name to a fitted model
            exposing ``predict``.
        X_train: features handed to every model's ``predict``.
        y_train: DataFrame of observed targets.

    Returns:
        DataFrame with one residual column per column of ``y_train``.
    """
    return pd.DataFrame({
        name: y_train[name] - regressors[name].predict(X_train)
        for name in y_train.columns
    })

def generate_joint_samples(y_train, n_samples=10_000, rng=None):
    """Sample synthetic rows that mimic the joint distribution of ``y_train``.

    Correlated standard-normal draws are pushed through the normal CDF to
    obtain correlated uniforms, which are then mapped through each column's
    empirical quantile function (a Gaussian-copula style construction).

    Args:
        y_train: DataFrame of observed values; any number of columns.
        n_samples: number of synthetic rows to generate.
        rng: optional ``numpy.random.Generator`` for reproducible draws.
            When omitted, the global ``np.random`` state is used, as before.

    Returns:
        DataFrame with ``n_samples`` rows and the same columns as ``y_train``.
    """
    columns = list(y_train.columns)

    # Marginal quantile function for each column.
    quantile_funcs = {col: get_quantile_func(y_train[[col]]) for col in columns}

    # Correlation between columns. atleast_2d covers the single-column case,
    # where np.corrcoef returns a 0-d value instead of a 1x1 matrix.
    corr_matrix = np.atleast_2d(np.corrcoef(y_train, rowvar=False))

    # Step 1: generate correlated normal samples. The mean vector is sized
    # from the data; it was previously hard-coded to three columns.
    sampler = np.random if rng is None else rng
    normal_samples = sampler.multivariate_normal(
        np.zeros(len(columns)), corr_matrix, size=n_samples
    )
    # Step 2: transform normal samples to uniform using the normal CDF.
    uniform_samples = stats.norm.cdf(normal_samples)
    # Step 3: map each uniform column back to the data space.
    joint_samples = {
        col: quantile_funcs[col](uniform_samples[:, idx])
        for idx, col in enumerate(columns)
    }
    return pd.DataFrame(joint_samples)

def run_fit_evaluation(real_data, generated_data):
    """Run the SDV diagnostic and quality reports and plot every column.

    Args:
        real_data: DataFrame of observed values.
        generated_data: DataFrame of synthetic values to compare against
            ``real_data``; the SDV metadata is detected from this frame.

    Returns:
        None. The reports are printed by SDV and the figures are shown.
    """
    meta = SingleTableMetadata()
    meta.detect_from_dataframe(generated_data)

    # 1. perform basic validity checks
    run_diagnostic(real_data, generated_data, meta)

    # 2. measure the statistical similarity
    evaluate_quality(real_data, generated_data, meta)

    # 3. plot the data. Iterate the columns of the data passed in, rather
    # than the notebook-level `y_test`, so the function works for any frame
    # and does not depend on hidden global state.
    for col in real_data.columns:
        fig = get_column_plot(
            real_data=real_data,
            synthetic_data=generated_data,
            metadata=meta,
            column_name=col
        )
        fig.show()
        # Short pause between figures, kept from the original implementation.
        time.sleep(0.5)

def predict_regressors(regressors, X):
    """Predict every target from ``X`` with its dedicated regressor.

    Args:
        regressors: mapping from target column name to a fitted model
            exposing ``predict``.
        X: features handed to every model.

    Returns:
        DataFrame with one prediction column per key of ``regressors``.
    """
    predictions = {name: regressors[name].predict(X) for name in regressors}
    return pd.DataFrame(predictions)
        

#fit regressors, get residuals
#regressors = fit_regressors(X_train, y_train)

#regressor_diagnostics(regressors, X_train, y_train)
#regressor_diagnostics(regressors, X_test, y_test)

#sample_res = generate_joint_residual_samples(regressors, X_train, y_train, n_samples=len(y_test))

#print(sample_res)

#pred_data = predict_regressors(regressors, X_test) + sample_res

# Draw synthetic outcomes from the joint distribution of the training targets
# (default n_samples=10_000). The pitch features in X_train / X_test are not
# used by this step.
y_pred = generate_joint_samples(y_train)

# Compare the synthetic sample first with the data it was fitted on, then
# with the held-out targets.
run_fit_evaluation(y_train, y_pred)
run_fit_evaluation(y_test, y_pred)


2024-05-19 18:39:21,989 - sdv.metadata.single_table - INFO - Detected metadata:
2024-05-19 18:39:21,989 - sdv.metadata.single_table - INFO - {
    "columns": {
        "launch_speed": {
            "sdtype": "numerical"
        },
        "launch_angle": {
            "sdtype": "numerical"
        },
        "spray_angle": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}


Generating report ...

(1/2) Evaluating Data Validity: |████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1103.96it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |████████████████████████████████████████████████████| 1/1 [00:00<00:00, 815.06it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |█████████████████████████████████████████████████████| 3/3 [00:00<00:00, 364.10it/s]|
Column Shapes Score: 97.76%

(2/2) Evaluating Column Pair Trends: |████████████████████████████████████████████████| 3/3 [00:00<00:00, 155.09it/s]|
Column Pair Trends Score: 99.4%

Overall Score (Average): 98.58%



2024-05-19 18:39:23,894 - sdv.metadata.single_table - INFO - Detected metadata:
2024-05-19 18:39:23,895 - sdv.metadata.single_table - INFO - {
    "columns": {
        "launch_speed": {
            "sdtype": "numerical"
        },
        "launch_angle": {
            "sdtype": "numerical"
        },
        "spray_angle": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}


Generating report ...

(1/2) Evaluating Data Validity: |█████████████████████████████████████████████████████| 3/3 [00:00<00:00, 728.52it/s]|
Data Validity Score: 95.84%

(2/2) Evaluating Data Structure: |████████████████████████████████████████████████████| 1/1 [00:00<00:00, 490.10it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 97.92%

Generating report ...

(1/2) Evaluating Column Shapes: |█████████████████████████████████████████████████████| 3/3 [00:00<00:00, 313.62it/s]|
Column Shapes Score: 82.19%

(2/2) Evaluating Column Pair Trends: |████████████████████████████████████████████████| 3/3 [00:00<00:00, 147.24it/s]|
Column Pair Trends Score: 92.91%

Overall Score (Average): 87.55%



In [23]:
# Summary statistics of the held-out targets, the training targets and the
# synthetic sample, in that order.
# NOTE(review): this notebook's saved execution counts are not sequential, so
# the stored outputs may come from different runs — re-run top to bottom
# before relying on them.
print(y_test.describe(), '\n\n',
      y_train.describe(), '\n\n',
      y_pred.describe())

       launch_speed  launch_angle  spray_angle
count     15.000000     15.000000    15.000000
mean      91.440000     21.066667    98.400000
std       10.905949     27.824621    22.516026
min       72.700000    -41.000000    48.000000
25%       86.750000     13.000000    86.000000
50%       93.700000     24.000000   104.000000
75%       99.750000     37.000000   117.000000
max      107.600000     67.000000   128.000000 

        launch_speed  launch_angle  spray_angle
count    135.000000    135.000000   135.000000
mean      89.818519     24.600000    99.437037
std       12.395866     24.043245    27.622349
min       51.600000    -45.000000    27.000000
25%       83.750000      9.000000    79.000000
50%       91.100000     24.000000   106.000000
75%       99.850000     39.000000   118.000000
max      109.300000     85.000000   232.000000 

        launch_speed  launch_angle   spray_angle
count  10000.000000  10000.000000  10000.000000
mean      89.632251     24.498754     98.935668
std 

In [18]:
# Compare the synthetic sample with the training targets it was fitted on.
run_fit_evaluation(y_train, y_pred)


2024-05-19 17:54:09,623 - sdv.metadata.single_table - INFO - Detected metadata:
2024-05-19 17:54:09,626 - sdv.metadata.single_table - INFO - {
    "columns": {
        "launch_speed": {
            "sdtype": "numerical"
        },
        "launch_angle": {
            "sdtype": "numerical"
        },
        "spray_angle": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}


Generating report ...

(1/2) Evaluating Data Validity: |█████████████████████████████████████████████████████| 3/3 [00:00<00:00, 496.39it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |████████████████████████████████████████████████████| 1/1 [00:00<00:00, 469.42it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |█████████████████████████████████████████████████████| 3/3 [00:00<00:00, 249.76it/s]|
Column Shapes Score: 97.65%

(2/2) Evaluating Column Pair Trends: |████████████████████████████████████████████████| 3/3 [00:00<00:00, 155.61it/s]|
Column Pair Trends Score: 99.44%

Overall Score (Average): 98.55%



In [10]:
# Correlation matrix of the training targets (printed), followed by that of
# the synthetic sample (displayed as the cell's last expression).
print(np.corrcoef(y_train, rowvar=False))
np.corrcoef(y_pred, rowvar=False)

[[ 1.          0.32003678 -0.10351609]
 [ 0.32003678  1.          0.42257853]
 [-0.10351609  0.42257853  1.        ]]


array([[ 1.        ,  0.31553617, -0.08704386],
       [ 0.31553617,  1.        ,  0.4141389 ],
       [-0.08704386,  0.4141389 ,  1.        ]])

In [12]:
# Summary statistics of the synthetic sample.
y_pred.describe()

Unnamed: 0,launch_speed,launch_angle,spray_angle
count,10000.0,10000.0,10000.0
mean,92.994955,8.362973,89.079298
std,14.65406,30.289969,22.303942
min,21.0,-81.0,48.0
25%,84.6384,-10.095,69.978
50%,96.6653,8.976,86.907
75%,104.1466,27.884,105.915
max,115.6053,81.837,146.901


In [14]:
# Summary statistics of the training targets, for comparison with the synthetic sample.
y_train.describe()

Unnamed: 0,launch_speed,launch_angle,spray_angle
count,429.0,429.0,429.0
mean,92.833566,7.606061,88.948718
std,14.811679,30.041576,22.255256
min,21.0,-81.0,48.0
25%,84.6,-11.0,70.0
50%,96.6,8.0,87.0
75%,104.1,27.0,106.0
max,115.7,82.0,147.0
