In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Load datasets: each CSV path is declared right next to the read that uses it.
# NOTE(review): every path is an absolute location on one user's machine.
file_path_hitters = '/Users/stephenak24/Downloads/2023_hitters.csv'
df_hitters = pd.read_csv(file_path_hitters)

file_path_owar = '/Users/stephenak24/Downloads/baseball_2023_war.csv'
df_owar = pd.read_csv(file_path_owar)

file_path_pulled_fb = '/Users/stephenak24/Downloads/pulled_flyball_2023.csv'
df_pulled_fb = pd.read_csv(file_path_pulled_fb)

file_path_straightaway_fb = '/Users/stephenak24/Downloads/straightaway_flyball_2023.csv'
df_straightaway_fb = pd.read_csv(file_path_straightaway_fb)

file_path_oppo_fb = '/Users/stephenak24/Downloads/oppo_flyball_2023.csv'
df_oppo_fb = pd.read_csv(file_path_oppo_fb)

file_path_merged_success = '/Users/stephenak24/Downloads/merged_success_v3.csv'
df_merged_success = pd.read_csv(file_path_merged_success)

# Create a temporary name column without suffixes for merging
def remove_suffix(name):
    """Return *name* without a trailing generational suffix.

    Recognized suffixes are ``' Jr.'``, ``' III'`` and ``' II'`` (each with
    its leading space). Only the suffix at the very end of the string is
    removed; the same text appearing earlier in the name is left alone.

    Args:
        name: The player name. Non-string values (for example the NaN that
            pandas uses for a missing cell) are returned unchanged.

    Returns:
        The name with at most one trailing suffix stripped, or the input
        itself when no suffix matches or the input is not a string.
    """
    if not isinstance(name, str):
        return name

    # ' III' is listed before ' II' so the longer suffix is tried first.
    for suffix in (' Jr.', ' III', ' II'):
        if name.endswith(suffix):
            # Slice rather than str.replace: replace would also delete any
            # earlier occurrence of the suffix text inside the name.
            return name[:-len(suffix)]
    return name

# Give both tables a suffix-free copy of the player name to join on.
for frame in (df_hitters, df_merged_success):
    frame['TempName'] = frame['Name'].apply(remove_suffix)

# The join keys keep their names; every other column is tagged with its
# source table so nothing collides in the merged frame.
join_keys = ['player_id', 'TempName']
df_hitters = df_hitters.rename(
    columns={x: f"{x}_hitters" for x in df_hitters.columns if x not in join_keys}
)
df_merged_success = df_merged_success.rename(
    columns={x: f"{x}_merged" for x in df_merged_success.columns if x not in join_keys}
)

# Inner join: only players present in both tables (same id and same name) survive.
merged_final_data = df_hitters.merge(df_merged_success, how='inner', on=join_keys)

print(f"Player count after first merge: {len(merged_final_data)}")

# The helper name column has done its job once the join is complete.
merged_final_data.drop(columns=['TempName'], inplace=True)

# Attach one extra column per source table, keyed on player_id.
# Left joins keep every player already in merged_final_data; a player absent
# from a source table gets NaN in that table's column.
oppo_columns = df_oppo_fb[['player_id', 'oppo_fb%']]
merged_final_data = merged_final_data.merge(oppo_columns, how='left', on='player_id')
print(f"Player count after merging oppo_fb: {len(merged_final_data)}")

pulled_columns = df_pulled_fb[['player_id', 'pulled_fb%']]
merged_final_data = merged_final_data.merge(pulled_columns, how='left', on='player_id')
print(f"Player count after merging pulled_fb: {len(merged_final_data)}")

straightaway_columns = df_straightaway_fb[['player_id', 'straightaway_fb%']]
merged_final_data = merged_final_data.merge(straightaway_columns, how='left', on='player_id')
print(f"Player count after merging straightaway_fb: {len(merged_final_data)}")

owar_columns = df_owar[['player_id', 'oWAR']]
merged_final_data = merged_final_data.merge(owar_columns, how='left', on='player_id')
print(f"Player count after merging oWAR: {len(merged_final_data)}")

# Hand-entered oWAR values, keyed by player_id.
manual_owar = {
    667670: 2.6,  # Brent Rooker
    669004: 1.1,  # MJ Melendez
    650333: 4.7,  # Luis Arraez
    670770: 3.7   # TJ Friedl
}

# Write each manual value onto every row of that player; this overwrites
# whatever oWAR the merge produced for them.
for player_id in manual_owar:
    owar_value = manual_owar[player_id]
    is_this_player = merged_final_data['player_id'] == player_id
    merged_final_data.loc[is_this_player, 'oWAR'] = owar_value

# Players that still have no oWAR cannot be used as regression targets.
merged_final_data.dropna(subset=['oWAR'], inplace=True)
print(f"Player count after dropping rows with missing oWAR: {len(merged_final_data)}")

# Define metrics for BONDS Index
metrics = [
    'Brls/BBE%_merged',
    '50th_max_velo_merged',
    'O-Swing%_merged',
    'SwStr%_merged',
    'Z-Swing%_merged',
    'Z-Contact%_merged',
    'pulled_fb%',
    'straightaway_fb%',
    'oppo_fb%'
]

# Report absent columns with a message instead of failing later with a KeyError.
missing_metrics = [metric for metric in metrics if metric not in merged_final_data.columns]
if missing_metrics:
    print(f"Missing metrics: {missing_metrics}")
else:
    # The fly-ball columns are attached with left merges, so a player can have
    # NaN in them. Ridge rejects NaN input, so the model is fitted only on
    # players with every metric present; the others keep a NaN index below.
    complete_rows = merged_final_data[metrics].notna().all(axis=1)
    model_data = merged_final_data.loc[complete_rows]

    # Standardize the metrics (zero mean, unit variance per column)
    scaler = StandardScaler()
    X = scaler.fit_transform(model_data[metrics])

    # Define the target variable
    y = model_data['oWAR']

    # Define the Ridge regression model
    ridge = Ridge(alpha=5)
    ridge.fit(X, y)
    coefficients = ridge.coef_

    # Rescale the coefficients so their absolute values sum to 1; signs are
    # kept, so a metric with a negative coefficient lowers the index.
    # If every coefficient is zero there is nothing to rescale.
    abs_total = np.sum(np.abs(coefficients))
    weights = (coefficients / abs_total) if abs_total else coefficients

    constrained_weights = dict(zip(metrics, weights))

    # Calculate the BONDS Index for each player.
    # The weights were learned on the standardized features, so they must be
    # applied to those same standardized values; applying them to the raw
    # columns would let the metric with the largest raw scale dominate.
    # Index alignment leaves NaN for players that were excluded from the fit.
    merged_final_data['BONDS_Index'] = pd.Series(X @ weights, index=model_data.index)

    # Apply the adjusted scaling for the BONDS Index: the mean maps to 50 and
    # each player's offset from the mean is expressed as a share of the full
    # range, times 50.
    range_bonds_index = merged_final_data['BONDS_Index'].max() - merged_final_data['BONDS_Index'].min()
    mean_bonds_index = merged_final_data['BONDS_Index'].mean()
    centered_bonds_index = merged_final_data['BONDS_Index'] - mean_bonds_index
    # A zero range means every player has the same index; skip the division
    # (all offsets are already 0) so everyone lands on 50 instead of NaN.
    relative_bonds_index = (
        centered_bonds_index / range_bonds_index if range_bonds_index else centered_bonds_index
    )
    merged_final_data['Adjusted Scaling'] = 50 + relative_bonds_index * 50
