In [818]:
import pandas as pd
import numpy as np
from scipy import stats
from data_manager import DataManager

In [819]:
# Shared DataManager instance; get_features() reads team and player data through it.
dm = DataManager()

In [820]:
def get_features(player_name, target_stat):
    """Build the per-game feature table for one player.

    Joins the player's rolling statistics with the team rolling statistics
    returned by the data manager, matching rows on the 'date' column.

    Args:
        player_name: Name passed to the data manager to look up team stats
            and the player id.
        target_stat: Column of the player's game log forwarded to
            ``rolling_player_stats`` as the prediction target.

    Returns:
        The inner-merged DataFrame (team columns that collide with player
        columns get a ``_team`` suffix), or ``None`` when the data manager
        returns no team stats.
    """
    team_frames = dm.get_team_rolling_stats(player_name)
    if not team_frames:
        return None
    team_df = pd.concat(team_frames, axis=0).reset_index(drop=True)

    # The player log is reversed before rolling
    # (presumably to put it in chronological order -- TODO confirm).
    player_log = dm.get_and_save_player_data(dm.get_player_id(player_name))[::-1]
    player_roll = rolling_player_stats(player_log, target_stat=target_stat, window_size=5)

    # Both sides need a real datetime key for the merge to line up.
    for frame in (player_roll, team_df):
        frame['date'] = pd.to_datetime(frame['date'])

    return pd.merge(player_roll, team_df, on='date', how='inner', suffixes=[None, "_team"])

def get_features_corr_matrix(df, target_col):
    """Correlate every numeric feature with the (shifted) target column.

    The input frame is left untouched; all work happens on a copy, so the
    function can be called repeatedly on the same frame without the target
    being shifted more than once.

    Args:
        df: Feature table containing ``target_col``. Empty strings are
            treated as missing values.
        target_col: Name of the column to correlate against.

    Returns:
        A Series indexed by numeric column name holding each column's
        Pearson correlation with ``target_col``.
    """
    # Replace "" placeholders first so the float cast below cannot choke on them.
    # ``replace`` returns a new frame, so the caller's data is not modified.
    work = df.replace("", np.nan)

    # shift(1) moves the target down one row: each row is paired with the
    # target value of the row before it.
    # NOTE(review): confirm this direction is the intended alignment.
    work[target_col] = work[target_col].astype(float).shift(1)
    work = work.dropna()

    # Only numeric/bool columns can be correlated; columns such as 'date'
    # are excluded explicitly instead of relying on DataFrame.corr defaults.
    numeric = work.select_dtypes(include=[np.number, "bool"])
    return numeric.corr()[target_col]


def get_stat_corrs(player_name, stat):
    """Compute feature correlations against ``stat`` for one player.

    Args:
        player_name: Player whose feature table is built.
        stat: Statistic used both as the rolling target when building
            features and as the column correlated against.

    Returns:
        ``(player_name, correlations)`` where ``correlations`` is the result
        of ``get_features_corr_matrix``, or ``None`` when no features could
        be built (``get_features`` returned ``None`` or an empty frame).
    """
    # get_features requires the target statistic; omitting it raised TypeError.
    merged = get_features(player_name, target_stat=stat)
    if merged is None or merged.empty:
        return None
    corr = get_features_corr_matrix(merged, stat)
    return player_name, corr


def rolling_player_stats(player_stats, target_stat, average_method="median", window_size=5,
                         z_threshold=3):
    """Compute trailing rolling averages of a player's box-score stats.

    Rows where any rolled statistic is an outlier (absolute z-score at or
    above ``z_threshold``) are removed first. Each remaining row then gets
    the rolling median/mean of the ``window_size`` rows *before* it (the
    frame is shifted by one so a row never sees its own values).

    The input frame is not modified.

    Args:
        player_stats: Game log containing the rolled stat columns, a
            'date' column and ``target_stat``.
        target_stat: Column copied, unrolled, into the output as 'target'.
        average_method: ``"median"`` or ``"mean"``.
        window_size: Number of prior rows in each rolling window.
        z_threshold: Absolute z-score at or above which a row is treated
            as an outlier and dropped.

    Returns:
        DataFrame with the rolled stat columns plus 'date' and 'target',
        with incomplete rows (NaN or empty-string values) removed.

    Raises:
        ValueError: If ``average_method`` is not ``"median"`` or ``"mean"``.
    """
    if average_method not in ("median", "mean"):
        raise ValueError(
            f"average_method must be 'median' or 'mean', got {average_method!r}"
        )

    stats_to_roll_cols = ['minutes', 'points', 'rebounds',
        'assists', 'efg', 'fg3a', 'fg3m', 'fg3_pct', 'fga', 'fgm', 'fta',
        'ft_pct', 'steals', 'blocks']
    stats_to_roll = player_stats[stats_to_roll_cols].copy()

    # A zero-variance column (e.g. a stat that is 0 in every game) has an
    # undefined z-score (NaN). NaN compares False against the threshold, so
    # testing for "is an outlier" keeps such rows, whereas testing for
    # "is not an outlier" would discard the entire frame.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = np.abs(np.asarray(stats.zscore(stats_to_roll.to_numpy(dtype=float))))
        is_outlier = (z_scores >= z_threshold).any(axis=1)
    stats_to_roll = stats_to_roll[~is_outlier]

    # shift(1) keeps the current game out of its own window.
    windows = stats_to_roll.shift(1).rolling(window=window_size)
    rolling_averages = windows.median() if average_method == "median" else windows.mean()

    # Index-aligned assignment: only the rows that survived filtering pick
    # up their date and raw target value.
    rolling_averages['date'] = player_stats['date']
    rolling_averages['target'] = player_stats[target_stat]
    rolling_averages = rolling_averages.replace("", np.nan).dropna()
    return rolling_averages

In [821]:
# Build the merged player/team feature table for a single player and
# target statistic, then list the columns available for modelling.
player_name, target_stat = "Sam Hauser", "points"
features = get_features(target_stat=target_stat, player_name=player_name)
print(features.columns)

Index(['minutes', 'points', 'rebounds', 'assists', 'efg', 'fg3a', 'fg3m',
       'fg3_pct', 'fga', 'fgm', 'fta', 'ft_pct', 'steals', 'blocks', 'date',
       'target', 'efg_team', 'fg3a_team', 'fg3_pct_team', 'fga_team',
       'fta_team', 'ft_pct_team', 'steals_team', 'blocks_team', 'to', 'pace',
       'e_def_rating', 'e_off_rating', 'opp_efg', 'opp_fg3a', 'opp_fg3_pct',
       'opp_fga', 'opp_fta', 'opp_ft_pct', 'opp_steals', 'opp_blocks',
       'opp_to', 'opp_pace', 'opp_e_def_rating', 'opp_e_off_rating'],
      dtype='object')


In [822]:
# Keep only the modelling columns (predictors plus 'target').
data = features.loc[:, ['minutes', 'points', 'opp_pace', 'opp_e_def_rating',
                        'opp_e_off_rating', 'pace', 'target']]

# Ordered (unshuffled) split: the first 80% of rows train, the rest test.
train_size = 0.8
split_index = int(train_size * len(data))
train_df, test_df = data.iloc[:split_index], data.iloc[split_index:]
# (A z-score outlier filter on train_df, threshold 3, was tried here and is disabled.)




In [823]:
# Separate predictors from the target for each split. The *_binary series
# flag rows whose target is above that split's own median; they carry a
# fresh 0..n-1 index rather than the index of the source frame.
y_train, y_test = train_df['target'], test_df['target']
X_train = train_df.drop(columns=['target'])
X_test = test_df.drop(columns=['target'])

train_median, test_median = y_train.median(), y_test.median()
y_train_binary = pd.Series(np.where(y_train > train_median, 1, 0))
y_test_binary = pd.Series(np.where(y_test > test_median, 1, 0))


In [824]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
# rf_model = RandomForestRegressor(n_estimators=4000, random_state=42)
# rf_model.fit(X_train, y_train)
from scipy.special import expit 
import xgboost as xgb

# Gradient-boosted regressor on the raw target: 100 depth-3 trees, each
# built from a 30% sample of the feature columns.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    colsample_bytree=0.3,
)

# Fit on the training split, then predict the held-out rows.
y_pred = xg_reg.fit(X_train, y_train).predict(X_test)


In [825]:
# Line predictions up against the actual targets and their above-median
# flags; y_test is re-indexed from 0 so all three columns share one index.
y_pred = pd.Series(y_pred)
results = pd.concat(
    [y_pred, y_test.reset_index(drop=True), y_test_binary],
    axis=1,
    keys=['y_pred', 'y_test', 'y_test_binary'],
)
print(y_test.median())

5.0


In [826]:
# Save the prediction table to the relative path 'test.csv' and print it for inspection.
results.to_csv('test.csv')
print(results)

       y_pred  y_test  y_test_binary
0    9.398875       8              1
1   11.126532      16              1
2    5.913223      12              1
3   12.782369       6              1
4    9.734514       5              0
5   11.599652       0              0
6    6.381793      17              1
7    7.716197       0              0
8    7.707021       2              0
9    9.410731       6              1
10  12.977635       2              0
11  11.417250       5              0
12  10.027883       0              0
13  10.027883       2              0
14  15.161420       0              0
