In [714]:
import pandas as pd
import numpy as np
from sqlalchemy import select
from scipy.stats import poisson
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from data_manager import DataManager
from models import TradPlayerStats, Game

In [715]:
def poisson_prob_above_threshold(rolling_mean, threshold):
    cdf = poisson.cdf(threshold, rolling_mean)
    prob_above_threshold = 1 - cdf
    return prob_above_threshold

def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    conf_matrix = confusion_matrix(y_true, y_pred)
    return accuracy, precision, recall, f1, conf_matrix

def train_and_evaluate(df, model):
    X = df[['pts']]
    y = df['y']
    print(X)
    print(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred = np.where(y_pred > 0.5, 1, 0)  # Threshold for binary classification
    
    return evaluate_model(y_test, y_pred)



In [716]:
dm = DataManager()

In [717]:
engine = dm.get_engine()


In [718]:
window_size = 10
player_id = dm.get_player_id("Stephen Curry")
decision_threshold = 0.52

In [719]:
stmt = (
    select(TradPlayerStats, Game.date, Game.season, Game.season_type)
    .join(Game, TradPlayerStats.game_id == Game.id)
    .where(TradPlayerStats.player_id == player_id)
    .order_by(Game.date)
)

data = pd.read_sql(stmt, engine)
dfs = {season: df_season for season, df_season in data.groupby('season')}
results = []


In [720]:
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import numpy as np

def balance_dataframe_undersample(df, target_column):
    class_counts = df[target_column].value_counts()
    if len(class_counts) < 2:
        print(f"Skipping balancing for DataFrame due to only one class present")
        return df

    # Identify the majority and minority classes
    majority_class = class_counts.idxmax()
    minority_class = class_counts.idxmin()

    df_majority = df[df[target_column] == majority_class]
    df_minority = df[df[target_column] == minority_class]

    # If either class is empty, return the original DataFrame
    if len(df_majority) == 0 or len(df_minority) == 0:
        print(f"Skipping balancing for DataFrame due to no instances of one class: {len(df_majority)} majority, {len(df_minority)} minority")
        return df

    # Undersample the majority class to match the minority class size only if majority class is larger
    difference = len(df_majority) - len(df_minority)
    df_majority = df_majority.sample(frac=1)  # Shuffle
    print(len(df_majority))
    df_majority = df_majority.head(len(df_majority) - difference).reset_index(drop=True)
    print(len(df_majority))
    df_balanced = pd.concat([df_majority, df_minority], axis=0).reset_index(drop=True)
    print(df_balanced['positive_true'].value_counts())
    return df_balanced



# Balance each DataFrame in the dictionary and calculate metrics
for season, df in dfs.items():
    df = df[['pts', 'date']]
    threshold =  df.pts.median() + 2
    df['y'] = df['pts'].shift(-1)
    df['y'] = df['y']
    df = df.dropna()
    dfs[season] = df 
    lr_model = LinearRegression()
    accuracy, precision, recall, f1, conf_matrix = train_and_evaluate(df, lr_model)
    results.append({
    'season': season,
    'model': 'Linear Regression',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'confusion_matrix': conf_matrix.tolist()
    })


# Convert results to DataFrame for display
results_df = pd.DataFrame(results)

# Display the results




    pts
0    10
1    38
2    22
3    18
4     5
..  ...
72   13
73   31
74   24
75   30
76   47

[77 rows x 1 columns]
0     38.0
1     22.0
2     18.0
3      5.0
4     22.0
      ... 
72    31.0
73    24.0
74    30.0
75    47.0
76    32.0
Name: y, Length: 77, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['pts'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y']


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [None]:
display(results_df)

Unnamed: 0,season,accuracy,precision,recall,f1_score,confusion_matrix
0,2013-14,0.529412,0.473684,0.290323,0.36,"[[27, 10], [22, 9]]"
1,2014-15,0.657143,0.444444,0.173913,0.25,"[[42, 5], [19, 4]]"
2,2015-16,0.623188,0.615385,0.275862,0.380952,"[[35, 5], [21, 8]]"
3,2016-17,0.608696,0.2,0.041667,0.068966,"[[41, 4], [23, 1]]"
4,2017-18,0.585366,0.538462,0.388889,0.451613,"[[17, 6], [11, 7]]"
5,2018-19,0.59322,0.25,0.1,0.142857,"[[33, 6], [18, 2]]"
6,2019-20,,0.0,0.0,0.0,[]
7,2020-21,0.54717,0.461538,0.26087,0.333333,"[[23, 7], [17, 6]]"
8,2021-22,0.537037,0.470588,0.333333,0.390244,"[[21, 9], [16, 8]]"
9,2022-23,0.565217,0.333333,0.176471,0.230769,"[[23, 6], [14, 3]]"
