In [2]:
import pandas as pd
from models import Player, TradPlayerStats, AdvPlayerStats, Game
from sqlalchemy import and_
from data_manager import DataManager
from sqlalchemy.exc import SQLAlchemyError
from scipy.stats import poisson

# Initialize DataManager
dm = DataManager()
engine = dm.get_engine()
session = dm.get_session()

def get_and_save_player_data(player_id, save=False):
    try:
        data = session.query(
            Player,
            TradPlayerStats,
            AdvPlayerStats,
            Game
        ).join(Game, TradPlayerStats.game_id == Game.id)\
        .join(Player, TradPlayerStats.player_id == Player.id)\
        .join(AdvPlayerStats, and_(TradPlayerStats.game_id == AdvPlayerStats.game_id, 
                                    TradPlayerStats.player_id == AdvPlayerStats.player_id))\
        .filter(TradPlayerStats.player_id == player_id).all()

        # Convert the query result to a DataFrame
        data_list = []
        for player, trad_stats, adv_stats, game in data:
            row = {
                'player_name': player.name,
                'player_position': player.position,
                'minutes': trad_stats.minutes,
                'points': trad_stats.pts,
                'rebounds': trad_stats.reb,
                'assists': trad_stats.ast,
                'efg': adv_stats.efg_pct,
                'fg3a': trad_stats.fg3a,
                'fg3m': trad_stats.fg3m,
                'fg3_pct': trad_stats.fg3_pct,
                'fga': trad_stats.fga,
                'fgm': trad_stats.fgm,
                'fta': trad_stats.fta,
                'ft_pct': trad_stats.ft_pct,
                'steals': trad_stats.stl,
                'blocks': trad_stats.blk,
                'date': game.date,
            }
            data_list.append(row)

        data_df = pd.DataFrame(data_list)
        
        # Ensure the 'date' column is in datetime format
        # data_df['date'] = pd.to_datetime(data_df['date'])

        if save:
            data_df.to_csv(f"data_pile/{player_id}.csv", index=False)
        
        return data_df

    except SQLAlchemyError as e:
        session.rollback()
        print(f"An error occurred: {e}")
    finally:
        session.close()

def estimate_probability_poisson(data, stat, n):
    mean = data[stat].mean()
    probability = 1 - poisson.cdf(n, mean)
    return probability

def split_dataframe(df, fraction=0.8):
    """
    Splits the DataFrame into two parts: top fraction and bottom fraction.

    Parameters:
    df (pd.DataFrame): The DataFrame to be split.
    fraction (float): The fraction of the DataFrame to be included in the top part. Default is 0.8.

    Returns:
    tuple: A tuple containing two DataFrames: (top_df, bottom_df).
    """
    # Calculate the split index
    split_index = int(len(df) * fraction)

    # Split the DataFrame
    top_df = df[:split_index]
    bottom_df = df[split_index:]

    return top_df, bottom_df

def get_percentage_of_values_higher(series, number):
    total_values = len(series)
    values_greater_than = (series > number).sum()
    percentage_greater_than = (values_greater_than / total_values) * 100
    return percentage_greater_than

def test_poisson_prediction(player_name, stat, split_fraction, threshold, size_of_segments):
    try:
        session = dm.get_session()
        player = session.query(Player).filter(Player.name == player_name).first()
        if player:
            player_id = player.id
            df = get_and_save_player_data(player_id)
        else:
            print(f"No player found with the name {player_name}")

    except SQLAlchemyError as e:
        session.rollback()
        print(f"An error occurred: {e}")
    finally:
        session.close()

    # Ensure the 'date' column is in datetime format
    # df['date'] = pd.to_datetime(df['date'])

    # Sort the DataFrame by 'date'
    df = df.sort_values(by='date', ascending=False)
    # Split the DataFrame into train and test sets
    train, test = split_dataframe(df, split_fraction)

    # Use the top and bottom parts of the train and test sets
    train = train.head(size_of_segments)
    test = test.tail(size_of_segments)

    # Estimate probability using Poisson distribution
    probability = estimate_probability_poisson(train, stat, threshold)

    # Calculate the percentage of values higher than the threshold in the test set
    percent_correct = get_percentage_of_values_higher(test[stat], threshold)

    # Print the train and test sets for verification
    # print("Train Set:")
    # print(train)
    # print("\nTest Set:")
    # print(test)

    return probability, percent_correct



In [3]:
# Example usage
players = dm.query_players()
player_names = [player.name for player in players]
stat = "points"
split_fraction = 0.5
threshold = 19
size_of_segments = 25
results = {}
for player_name in player_names:
    probability, percent_correct = test_poisson_prediction(player_name, stat, split_fraction, threshold, size_of_segments)
    results[player_name] = [probability, percent_correct]
    print(player_name)
    print(f"Estimated Probability: {probability:.4f}")
    print(f"Percentage Correct: {percent_correct:.2f}%")


Jrue Holiday
Estimated Probability: 0.0059
Percentage Correct: 4.00%
Wesley Matthews
Estimated Probability: 0.0000
Percentage Correct: 0.00%
Saddiq Bey
Estimated Probability: 0.1057
Percentage Correct: 4.00%
Derrick White
Estimated Probability: 0.2197
Percentage Correct: 24.00%
Jalen Johnson
Estimated Probability: 0.2413
Percentage Correct: 20.00%
Oshae Brissett
Estimated Probability: 0.0000
Percentage Correct: 0.00%
Seth Lundy
Estimated Probability: 0.0000
Percentage Correct: 0.00%
Kobe Bufkin
Estimated Probability: 0.0000
Percentage Correct: 0.00%
Xavier Tillman
Estimated Probability: 0.0000
Percentage Correct: 0.00%
De'Andre Hunter
Estimated Probability: 0.2078
Percentage Correct: 24.00%
Keldon Johnson
Estimated Probability: 0.0733
Percentage Correct: 52.00%
Ty Jerome
Estimated Probability: 0.0000
Percentage Correct: 0.00%
Caris LeVert
Estimated Probability: 0.0427
Percentage Correct: 32.00%
Jayson Tatum
Estimated Probability: 0.8979
Percentage Correct: 92.00%
Georges Niang
Estimate

KeyError: 'date'

In [None]:
player_name = "LeBron James"
prob, percent_correct = test_poisson_prediction(player_name, 0.5, 25)
print(prob, percent_correct)

In [None]:
df = df.sort_values(by="date", ascending = False)

In [None]:
train, test = split_dataframe(df, 0.5)

In [None]:
train = train.head(25)
probability = estimate_probability_poisson(train, "points", 25)
print(probability)



In [None]:


print(get_percentage_of_values_higher(test['points'].tail(25), 25))
print(len(test))
