In [None]:
import pandas as pd
from scipy.stats import poisson

In [4]:
import pandas as pd
from models import Player, TradPlayerStats, AdvPlayerStats, Game
from sqlalchemy import and_
from data_manager import DataManager
from sqlalchemy.exc import SQLAlchemyError
from scipy.stats import poisson

# Initialize DataManager and the shared database handles used by the cells below.
# NOTE(review): DataManager is project-local; its connection settings are not visible here.
dm = DataManager()
# `engine` is not referenced by any other code visible in this notebook section.
engine = dm.get_engine()
# Module-level session that get_and_save_player_data queries through, rolls back
# on SQLAlchemyError, and closes in its `finally` clause.
session = dm.get_session()

def get_and_save_player_data(player_id, save=False):
    """
    Load one player's per-game stats into a DataFrame.

    Joins Player, TradPlayerStats, AdvPlayerStats and Game on the module-level
    ``session`` and keeps only rows whose TradPlayerStats.player_id equals
    ``player_id``.

    Parameters:
    player_id: Value compared against TradPlayerStats.player_id; also used as
        the CSV file name when ``save`` is true.
    save (bool): When true, write the frame to ``data_pile/<player_id>.csv``
        without the index. Default is False.

    Returns:
    pd.DataFrame or None: One row per joined game with a datetime 'date'
    column. None when the query returns no rows, or when a SQLAlchemyError is
    raised (the session is rolled back and the error is printed).

    The module-level session is closed on every exit path.
    """
    try:
        stats_query = (
            session.query(Player, TradPlayerStats, AdvPlayerStats, Game)
            .join(Game, TradPlayerStats.game_id == Game.id)
            .join(Player, TradPlayerStats.player_id == Player.id)
            .join(
                AdvPlayerStats,
                and_(
                    TradPlayerStats.game_id == AdvPlayerStats.game_id,
                    TradPlayerStats.player_id == AdvPlayerStats.player_id,
                ),
            )
            .filter(TradPlayerStats.player_id == player_id)
        )
        joined_rows = stats_query.all()

        # Nothing matched this player: signal "no data" to the caller
        if not joined_rows:
            return None

        # Debug: show the first joined row to confirm the tuple structure
        print("Sample data:", joined_rows[0])

        # Flatten each (player, traditional, advanced, game) tuple into a record
        records = [
            {
                'player_name': player.name,
                'player_position': player.position,
                'minutes': trad.minutes,
                'points': trad.pts,
                'rebounds': trad.reb,
                'assists': trad.ast,
                'efg': adv.efg_pct,
                'fg3a': trad.fg3a,
                'fg3m': trad.fg3m,
                'fg3_pct': trad.fg3_pct,
                'fga': trad.fga,
                'fgm': trad.fgm,
                'fta': trad.fta,
                'ft_pct': trad.ft_pct,
                'steals': trad.stl,
                'blocks': trad.blk,
                'date': game.date,
            }
            for player, trad, adv, game in joined_rows
        ]
        player_df = pd.DataFrame(records)

        # Normalise 'date' to datetime so callers can sort chronologically
        player_df['date'] = pd.to_datetime(player_df['date'])

        if save:
            player_df.to_csv(f"data_pile/{player_id}.csv", index=False)

        return player_df

    except SQLAlchemyError as e:
        session.rollback()
        print(f"An error occurred: {e}")
    finally:
        session.close()

def estimate_probability_poisson(data, stat, n):
    """
    Estimate P(X > n) for a Poisson variable whose rate is the sample mean.

    Parameters:
    data (pd.DataFrame): Frame containing the column ``stat``.
    stat (str): Name of the column whose mean is used as the Poisson rate.
    n (int or float): Threshold; the result is the probability of strictly
        exceeding it.

    Returns:
    float: Tail probability P(X > n). NaN when the mean of ``data[stat]`` is
    NaN (for example, when the column is empty).
    """
    mean = data[stat].mean()
    # The survival function equals 1 - cdf(n) but is evaluated directly, so tiny
    # tail probabilities are not rounded to exactly 0.0 by the subtraction.
    return poisson.sf(n, mean)

def split_dataframe(df, fraction=0.8):
    """
    Cut a DataFrame into a leading part and a trailing part.

    Parameters:
    df (pd.DataFrame): Frame to cut; row order is kept as-is.
    fraction (float): Share of the rows that goes into the leading part.
        Default is 0.8. The cut position is ``int(len(df) * fraction)``, so it
        truncates toward zero.

    Returns:
    tuple: ``(top_df, bottom_df)`` — the rows before the cut and the rows from
    the cut onward.
    """
    cut = int(len(df) * fraction)
    return df[:cut], df[cut:]

def get_percentage_of_values_higher(series, number):
    """
    Return the percentage (0-100) of entries in ``series`` strictly greater
    than ``number``.

    Parameters:
    series (pd.Series): Values to compare.
    number: Threshold; entries equal to it are not counted.

    Returns:
    float: ``100 * (count of entries > number) / len(series)``.
    """
    exceeds = series > number
    share = exceeds.sum() / len(series)
    return share * 100

def test_poisson_prediction(player_name, stat, split_fraction, threshold, size_of_segments):
    """
    Compare a Poisson tail estimate with the observed exceedance rate for one player.

    The player's games are loaded with get_and_save_player_data, sorted by
    'date' in descending order, split with split_dataframe, and trimmed to at
    most ``size_of_segments`` rows on each side. The Poisson rate is the mean of
    ``stat`` on the train segment; the observed rate is measured on the test
    segment.

    Parameters:
    player_name (str): Value matched against Player.name.
    stat (str): Column of the player frame to evaluate (e.g. "points").
    split_fraction (float): Fraction of rows assigned to the train side.
    threshold (int or float): Value the stat must strictly exceed.
    size_of_segments (int): Maximum number of rows kept from each side.

    Returns:
    tuple: ``(probability, percent_correct, sample_size)`` where
    ``probability`` is the Poisson estimate of P(stat > threshold),
    ``percent_correct`` is the percentage (0-100) of test rows above the
    threshold, and ``sample_size`` is the number of test rows.
    ``(None, None, None)`` when the player is not found, no data is available,
    a SQLAlchemyError occurs, the 'date' column is missing, or either segment
    is empty.
    """
    no_result = (None, None, None)

    # Bind the name first so the except/finally clauses never hit an unbound
    # local if dm.get_session() itself raises.
    session = None
    try:
        session = dm.get_session()
        player = session.query(Player).filter(Player.name == player_name).first()
        if not player:
            print(f"No player found with the name {player_name}")
            return no_result

        df = get_and_save_player_data(player.id)
        if df is None:
            return no_result

    except SQLAlchemyError as e:
        if session is not None:
            session.rollback()
        print(f"An error occurred: {e}")
        return no_result
    finally:
        if session is not None:
            session.close()

    # Check for the column before touching it; otherwise a missing column
    # raises KeyError and this branch can never run.
    if 'date' not in df.columns:
        print("Date column is missing.")
        return no_result

    # Ensure 'date' is datetime (unparseable values become NaT), then sort
    # newest-first.
    df = df.assign(date=pd.to_datetime(df['date'], errors='coerce'))
    df = df.sort_values(by='date', ascending=False)

    # Split the DataFrame into train and test sets
    # NOTE(review): with the descending sort, `train` holds the most recent
    # games and `test` the oldest ones — confirm this direction is intended.
    train, test = split_dataframe(df, split_fraction)

    # Use the top of the train set and the bottom of the test set
    train = train.head(size_of_segments)
    test = test.tail(size_of_segments)

    # An empty segment would give a NaN rate or a 0/0 percentage; report it as
    # missing data so the caller's None check skips the player.
    if train.empty or test.empty:
        return no_result

    # Estimate probability using Poisson distribution
    probability = estimate_probability_poisson(train, stat, threshold)

    # Calculate the percentage of values higher than the threshold in the test set
    percent_correct = get_percentage_of_values_higher(test[stat], threshold)
    sample_size = len(test)
    return probability, percent_correct, sample_size

# Example usage: run the Poisson check for every player returned by the DataManager
players = dm.query_players()
player_names = [p.name for p in players]

# Parameters shared by every player's run
stat = "points"
split_fraction = 0.5
threshold = 19
size_of_segments = 25

# One (player_name, probability, percent_correct, sample_size) tuple per evaluated player
results = []

for player_name in player_names:
    print(player_name)
    probability, percent_correct, sample_size = test_poisson_prediction(
        player_name, stat, split_fraction, threshold, size_of_segments
    )

    # A None in either value means the player could not be evaluated
    if probability is None or percent_correct is None:
        print(f"Skipping {player_name} due to missing data.")
        continue

    results.append((player_name, probability, percent_correct, sample_size))

    print(f"Estimated Probability: {probability:.4f}")
    print(f"Percentage Correct: {percent_correct:.2f}%")


Jrue Holiday
Sample data: (<models.Player object at 0x000001F6CF2A0A10>, <models.TradPlayerStats object at 0x000001F6CF2A09B0>, <models.AdvPlayerStats object at 0x000001F6CF2A0F20>, <models.Game object at 0x000001F6CF2A1DF0>)
Estimated Probability: 0.1134
Percentage Correct: 33.33%
Wesley Matthews
Sample data: (<models.Player object at 0x000001F6CDABA900>, <models.TradPlayerStats object at 0x000001F68DFB01A0>, <models.AdvPlayerStats object at 0x000001F68DFB0740>, <models.Game object at 0x000001F6CF41F560>)
Estimated Probability: 0.0994
Percentage Correct: 33.33%
Saddiq Bey
Sample data: (<models.Player object at 0x000001F6CF41D250>, <models.TradPlayerStats object at 0x000001F6CF499940>, <models.AdvPlayerStats object at 0x000001F6CF498620>, <models.Game object at 0x000001F6CF652150>)
Estimated Probability: 0.3018
Percentage Correct: 0.00%
Derrick White
Sample data: (<models.Player object at 0x000001F6CF57BDD0>, <models.TradPlayerStats object at 0x000001F6CF57AD20>, <models.AdvPlayerStats

In [9]:
# Build the summary table. The plain constructor accepts both the list of
# tuples produced by the loop and an already-built DataFrame, so re-running
# this cell does not call from_records on a DataFrame.
results = pd.DataFrame(results)
print(results)
# Columns: 0 = player name, 1 = Poisson probability (0-1),
# 2 = observed rate in percent (0-100), 3 = test sample size.
# Put the probability on the percent scale before differencing, so 'huh' is the
# observed-minus-predicted gap in percentage points.
results['huh'] = results[2] - results[1] * 100


                   0         1          2  3
0       Jrue Holiday  0.113367  33.333333  3
1    Wesley Matthews  0.099421  33.333333  3
2         Saddiq Bey  0.301769   0.000000  3
3      Derrick White  0.576236  33.333333  3
4      Jalen Johnson  0.409038  33.333333  3
..               ...       ...        ... ..
517         Tre Mann  0.997524  33.333333  3
518   Brandon Miller  0.318612  33.333333  3
519       Seth Curry  0.554320   0.000000  3
520  Dejounte Murray  0.893193   0.000000  3
521     Damian Jones  0.193207   0.000000  3

[522 rows x 4 columns]


  results = pd.DataFrame.from_records(results)


In [None]:
# Mean of the "points" column of `train_df`; the next cell uses it as the Poisson rate.
# NOTE(review): `train_df` is not defined in any visible cell — confirm where it
# comes from, otherwise this cell fails on a fresh kernel.
mean = train_df["points"].mean()
print(mean)


In [None]:
# Poisson tail probability P(X > n), with the rate set to the mean computed above.
lambda_ = mean
n = 29
# sf(n) equals 1 - cdf(n) but is evaluated directly, so a very small tail
# probability is not rounded to exactly 0.0 by the subtraction.
probability = poisson.sf(n, lambda_)
print(probability)

In [None]:
# Observed share of validation games that strictly exceed n.
# The comparison is strict because the Poisson estimate 1 - cdf(n) is P(X > n);
# counting games equal to n as successes would compare two different events.
# NOTE(review): `val_df` is not defined in any visible cell — confirm its origin.
success = val_df[val_df['points'] > n]
failure = val_df[val_df['points'] <= n]
print(len(success)/len(val_df))