In [162]:
import pandas as pd
import numpy as np
from data_manager import DataManager

In [163]:
# Shared DataManager instance; get_features() below reads it as a module-level global.
dm = DataManager()

def get_features(player_name):
    """Build the per-game feature table for one player.

    The player's game log is inner-joined with the team rolling stats on the
    game date, and the result is restricted to a fixed list of player, team
    (``*_team``) and opponent (``opp_*``) columns.

    Args:
        player_name: Player name, forwarded to ``dm.get_team_rolling_stats``
            and ``dm.get_player_id``.

    Returns:
        An independent DataFrame (safe for the caller to modify) holding the
        columns of interest, or ``None`` when ``dm.get_team_rolling_stats``
        returns nothing.

    Raises:
        KeyError: if any column of interest is missing from the merged frame.
    """
    team_stats = dm.get_team_rolling_stats(player_name)
    if not team_stats:
        return None
    team_stats_df = pd.concat(team_stats, axis=0).reset_index(drop=True)
    player_id = dm.get_player_id(player_name)
    # Reverse the row order of the game log (presumably newest-first ->
    # oldest-first; TODO confirm against DataManager). The explicit copy makes
    # the date conversion below write to our own frame rather than to a slice
    # of whatever DataManager returned.
    player_stats = dm.get_and_save_player_data(player_id)[::-1].copy()
    # Normalise both join keys to datetime64 so the merge compares like with like.
    player_stats['date'] = pd.to_datetime(player_stats['date'])
    team_stats_df['date'] = pd.to_datetime(team_stats_df['date'])
    # Overlapping column names keep the player's value unsuffixed; the team's
    # version gets the "_team" suffix.
    merged = pd.merge(player_stats, team_stats_df, on='date', how='inner', suffixes=[None, "_team"])

    cols_of_interest = ['minutes', 'points', 'rebounds',
    'assists', 'efg', 'fg3a', 'fg3m', 'fg3_pct', 'fga', 'fgm', 'fta',
    'ft_pct', 'steals', 'blocks', 'date', 'efg_team',
    'fg3a_team', 'fg3_pct_team', 'fga_team', 'fta_team', 'ft_pct_team',
    'steals_team', 'blocks_team', 'to', 'pace', 'e_def_rating',
    'e_off_rating', 'opp_efg', 'opp_fg3a', 'opp_fg3_pct', 'opp_fga',
    'opp_fta', 'opp_ft_pct', 'opp_steals', 'opp_blocks', 'opp_to',
    'opp_pace', 'opp_e_def_rating', 'opp_e_off_rating']

    # Return a real copy: callers assign to columns of the result, which would
    # otherwise trigger SettingWithCopyWarning on a column-subset of `merged`.
    return merged[cols_of_interest].copy()

def get_features_corr_matrix(df, target_col):
    """Correlate every column with the one-row-shifted target column.

    Empty strings are treated as missing values, the target column is cast to
    float and shifted down by one row, incomplete rows are dropped, and the
    target's column of the resulting correlation matrix is returned.

    The input frame is left untouched; all work happens on a new frame.

    Args:
        df: Feature table containing ``target_col``.
        target_col: Name of the column to correlate the others against.

    Returns:
        A Series indexed by column name holding each column's correlation
        with the shifted target.
    """
    # Blank strings must become NaN *before* the float cast; casting first
    # raises ValueError on "". replace() without inplace returns a new frame,
    # so the caller's DataFrame is never modified.
    work = df.replace("", np.nan)
    # NOTE(review): shift(1) pairs each row's features with the target of the
    # previous row, while the modelling cell in this notebook uses shift(-1)
    # (next row). Direction kept as-is here; confirm which one is intended.
    work[target_col] = work[target_col].astype(float).shift(1)
    work = work.dropna()
    return work.corr()[target_col]


def get_stat_corrs(player_name, stat):
    """Return ``(player_name, correlations)`` for one player and one stat.

    The feature table comes from ``get_features`` and the correlations from
    ``get_features_corr_matrix``. ``None`` is returned when the feature table
    is missing or has no rows.
    """
    features = get_features(player_name)
    has_rows = features is not None and not features.empty
    if has_rows:
        return player_name, get_features_corr_matrix(features, stat)
    return None

In [164]:
# Player whose games are modelled in the cells below.
player = "Sam Hauser"
# get_features returns None when no team rolling stats are found, so `data` can be None here.
data = get_features(player)

In [165]:
# Quick look at the assembled feature table (pandas truncates the printed rows and columns).
print(data)

    minutes  points  rebounds  assists    efg  fg3a  fg3m  fg3_pct  fga  fgm  \
0      21.0      12         4        2  1.000     6     4    0.667    6    4   
1      35.0       8         2        2  0.444     8     2    0.250    9    3   
2      22.0      11         3        3  0.786     6     3    0.500    7    4   
3      21.0      15         4        0  0.938     7     5    0.714    8    5   
4      37.0       3         1        1  0.167     9     1    0.111    9    1   
..      ...     ...       ...      ...    ...   ...   ...      ...  ...  ...   
78     14.0       2         2        0  0.500     1     0    0.000    2    1   
79      9.0       5         4        1  0.625     3     1    0.333    4    2   
80      9.0       0         1        0  0.000     2     0    0.000    2    0   
81     22.0       2         5        3  0.200     4     0    0.000    5    1   
82     13.0       0         1        0  0.000     3     0    0.000    3    0   

    ...  opp_fg3_pct  opp_fga  opp_fta 

In [166]:
# Turn 'points' into the prediction target: shift(-1) pairs each row's features
# with the points value of the following row. The last row has no following
# row, so its target is NaN and dropna() removes it.
# Written as a non-mutating chain so the frame returned by get_features is not
# modified in place (no SettingWithCopyWarning, no inplace=True).
# Caveat: `data` is still rebound because later cells read that name, so
# re-running this cell shifts the target a second time; re-run the cell that
# calls get_features first.
data = (
    data
    .assign(points=data['points'].shift(-1))
    .replace("", np.nan)
    .dropna()
)
print(data[['date', 'points']])

         date  points
0  2023-11-13     8.0
1  2023-11-15    11.0
2  2023-11-17    15.0
3  2023-11-19     3.0
4  2023-11-20    10.0
..        ...     ...
77 2024-05-11     2.0
78 2024-05-13     5.0
79 2024-05-15     0.0
80 2024-05-21     2.0
81 2024-05-23     0.0

[82 rows x 2 columns]


In [167]:

# Positional hold-out split: the leading 80% of rows become the training set
# and the remaining rows the test set (no shuffling).
train_size = 0.8
split_index = int(train_size * len(data))

train_df, test_df = data.iloc[:split_index], data.iloc[split_index:]

# Print the dates on each side of the cut to sanity-check the boundary.
print("\nTraining DataFrame:")
print(train_df['date'])

print("\nTesting DataFrame:")
print(test_df['date'])



Training DataFrame:
0    2023-11-13
1    2023-11-15
2    2023-11-17
3    2023-11-19
4    2023-11-20
        ...    
60   2024-03-28
61   2024-03-30
62   2024-04-01
63   2024-04-03
64   2024-04-05
Name: date, Length: 65, dtype: datetime64[ns]

Testing DataFrame:
65   2024-04-07
66   2024-04-09
67   2024-04-11
68   2024-04-12
69   2024-04-14
70   2024-04-21
71   2024-04-24
72   2024-04-27
73   2024-04-29
74   2024-05-01
75   2024-05-07
76   2024-05-09
77   2024-05-11
78   2024-05-13
79   2024-05-15
80   2024-05-21
81   2024-05-23
Name: date, dtype: datetime64[ns]


In [168]:
# Separate model inputs from the target. 'points' is the target and 'date' is
# not used as a model input, so both are removed from the feature matrices.
X_train = train_df.drop(columns=['date', 'points'])
y_train = train_df['points']
X_test = test_df.drop(columns=['date', 'points'])
y_test = test_df['points']


In [169]:
import xgboost as xgb

# Gradient-boosted regressor fitted on the training split.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',  # squared-error regression loss
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)

# Kept as the last expression so the notebook displays the fitted estimator.
xg_reg.fit(X_train, y_train)

In [170]:
# Predict the target for the held-out rows; these predictions are later paired
# positionally with y_test.
y_pred = xg_reg.predict(X_test)
print(y_pred)

[11.313813  11.237073   9.252188  12.004046  10.682685  11.406563
 10.215805   9.739797  10.8360195  8.326561  12.442875  11.9209585
 10.358662  13.431827  11.8719     9.433999   6.6624713]


In [171]:
# Line predictions up against the actual values. y_test keeps its original
# row labels from the split, so its index is reset to 0..n-1 to match the
# fresh index of the prediction Series.
y_pred = pd.Series(y_pred)
results = pd.DataFrame({
    'y_pred': y_pred,
    'y_test': y_test.reset_index(drop=True),
})

In [172]:
# Write the prediction/actual pairs to test.csv (relative to the current
# working directory), then print them.
results.to_csv('test.csv')
print(results)

       y_pred  y_test
0   11.313813     8.0
1   11.237073    15.0
2    9.252188    16.0
3   12.004046    16.0
4   10.682685    12.0
5   11.406563     6.0
6   10.215805     5.0
7    9.739797     0.0
8   10.836020    17.0
9    8.326561     0.0
10  12.442875     2.0
11  11.920959     6.0
12  10.358662     2.0
13  13.431827     5.0
14  11.871900     0.0
15   9.433999     2.0
16   6.662471     0.0
