In [21]:
import pandas as pd
import numpy as np 
import pyodbc
from slugify import slugify
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score,accuracy_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler

from utils import PyroBayesianLogisticRegression, KFoldCrossValidationBobby,send_email 
from sklearn.metrics import f1_score
from datetime import date

from sklearn.model_selection import ParameterGrid
import numpy as np

import numpy as np
import tensorflow as tf
import random
import os

# Set seed for reproducibility
seed_value = 42

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)



# Note: For TensorFlow 2.x, you may not need to set up a session as shown above. `tf.random.set_seed(seed_value)` is often sufficient.


In [22]:
def autoencode_features(data, num_features=10):
    # Preprocess data: Normalize features
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)

    # Dimensionality specifics
    input_dim = data_scaled.shape[1]
    encoding_dim = num_features

    # Building the autoencoder
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(input_layer, decoded)

    # Encoder model
    encoder = Model(input_layer, encoded)

    # Compile autoencoder
    autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')

    # Configure early stopping
    early_stopping = EarlyStopping(monitor='loss', min_delta=0.00001, patience=10, verbose=1, mode='auto')

    # Train the autoencoder
    autoencoder.fit(data_scaled, data_scaled, epochs=5000, batch_size=256, shuffle=True, verbose=0, callbacks=[early_stopping], use_multiprocessing=True)

    # Encode the data
    encoded_data = encoder.predict(data_scaled, verbose=False, use_multiprocessing=True)

    return encoded_data, encoder, scaler

In [23]:
df = pd.read_csv(r'C:\Users\bobby\Downloads\rw-sleeper-predictions-2024-03-19 (1).csv')

In [24]:
df['line_Type'] = ''
for i, row in df.iterrows():
    if row['Market Name'] == 'Rebounds':
        df.loc[i,'line_Type'] ='REB'
    elif row['Market Name'] == '3PT Made':
        df.loc[i,'line_Type'] ='FG3M'
    elif row['Market Name'] == 'Assists':
        df.loc[i,'line_Type'] ='AST'
    elif row['Market Name'] == 'Blocks':
        df.loc[i,'line_Type'] ='BLK'
    elif row['Market Name'] == 'PTS+REB+AST':
        df.loc[i,'line_Type'] ='PTS+REB+AST'
    elif row['Market Name'] == 'Turnovers':
        df.loc[i,'line_Type'] ='TOV'
    elif row['Market Name'] == 'Points':
        df.loc[i,'line_Type'] ='PTS'
    elif row['Market Name'] == 'Steals':
        df.loc[i,'line_Type'] ='STL'
        
df['Opponent'] = df['Opponent'].apply(lambda x: x.replace('@','').strip())

In [25]:
df['Date'] = pd.to_datetime(df['Date'])
# Slugify player names for both DataFrames
df['Player_Slug'] = df['Player'].apply(slugify)


In [26]:
server = 'localhost\SQLEXPRESS'
database = 'nba_game_data'

cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';')
cursor = cnxn.cursor()
sql = """
SELECT distinct
      pgl.[PLAYER_NAME]
      ,pgl.PLAYER_ID

  FROM [nba_game_data].[dbo].[PlayerGameLogs] pgl
  LEFT OUTER JOIN [nba_game_data].[dbo].[player_clustering] pc
   on
    cast(pgl.PLAYER_ID as int) = cast(pc.PLAYER_ID as int)
  where yearSeason = 2024
  """
players = pd.read_sql(sql,cnxn)


In [27]:
players['Player_Slug'] = players['PLAYER_NAME'].apply(slugify)
df = pd.merge(df, players, how='left', left_on = ['Player_Slug'],right_on=['Player_Slug'])

In [28]:
df.dropna(subset='PLAYER_ID', inplace=True)

In [29]:
df['Team']

0      BKN
1      BKN
2      BKN
3      ORL
4      SAS
      ... 
153    BKN
154    MIN
155    SAS
156    CHA
157    BKN
Name: Team, Length: 142, dtype: object

In [30]:
lines = df[['PLAYER_ID','line_Type','Line','Opponent','PLAYER_NAME','Team']]

In [31]:
lines['PLAYER_ID']=lines['PLAYER_ID'].astype(int)

In [32]:
lines

Unnamed: 0,PLAYER_ID,line_Type,Line,Opponent,PLAYER_NAME,Team
0,1627827,PTS+REB+AST,14.5,NOP,Dorian Finney-Smith,BKN
1,1627827,PTS,7.5,NOP,Dorian Finney-Smith,BKN
2,203471,PTS+REB+AST,22.5,NOP,Dennis Schroder,BKN
3,1630591,PTS+REB+AST,16.5,CHA,Jalen Suggs,ORL
4,1630200,REB,3.5,DAL,Tre Jones,SAS
...,...,...,...,...,...,...
153,1627827,BLK,0.5,NOP,Dorian Finney-Smith,BKN
154,1630162,AST,4.5,DEN,Anthony Edwards,MIN
155,1641705,TOV,3.5,DAL,Victor Wembanyama,SAS
156,1628970,PTS,20.5,ORL,Miles Bridges,CHA


In [33]:
def process_row(row, server, database):
    conn_str = 'DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + database + ';'
    with pyodbc.connect(conn_str) as conn:
        cursor = conn.cursor()
        query = "EXEC seasonPtsDataSet @pts_thresh = ?, @player_id = ?, @line_type = ?"
        params = (row['Line'], int(row['PLAYER_ID']), str(row['line_Type']))
        cursor.execute(query, params)
        results = cursor.fetchall()
        if results:
            temp_df = pd.DataFrame.from_records(results, columns=[column[0] for column in cursor.description])
            temp_df['GAME_DATE'] = pd.to_datetime(temp_df['GAME_DATE'])
            temp_df['LineType'] = str(row['line_Type'])
            # Filter where Date matches GAME_DATE
            return temp_df
        
    return pd.DataFrame()  # Return an empty DataFrame if there are no results or an error occurs

In [34]:
# Function to execute for each row in parallel
def process_row_pred(row, server, database):
    conn_str = 'DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + database + ';'
    with pyodbc.connect(conn_str) as conn:
        cursor = conn.cursor()
        query = "EXEC seasonPtsDataSetPred @pts_thresh = ?, @player_id = ?, @line_type = ?, @opp = ?"
        params = (row['Line'], int(row['PLAYER_ID']), str(row['line_Type']), str(row['Opponent']))
        cursor.execute(query, params)
        results = cursor.fetchall()
        if results:
            temp_df = pd.DataFrame.from_records(results, columns=[column[0] for column in cursor.description])
            temp_df['LineType'] = str(row['line_Type'])
            return temp_df
    return pd.DataFrame()  # Return an empty DataFrame if there are no results or an error occurs


In [None]:
# output = pd.DataFrame()
# for index,row in lines.iterrows():
#     try:
data = process_row(row,server,database)
data = data.sort_values(by='GAME_DATE').reset_index(drop=True).iloc[1:]

data = data.drop(columns=['PLAYER_ID','GAME_DATE','LineType'])
features = data.columns
for column in data.columns:
    # Check if the column is numeric, as median can only be calculated for numeric columns
    if pd.api.types.is_numeric_dtype(data[column]):
        # Calculate the median of the column, skipping NA values
        median_value = data[column].median(skipna=True)
        # Fill NA values in the column with the calculated median
        data[column] = data[column].fillna(median_value)
X = data.drop(columns='lineThresh')
X = X.fillna(-1)
y = data['lineThresh']
X, encoder, scaler = autoencode_features(X)

param_grid = {
    'n_steps': [200, 500, 100],  # Example values
    'lr': [0.01, 0.001, .1],  # Learning rates
    'batch_size': [250, 500, 1000]  # Batch sizes
}

# Initialize a list to store the results of each configuration
results = []

In [54]:
def prepare_labels(data):
    """
    Prepares labels for the Over and Under models.

    Parameters:
    - data: DataFrame containing the original labels under 'lineThresh'

    Returns:
    - y_over: Labels for the Over model
    - y_under: Labels for the Under model, which are simply the inverted labels of y_over
    """
    y_over = data['lineThresh'].copy()
    # Assuming binary classification where '1' is over and '0' is under,
    # for the under model, we invert the labels
    y_under = 1 - y_over
    return y_over, y_under

from sklearn.model_selection import ParameterGrid

# Assume PyroBayesianLogisticRegression and your cross-validation scheme are defined

def train_model(X, y, param_grid):
    """
    Trains the model using grid search and cross-validation.

    Parameters:
    - X: Features for training the model.
    - y: Labels for training the model.
    - param_grid: Dictionary defining the grid of parameters to search.

    Returns:
    - best_score: The best precision score obtained.
    - best_params: The parameters corresponding to the best score.
    - best_model: The trained model with the best parameters.
    """
    best_score = 0
    best_params = None
    best_model = None

    for params in ParameterGrid(param_grid):
        # Initialize the model with current parameters
        blr = PyroBayesianLogisticRegression(n_steps=n_steps, lr=lr, batch_size=batch_size)
        loocv = KFoldCrossValidationBobby(blr, n_jobs=10, n_splits=10)

            # Perform the cross-validation. Assuming you have a method `cross_validate` that returns the evaluation metric(s)
            # Note: This step is pseudo-code. You will need to implement or adjust the cross_validate method based on your setup.
        avg_precision = loocv.cross_validate(X, y)  # X and y should be your data and labels respectively

#         avg_precision = loocv.cross_validate(X, y)
        
        # Calculate average precision across folds
#         avg_precision = sum(cv_scores) / len(cv_scores)
        
        if avg_precision > best_score:
            best_score = avg_precision
            best_params = params
            # Retrain on the entire dataset with the best parameters (for simplicity, we re-use the variable 'model')
            best_model = PyroBayesianLogisticRegression(n_steps=params['n_steps'], lr=params['lr'], batch_size=params['batch_size'])
            best_model.fit(X, y)  # Assuming .fit() method exists and works as expected

    return best_score, best_params, best_model

def fill_na_with_median(data):
    for column in data.columns:
        if pd.api.types.is_numeric_dtype(data[column]):
            median_value = data[column].median(skipna=True)
            data[column] = data[column].fillna(median_value)
    return data


In [65]:
lines = lines.iloc[:5]

In [66]:
import pandas as pd
from sklearn.model_selection import ParameterGrid
# Assuming PyroBayesianLogisticRegression and other custom functions are defined elsewhere

output = pd.DataFrame()

for index, row in lines.iterrows():
    try:
        # Process row to prepare the data
        data = process_row(row, server, database)
        data = data.sort_values(by='GAME_DATE').reset_index(drop=True).iloc[1:]
        data = data.drop(columns=['PLAYER_ID', 'GAME_DATE', 'LineType'])

        # Fill NA values and prepare features and labels
        data = fill_na_with_median(data)  # Assume implementation of this function based on your NA handling
        y_over, y_under = prepare_labels(data)  # Assume you modify labels here for over and under

        X = data.drop(columns='lineThresh')
        X = X.fillna(-1)

        # Autoencode features (assuming this function exists and returns encoded features, an encoder, and a scaler)
        X_encoded, encoder, scaler = autoencode_features(X)

        # Define parameter grid (same as your original grid)
        param_grid = {
            'n_steps': [200, 500, 100],
            'lr': [0.01, 0.001, .1],
            'batch_size': [250, 500, 1000]
        }

        # Train Over Model
        over_model_results = train_model(X_encoded, y_over, param_grid)
        # Train Under Model (with swapped labels)
        under_model_results = train_model(X_encoded, y_under, param_grid)

        # Select the best model based on precision score
        # Assuming train_model returns (best_score, best_params, fitted_model)
        if over_model_results[0] > under_model_results[0]:  # Compare precision scores
            best_model = over_model_results[2]  # Use Over Model
            label = 'Over'
        else:
            best_model = under_model_results[2]  # Use Under Model
            label = 'Under'
        # Predict using the selected model
        pred = process_row_pred(row, server, database)
        pred = pred.fillna(-1)
        # data_scaled = scaler.transform(pred[features].drop(columns=['lineThresh']))
        data_scaled = scaler.transform(pred[features].drop(columns=['lineThresh']))
        pred = encoder.predict(data_scaled, verbose=False, use_multiprocessing=True)
        pred_proba = best_model.predict_proba(pred)

        # Prepare output DataFrame
        temp = pd.DataFrame([[row['PLAYER_NAME'], row['Team'], row['line_Type'], row['Line'], label, pred_proba, max(over_model_results[0], under_model_results[0])]], columns=['Player', 'Team', 'line_Type', 'Line', 'Prediction', 'Proba', 'Model Score'])
        output = pd.concat([output, temp])
        
    except Exception as e:
        print(row['PLAYER_NAME'], row['Line'], str(e))

# Helper functions like fill_na_with_median, prepare_labels, train_model, etc., are assumed to be defined elsewhere


Epoch 1714: early stopping
Epoch 2300: early stopping
Epoch 1959: early stopping
Epoch 1708: early stopping
Epoch 1998: early stopping


In [67]:
output = output.sort_values(by='Proba', ascending=False)
send_email(f'Daily Line Picks for {str(date.today())}(Picky Bayes(With Embedding))', 'bobby.plourde12@yahoo.com', output)

Unnamed: 0,Player,Team,line_Type,Line,Prediction,Proba,Model Score
0,Dorian Finney-Smith,BKN,PTS+REB+AST,14.5,Under,0.40123212,0.48
0,Dorian Finney-Smith,BKN,PTS,7.5,Over,0.71996397,0.643333
0,Dennis Schroder,BKN,PTS+REB+AST,22.5,Under,0.54812765,0.416667
0,Jalen Suggs,ORL,PTS+REB+AST,16.5,Over,0.9417015,0.613333
0,Tre Jones,SAS,REB,3.5,Under,0.0007588118,0.558571


In [42]:
output = pd.DataFrame()
for index,row in lines.iterrows():
    try:
        data = process_row(row,server,database)
        data = data.sort_values(by='GAME_DATE').reset_index(drop=True).iloc[1:]

        data = data.drop(columns=['PLAYER_ID','GAME_DATE','LineType'])
        features = data.columns
        for column in data.columns:
            # Check if the column is numeric, as median can only be calculated for numeric columns
            if pd.api.types.is_numeric_dtype(data[column]):
                # Calculate the median of the column, skipping NA values
                median_value = data[column].median(skipna=True)
                # Fill NA values in the column with the calculated median
                data[column] = data[column].fillna(median_value)
        X = data.drop(columns='lineThresh')
        X = X.fillna(-1)
        y = data['lineThresh']
        X, encoder, scaler = autoencode_features(X)

        param_grid = {
            'n_steps': [200, 500, 100],  # Example values
            'lr': [0.01, 0.001, .1],  # Learning rates
            'batch_size': [250, 500, 1000]  # Batch sizes
        }

        # Initialize a list to store the results of each configuration
        results = []

        # Iterate over every combination in the parameter grid
        for params in ParameterGrid(param_grid):
            # Unpack parameters
            n_steps = params['n_steps']
            lr = params['lr']
            batch_size = params['batch_size']

            # Initialize your model and cross-validation scheme with the current set of parameters
            blr = PyroBayesianLogisticRegression(n_steps=n_steps, lr=lr, batch_size=batch_size)
            loocv = KFoldCrossValidationBobby(blr, n_jobs=10, n_splits=10)

            # Perform the cross-validation. Assuming you have a method `cross_validate` that returns the evaluation metric(s)
            # Note: This step is pseudo-code. You will need to implement or adjust the cross_validate method based on your setup.
            score = loocv.cross_validate(X, y)  # X and y should be your data and labels respectively

            # Store the results
            results.append((score, params))

        # Find the best parameter set based on the evaluation metric. Assuming higher score is better.
        best_score, best_params = max(results, key=lambda x: x[0])

        batch_size = best_params['batch_size']
        lr = best_params['lr']
        n_steps = best_params['n_steps']

        blr = PyroBayesianLogisticRegression(n_steps=n_steps, lr=lr, batch_size=batch_size)
        blr.fit(X,y)

        pred = process_row_pred(row,server,database)
        data_scaled = scaler.transform(pred[features].drop(columns=['lineThresh']))
        pred = encoder.predict(data_scaled, verbose=False, use_multiprocessing=True)
        pred_proba = blr.predict_proba(pred)

        temp = pd.DataFrame([[row['PLAYER_NAME'],row['Team'],row['line_Type'],row['Line'],pred_proba,best_score]],columns=['Player','Team','line_Type','Line','Proba','Model Score']) 

        output = pd.concat([output,temp])
    except:
        print(row['PLAYER_NAME'],row['Line'])
    

Epoch 2228: early stopping


In [41]:
output = output.sort_values(by='Proba', ascending=False)


In [None]:
from datetime import date


In [None]:
send_email(f'Daily Line Picks for {str(date.today())}(Picky Bayes(With Embedding))', 'bobby.plourde12@yahoo.com', output)

In [173]:
pred_labels

1

In [172]:
for threshold in thresholds:
        # Convert predicted probabilities to binary predictions based on the current threshold
        pred_labels = (pred_proba >= threshold).astype(int)

        # Calculate the F1 score for the current threshold
        score = f1_score(y, pred_labels)

        # Update the best score and best threshold if the current score is better
        if score > best_score:
            best_score = score
            best_threshold = threshold

TypeError: Singleton array 1 cannot be considered a valid collection.

In [152]:
print('model predicts:',pred_proba,'model score is:',best_score)


model predicts: 0.55525535 model score is: 0.6463492063492065


In [136]:
data = process_row(row,server,database)

In [137]:
data = data.sort_values(by='GAME_DATE').reset_index(drop=True).iloc[1:]


In [138]:
data = data.drop(columns=['PLAYER_ID','GAME_DATE','LineType'])
features = data.columns
for column in data.columns:
    # Check if the column is numeric, as median can only be calculated for numeric columns
    if pd.api.types.is_numeric_dtype(data[column]):
        # Calculate the median of the column, skipping NA values
        median_value = data[column].median(skipna=True)
        # Fill NA values in the column with the calculated median
        data[column] = data[column].fillna(median_value)

In [139]:
X = data.drop(columns='lineThresh')
y = data['lineThresh']
X, encoder, scaler = autoencode_features(X)

In [140]:
batch_size = 500
lr = 0.001 
n_steps = 200

In [147]:
blr = PyroBayesianLogisticRegression(n_steps=n_steps, lr=lr, batch_size=batch_size)
loocv = KFoldCrossValidationBobby(blr, n_jobs=-1, n_splits=15)
score = loocv.cross_validate(X, y)

In [148]:
blr.fit(X,y)

In [149]:
score

0.6485714285714288

In [150]:
pred = process_row_pred(row,server,database)
data_scaled = scaler.transform(pred[features].drop(columns=['lineThresh']))
pred = encoder.predict(data_scaled, verbose=False, use_multiprocessing=True)
pred_proba = blr.predict_proba(pred)
pred_proba

array(0.52450347, dtype=float32)

In [133]:
# Define your parameter grid as a dictionary. Adjust the values according to your needs.
param_grid = {
    'n_steps': [200, 500, 100],  # Example values
    'lr': [0.01, 0.001, .1],  # Learning rates
    'batch_size': [250, 500, 1000]  # Batch sizes
}

# Initialize a list to store the results of each configuration
results = []

# Iterate over every combination in the parameter grid
for params in ParameterGrid(param_grid):
    # Unpack parameters
    n_steps = params['n_steps']
    lr = params['lr']
    batch_size = params['batch_size']
    
    # Initialize your model and cross-validation scheme with the current set of parameters
    blr = PyroBayesianLogisticRegression(n_steps=n_steps, lr=lr, batch_size=batch_size)
    loocv = KFoldCrossValidationBobby(blr, n_jobs=-1, n_splits=15)
    
    # Perform the cross-validation. Assuming you have a method `cross_validate` that returns the evaluation metric(s)
    # Note: This step is pseudo-code. You will need to implement or adjust the cross_validate method based on your setup.
    score = loocv.cross_validate(X, y)  # X and y should be your data and labels respectively
    
    # Store the results
    print(score)
    results.append((score, params))

# Find the best parameter set based on the evaluation metric. Assuming higher score is better.
best_score, best_params = max(results, key=lambda x: x[0])

print(f"Best Score: {best_score}")
print(f"Best Parameters: {best_params}")

0.5463492063492065
0.3714285714285715
0.5574603174603175
0.5492063492063493
0.6485714285714288
0.4720634920634922
0.41365079365079366
0.4552380952380952
0.4425396825396825
0.5647619047619048
0.4285714285714286
0.42380952380952375
0.6485714285714288
0.6485714285714288
0.5285714285714287
0.4676190476190476
0.3707936507936508
0.32317460317460317
0.48698412698412696
0.43809523809523815
0.4536507936507937
0.6485714285714288
0.6485714285714288
0.4714285714285714
0.3536507936507937
0.40476190476190477
0.4904761904761905
Best Score: 0.6485714285714288
Best Parameters: {'batch_size': 250, 'lr': 0.001, 'n_steps': 500}


In [134]:
sorted(results, key=lambda x: x[0], reverse=True)

[(0.6485714285714288, {'batch_size': 250, 'lr': 0.001, 'n_steps': 500}),
 (0.6485714285714288, {'batch_size': 500, 'lr': 0.001, 'n_steps': 200}),
 (0.6485714285714288, {'batch_size': 500, 'lr': 0.001, 'n_steps': 500}),
 (0.6485714285714288, {'batch_size': 1000, 'lr': 0.001, 'n_steps': 200}),
 (0.6485714285714288, {'batch_size': 1000, 'lr': 0.001, 'n_steps': 500}),
 (0.5647619047619048, {'batch_size': 500, 'lr': 0.01, 'n_steps': 200}),
 (0.5574603174603175, {'batch_size': 250, 'lr': 0.01, 'n_steps': 100}),
 (0.5492063492063493, {'batch_size': 250, 'lr': 0.001, 'n_steps': 200}),
 (0.5463492063492065, {'batch_size': 250, 'lr': 0.01, 'n_steps': 200}),
 (0.5285714285714287, {'batch_size': 500, 'lr': 0.001, 'n_steps': 100}),
 (0.4904761904761905, {'batch_size': 1000, 'lr': 0.1, 'n_steps': 100}),
 (0.48698412698412696, {'batch_size': 1000, 'lr': 0.01, 'n_steps': 200}),
 (0.4720634920634922, {'batch_size': 250, 'lr': 0.001, 'n_steps': 100}),
 (0.4714285714285714, {'batch_size': 1000, 'lr': 0.0

In [132]:
sorted(results, key=lambda x: x[0], reverse=True)

[(0.6060441880101323, {'batch_size': 250, 'lr': 0.01, 'n_steps': 200}),
 (0.596236969234681, {'batch_size': 1000, 'lr': 0.01, 'n_steps': 200}),
 (0.5772311212814646, {'batch_size': 250, 'lr': 0.01, 'n_steps': 100}),
 (0.5643775213434656, {'batch_size': 500, 'lr': 0.01, 'n_steps': 200}),
 (0.5380434782608696, {'batch_size': 500, 'lr': 0.01, 'n_steps': 500}),
 (0.5332590115198811, {'batch_size': 500, 'lr': 0.1, 'n_steps': 200}),
 (0.5289855072463768, {'batch_size': 500, 'lr': 0.1, 'n_steps': 500}),
 (0.5113087395696092, {'batch_size': 250, 'lr': 0.1, 'n_steps': 100}),
 (0.5050088990592423, {'batch_size': 500, 'lr': 0.01, 'n_steps': 100}),
 (0.5050088990592423, {'batch_size': 500, 'lr': 0.001, 'n_steps': 200}),
 (0.4956521739130435, {'batch_size': 1000, 'lr': 0.1, 'n_steps': 100}),
 (0.43834223239257564, {'batch_size': 250, 'lr': 0.001, 'n_steps': 100}),
 (0.43834223239257564, {'batch_size': 500, 'lr': 0.001, 'n_steps': 100}),
 (0.43834223239257564, {'batch_size': 1000, 'lr': 0.001, 'n_st

In [None]:
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed
import numpy as np

# Define your parameter grid
param_grid = {
    'n_steps': [200, 500, 100],  # Example values
    'lr': [0.01, 0.001, 0.0001],  # Learning rates
    'batch_size': [len(X), 100, 250]  # Batch sizes
}

# Define a function to train and evaluate a single model configuration
def evaluate_model(params, X, y):
    n_steps = params['n_steps']
    lr = params['lr']
    batch_size = params['batch_size']
    
    # Initialize your model with the current set of parameters
    blr = PyroBayesianLogisticRegression(n_steps=n_steps, lr=lr, batch_size=batch_size)
    loocv = KFoldCrossValidationBobby(blr, n_jobs=2, n_splits=4)
    
    # Perform cross-validation
    # Note: This is pseudo-code. You will need to implement or adjust the cross_validate method.
    score = loocv.cross_validate(X, y)  # X and y should be your data and labels respectively
    
    return score, params

# Use joblib to parallelize the grid search
# n_jobs=-1 uses all available CPU cores. Adjust based on your system's capability or desired usage.
results = Parallel(n_jobs=-1,verbose=10)(
    delayed(evaluate_model)(params, X, y) for params in ParameterGrid(param_grid)
)

# Find the best parameter set based on the evaluation metric
best_score, best_params = max(results, key=lambda x: x[0])

print(f"Best Score: {best_score}")
print(f"Best Parameters: {best_params}")


In [59]:
process_row_pred(lines.iloc[0],server,database)

Unnamed: 0,PLAYER_ID,GAME_DATE,lineThresh,daysOfRest,secondHalfOfBackToBack,PlayerGotLastMeeting,NumberTimesHitAgaisntOppLast2,NumberTimesHitAgaisntOppLast5,PerGameAgainstOppLast2,PerGameAgainstOppLast5,...,DefRatingShortVsMedium,MPGRecentVsOffRating,FantPtsVsTeamNetRating,OffRatingVsNetRatingLong,OffRatingMediumVsDefRating,DefRatingVsTeamORating,FantPtsLongVsAvgPtsAllowedRecent,FantPtsVsDefRating,OffRatingVsPTSOffTOTrend,LineType
0,1627827,2024-03-16,1,2,0,13,1,3,13.33,16.0,...,12623.9631,2859.9377,29.498,-35.8645,13246.6698,13224.7191,2183.27,2434.2171,102.47,PTS+REB+AST
