In [None]:
# Separate the scaled plus/minus frame into the target and its predictors.

# target column
y = baseline_pm_scaled['PLUS_MINUS']

# feature columns: everything except the target
X = baseline_pm_scaled.drop(columns=['PLUS_MINUS'])

In [None]:
# Expanding-window evaluation of a linear regression on X / y using
# scikit-learn's TimeSeriesSplit, timing the whole loop.
# NOTE(review): with n_splits = len(X) - 1 and the default test_size, the
# splitter should yield single-row test folds and a very small first training
# fold -- confirm against the TimeSeriesSplit documentation.
n_splits = len(X) - 1
tscv = TimeSeriesSplit(n_splits=n_splits)

# one estimator instance, refitted from scratch on every split
glm = LinearRegression()

# out-of-sample predictions and the per-split RMSE values
predictions, rmse_scores = [], []

start_time = time.time()

for train_index, test_index in tscv.split(X):
    # slice the training and test rows for this split
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # refit, then keep only the first predicted value of the test fold
    prediction = glm.fit(X_train, y_train).predict(X_test)
    predictions.append(prediction[0])

    # score this split
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    rmse_scores.append(rmse)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

In [None]:
# Expanding-window baseline: linear regression predicting TOTAL_PTS,
# refitted on every split produced by utl.expanding_window_ts_split.
start_time = time.time()

# configuration for expanding window
df = baseline_pts_scaled  # data set to use
target_col = 'TOTAL_PTS'  # target column name
initial_train_size = 100  # starting size of the training set
test_size = 1             # leave-one-out (LOO) cross-validation

# out-of-sample predictions and the per-split RMSE values
predictions, rmse_scores = [], []

splits = utl.expanding_window_ts_split(df, initial_train_size, test_size=test_size)
for train_indices, test_indices in splits:

    # slice each window once, then separate features from the target
    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]
    X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
    X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

    # fresh model per split: fit on the window, predict the held-out rows
    lin_reg = LinearRegression().fit(X_train, y_train)
    prediction = lin_reg.predict(X_test)
    predictions.extend(prediction)

    # RMSE of this split's prediction(s)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    rmse_scores.append(rmse)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

In [None]:
# Overall RMSE across all splits.
# Each entry of rmse_scores is the RMSE of one split. Assuming every split
# holds a single test row (test_size = 1), that entry is just the absolute
# error of one prediction, so a plain mean of the entries would be the mean
# absolute error rather than an RMSE. Squaring the entries, averaging, and
# taking the root yields the RMSE pooled over all predictions (exact whenever
# all splits contain the same number of test rows).
average_rmse = np.sqrt(np.mean(np.square(rmse_scores)))
print(f"Average RMSE: {average_rmse:.2f}")

In [None]:
# Expanding-window baseline: linear regression predicting PLUS_MINUS,
# refitted on every split produced by utl.expanding_window_ts_split.
start_time = time.time()

# configuration for expanding window
df = baseline_pm_scaled    # data set to use
target_col = 'PLUS_MINUS'  # target column name
initial_train_size = 100   # starting size of the training set
test_size = 1              # leave-one-out (LOO) cross-validation

# out-of-sample predictions and the per-split RMSE values
predictions, rmse_scores = [], []

splits = utl.expanding_window_ts_split(df, initial_train_size, test_size=test_size)
for train_indices, test_indices in splits:

    # slice each window once, then separate features from the target
    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]
    X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
    X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

    # fresh model per split: fit on the window, predict the held-out rows
    lin_reg = LinearRegression().fit(X_train, y_train)
    prediction = lin_reg.predict(X_test)
    predictions.extend(prediction)

    # RMSE of this split's prediction(s)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    rmse_scores.append(rmse)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

In [None]:
# Overall RMSE across all splits.
# Each entry of rmse_scores is the RMSE of one split. Assuming every split
# holds a single test row (test_size = 1), that entry is just the absolute
# error of one prediction, so a plain mean of the entries would be the mean
# absolute error rather than an RMSE. Squaring the entries, averaging, and
# taking the root yields the RMSE pooled over all predictions (exact whenever
# all splits contain the same number of test rows).
average_rmse = np.sqrt(np.mean(np.square(rmse_scores)))
print(f"Average RMSE: {average_rmse:.2f}")

In [None]:
# Expanding-window baseline: logistic regression predicting GAME_RESULT.
# Collects the predicted probability of class 1 and the true label for every
# held-out row so that metrics can be computed over all predictions afterwards.
start_time = time.time()

# configuration for expanding window
initial_train_size = 30  # starting size of the training set (to ensure some diversity of class labels)
test_size = 1  # leave-one-out (LOO) cross-validation
target_col = 'GAME_RESULT'  # target column name
df = baseline_res_scaled # data set to use

# storage for predictions and true labels
prob_predictions = []
y_true = []

# iterate over expanding window splits
# The splitter is driven by `df` (not a hard-coded frame) so the positions it
# yields always refer to the same frame that is sliced with .iloc below.
for train_indices, test_indices in utl.expanding_window_ts_split(
    df, initial_train_size, test_size, ensure_diversity=True, target_col=target_col):

    # slice each window once, then separate features from the target
    train_df = df.iloc[train_indices]
    test_df = df.iloc[test_indices]
    X_train = train_df.drop(columns=target_col)
    X_test = test_df.drop(columns=target_col)
    y_train = train_df[target_col]
    y_test = test_df[target_col]

    # train the logistic regression model
    log_reg = LogisticRegression(max_iter=1000, solver='liblinear')
    log_reg.fit(X_train, y_train)

    # predict the probability for the class of interest
    # (second predict_proba column; assumes class 1 is the positive class)
    proba = log_reg.predict_proba(X_test)[:, 1] # probability of class 1
    prob_predictions.extend(proba)

    # collect true labels for metrics calculations
    y_true.extend(y_test)

end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

In [None]:
# Metrics computed once over all collected out-of-sample predictions.

# probabilities strictly above the threshold become label 1, all others 0
threshold = 0.5
pred_labels = [int(p > threshold) for p in prob_predictions]

# accuracy and F1 use the thresholded labels; AUC uses the raw probabilities
average_accuracy = accuracy_score(y_true, pred_labels)
overall_auc = roc_auc_score(y_true, prob_predictions)
average_f1_score = f1_score(y_true, pred_labels)

print(f"Average Accuracy: {average_accuracy:.2f}")
print(f"Overall AUC: {overall_auc:.2f}")
print(f"Average F1 Score: {average_f1_score:.2f}")

# OLD ROLLING

In [None]:
# Rolling-window baseline: linear regression predicting TOTAL_PTS,
# refitted on every split produced by utl.rolling_window_ts_split.
start_time = time.time()

# configuration for rolling window
df = baseline_pts_scaled  # data set to use
target_col = 'TOTAL_PTS'  # target column name
train_size = 500          # training window size
test_size = 1             # leave-one-out (LOO) cross-validation

# out-of-sample predictions and the per-split RMSE values
predictions, rmse_scores = [], []

# rolling window
for train_index, test_index in utl.rolling_window_ts_split(df, train_size, test_size):

    # slice each window once, then separate features from the target
    train_df, test_df = df.iloc[train_index], df.iloc[test_index]
    X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
    X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

    # fresh model per window
    lin_reg = LinearRegression().fit(X_train, y_train)

    # predict the held-out row(s) and keep the result
    prediction = lin_reg.predict(X_test)
    predictions.extend(prediction)

    # RMSE of this window's prediction(s)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    rmse_scores.append(rmse)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

In [None]:
# Overall RMSE across all predictions.
# Each entry of rmse_scores is the RMSE of one split. Assuming every split
# holds a single test row (test_size = 1), that entry is just the absolute
# error of one prediction, so a plain mean of the entries would be the mean
# absolute error rather than an RMSE. Squaring the entries, averaging, and
# taking the root yields the RMSE pooled over all predictions (exact whenever
# all splits contain the same number of test rows).
average_rmse = np.sqrt(np.mean(np.square(rmse_scores)))
print(f"Average RMSE: {average_rmse:.2f}")

In [None]:
# Rolling-window baseline: linear regression predicting PLUS_MINUS,
# refitted on every split produced by utl.rolling_window_ts_split.
start_time = time.time()

# configuration for rolling window
df = baseline_pm_scaled    # data set to use
target_col = 'PLUS_MINUS'  # target column name
train_size = 500           # training window size
test_size = 1              # leave-one-out (LOO) cross-validation

# out-of-sample predictions and the per-split RMSE values
predictions, rmse_scores = [], []

# rolling window
for train_index, test_index in utl.rolling_window_ts_split(df, train_size, test_size):

    # slice each window once, then separate features from the target
    train_df, test_df = df.iloc[train_index], df.iloc[test_index]
    X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
    X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

    # fresh model per window
    lin_reg = LinearRegression().fit(X_train, y_train)

    # predict the held-out row(s) and keep the result
    prediction = lin_reg.predict(X_test)
    predictions.extend(prediction)

    # RMSE of this window's prediction(s)
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    rmse_scores.append(rmse)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

In [None]:
# Overall RMSE across all predictions.
# Each entry of rmse_scores is the RMSE of one split. Assuming every split
# holds a single test row (test_size = 1), that entry is just the absolute
# error of one prediction, so a plain mean of the entries would be the mean
# absolute error rather than an RMSE. Squaring the entries, averaging, and
# taking the root yields the RMSE pooled over all predictions (exact whenever
# all splits contain the same number of test rows).
average_rmse = np.sqrt(np.mean(np.square(rmse_scores)))
print(f"Average RMSE: {average_rmse:.2f}")

In [None]:
# Rolling-window baseline: logistic regression predicting GAME_RESULT.
# Collects the predicted probability of class 1 and the true label for every
# held-out row so that metrics can be computed over all predictions afterwards.
start_time = time.time()

# configuration for rolling window
df = baseline_res_scaled    # data set to use
target_col = 'GAME_RESULT'  # target column name
train_size = 500            # training window size
test_size = 1               # leave-one-out (LOO) cross-validation

# predicted probabilities and the matching true labels
prob_predictions, y_true = [], []

# ensure_diversity / target_col are passed so the splitter can take the class
# labels into account (logistic regression needs more than one class to fit)
splits = utl.rolling_window_ts_split(
    df, train_size, test_size, ensure_diversity=True, target_col=target_col)
for train_index, test_index in splits:

    # slice each window once, then separate features from the target
    train_df, test_df = df.iloc[train_index], df.iloc[test_index]
    X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
    X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

    # fresh classifier per window
    log_reg = LogisticRegression(max_iter=1000, solver='liblinear').fit(X_train, y_train)

    # second predict_proba column (assuming class 1 is the positive class)
    proba = log_reg.predict_proba(X_test)[:, 1]
    prob_predictions.extend(proba)

    # keep the true labels for the metric calculations
    y_true.extend(y_test)

end_time = time.time()
total_time = end_time - start_time
print(f"Total time taken: {total_time:.2f} seconds")

In [None]:
# Metrics computed once over all collected out-of-sample predictions.

# probabilities strictly above the threshold become label 1, all others 0
threshold = 0.5
pred_labels = [int(p > threshold) for p in prob_predictions]

# accuracy and F1 use the thresholded labels; AUC uses the raw probabilities
average_accuracy = accuracy_score(y_true, pred_labels)
overall_auc = roc_auc_score(y_true, prob_predictions)
average_f1_score = f1_score(y_true, pred_labels)

print(f"Average Accuracy: {average_accuracy:.2f}")
print(f"Overall AUC: {overall_auc:.2f}")
print(f"Average F1 Score: {average_f1_score:.2f}")