# Finalizing a Prediction Strategy

In previous notebooks, we explored different data preprocessing techniques, engineered features, and tested various sklearn models for predicting NBA games.

Now, we will attempt to finalize a prediction strategy based on what we've learned.

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics

## Read the dataset

In [None]:
# Relative location of the processed team dataset CSV.
path_to_data = "../../data/processed/processed_team_dataset.csv"

# Column 0 of the CSV becomes the DataFrame index instead of a data column.
df = pd.read_csv(filepath_or_buffer=path_to_data, index_col=0)
df

In [None]:
# Sanity check: the dataset is expected to contain no null values.

# Per-column count of missing entries
missing_values = df.isnull().sum()

# Grand total of missing entries over all columns (0 means nothing is missing)
total_missing = int(missing_values.sum())
print(total_missing)

## Separate relevant features from the labels

In [None]:
# Columns left out of the feature matrix (most of them are non-numeric or are
# not useful for machine learning; "won" is the label predicted later on).
drop_columns = ["date", "season", "team", "team_opp", "won"]

# Keep every column whose name is not in the exclusion list
is_feature_column = ~df.columns.isin(drop_columns)
selected_columns = df.columns[is_feature_column]

# Selected features
features_df = df.loc[:, selected_columns]
features_df

In [None]:
# Prediction target: the "won" column of df
label = df.loc[:, "won"]
label

### Remember that Elo is about 64.48% accurate at predicting the winner of a game

In [None]:
# Elo baseline: call it a win whenever the projected point spread is positive.
projected_point_spread = df["team_point_diff_proj"]
projected_win_from_elo = projected_point_spread.gt(0)

# Share of games where that call agrees with the actual outcome
elo_baseline_accuracy = metrics.accuracy_score(label, projected_win_from_elo)
print(elo_baseline_accuracy)

## Scale data for Logistic Regression

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with MinMaxScaler's default settings.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next section, so test rows influence the scaling -- confirm this is
# acceptable for the accuracy figures reported below.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(features_df)

# Wrap the scaled values back into a DataFrame with the original column names
# (no index is passed, so the rows get a fresh default index).
scaled_features_df = pd.DataFrame(data=scaled_values, columns=features_df.columns)
scaled_features_df

## Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for testing (the remaining 70% is used for training).
test_ratio = 0.3

# test_ratio is now actually passed to the split (it used to be defined but
# ignored in favour of a second hard-coded 0.3), and the seed is fixed so the
# split -- and every accuracy figure derived from it -- is the same on each
# Restart & Run All.
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
    scaled_features_df,
    label,
    test_size=test_ratio,
    random_state=42,
)

x_train_scaled

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with max_iter=200 and otherwise default settings
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
log_model.fit(x_train_scaled, y_train_scaled)

# Accuracy of the hard class predictions on the held-out test rows
y_pred_log = log_model.predict(x_test_scaled)
baseline_accuracy = metrics.accuracy_score(y_test_scaled, y_pred_log)
print(baseline_accuracy)

From our previous tests, we found that logistic regression gave around 65% accuracy. Let us try to find the optimal parameters.

In [None]:
# Hyper-parameter search for the logistic regression, kept commented out; the
# best parameters it reported are recorded in the markdown further down.
# NOTE(review): not every penalty/solver pair in this grid is accepted by
# LogisticRegression (and 'elasticnet' additionally needs l1_ratio) -- confirm
# how the failing combinations were handled before re-enabling this search.
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'C': [0.001, 0.01, 0.1, 0.5, 0.75, 1, 1.25, 1.5, 2, 5, 10, 100],
#     'penalty': ['l1', 'l2', 'elasticnet', 'none'],
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }

# log_model = LogisticRegression(max_iter=1000)
# clf = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy', verbose=3)

In [None]:
# clf.fit(x_train_scaled, y_train_scaled)

In [None]:
# print("Best Parameters:", clf.best_params_)

### Train using best parameters

We have found "optimal" parameters with GridSearchCV:

Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

Let's evaluate the model using these parameters.

In [None]:
# Refit with the hyper-parameters reported by the grid search
opt_log_model = LogisticRegression(
    C=0.01,
    penalty='l2',
    solver='liblinear',
    max_iter=1000,
    verbose=2,
    random_state=42,
)
opt_log_model.fit(x_train_scaled, y_train_scaled)

# Hard class predictions on the test rows and their accuracy
y_pred = opt_log_model.predict(x_test_scaled)
opt_accuracy = metrics.accuracy_score(y_test_scaled, y_pred)
print(opt_accuracy)

It seems like the parameters do not give much of a performance bump. Nonetheless, let's use this "optimized" model further.

### See accuracy of "high" confidence predictions

In [None]:
# Class probabilities for every test row (one column per class)
y_pred_prob_log = opt_log_model.predict_proba(X=x_test_scaled)
y_pred_prob_log

In [None]:
prob_threshold = 0.7

# A prediction counts as "high confidence" when the probability of either
# class 0 or class 1 meets or exceeds the threshold.
class0_confident = y_pred_prob_log[:, 0] >= prob_threshold
class1_confident = y_pred_prob_log[:, 1] >= prob_threshold
is_high_confidence = class0_confident | class1_confident

# Row positions (within the test set) of the high-confidence predictions
high_confidence_indices = list(np.flatnonzero(is_high_confidence))
print(len(high_confidence_indices))

In [None]:
# Extract the predictions and actual labels for these high confidence predictions.
# predict_proba orders its columns like opt_log_model.classes_, so the argmax
# column is mapped back through classes_ to get a real class label rather than
# a bare column number.
high_conf_columns = np.argmax(y_pred_prob_log[high_confidence_indices], axis=1)
high_conf_predictions = opt_log_model.classes_[high_conf_columns]

# Positional lookup: the indices are row positions within the test set, not
# labels from the original DataFrame index.
high_conf_actual = y_test_scaled.reset_index(drop=True).iloc[high_confidence_indices]
high_conf_actual

In [None]:
# Accuracy of the model on the high-confidence subset only
high_conf_accuracy = metrics.accuracy_score(high_conf_actual, high_conf_predictions)
print(high_conf_accuracy)

### See accuracy of "low" confidence predictions

In [None]:
## What about the accuracy of those outside of the "high_conf_predictions"?

# Set membership is O(1), so building the complement is linear in the number
# of test rows instead of scanning the high-confidence list once per row.
high_confidence_lookup = {int(i) for i in high_confidence_indices}
low_confidence_indices = [
    i for i in range(len(y_pred_prob_log)) if i not in high_confidence_lookup
]
print(len(low_confidence_indices))

# Every test row must land in exactly one of the two groups
assert len(low_confidence_indices) + len(high_confidence_indices) == len(y_pred_prob_log)

In [None]:
# Extract the predictions and actual labels for these "low" confidence predictions.
# predict_proba orders its columns like opt_log_model.classes_, so the argmax
# column is mapped back through classes_ to get a real class label rather than
# a bare column number.
low_conf_columns = np.argmax(y_pred_prob_log[low_confidence_indices], axis=1)
low_conf_predictions = opt_log_model.classes_[low_conf_columns]

# Positional lookup: the indices are row positions within the test set, not
# labels from the original DataFrame index.
low_conf_actual = y_test_scaled.reset_index(drop=True).iloc[low_confidence_indices]
low_conf_actual

In [None]:
# Accuracy of the model on the low-confidence subset only
low_conf_accuracy = metrics.accuracy_score(low_conf_actual, low_conf_predictions)
print(low_conf_accuracy)

In [None]:
# Feature rows of the low-confidence test samples, re-indexed by test-set position
x_test_low_confidence = x_test_scaled.reset_index(drop=True).loc[low_confidence_indices]

# Features and labels must refer to exactly the same rows, in the same order
feature_rows = x_test_low_confidence.index.values.tolist()
label_rows = low_conf_actual.index.values.tolist()
assert feature_rows == label_rows
x_test_low_confidence

In [None]:
# Elo's call for the low-confidence rows: a win when the projected spread is positive.
#
# x_test_low_confidence holds MinMax-scaled values, so comparing against a
# literal 0 would be true for nearly every row. MinMaxScaler transforms a raw
# value x into x * scale_ + min_, which means a raw spread of 0 lands exactly
# on min_ for that column -- that is the threshold to compare against.
elo_column = "team_point_diff_proj"
elo_column_position = features_df.columns.get_loc(elo_column)
zero_spread_scaled = scaler.min_[elo_column_position]

elo_proj_low_confidence = x_test_low_confidence[elo_column] > zero_spread_scaled
elo_proj_low_confidence

In [None]:
# Accuracy of the Elo projection on the low-confidence subset
elo_low_conf_accuracy = metrics.accuracy_score(low_conf_actual, elo_proj_low_confidence)
print(elo_low_conf_accuracy)

## K-fold Cross Validation

Let's see if our log model is good for future data. What we can try to do is a [k-fold cross validation](https://www.youtube.com/watch?v=kituDjzXwfE&t=698s).

In [None]:
# Cross-validation inputs: the full scaled feature table and its labels
X, Y = scaled_features_df, label

In [None]:
# Fresh, unfitted estimators.
# Baseline configuration (max_iter=200, otherwise defaults):
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
# Configuration found by the grid search:
opt_log_model_cv = LogisticRegression(
    C=0.01, penalty='l2', solver='liblinear', max_iter=1000, verbose=2, random_state=42
)

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 20 shuffled folds with a fixed seed so the fold assignment is reproducible
kf = KFold(n_splits=20, shuffle=True, random_state=42)

# Performing k-fold cross-validation.
# Use the unfitted estimator created for this purpose (opt_log_model_cv)
# instead of opt_log_model, which was already trained on the earlier
# train/test split; cross_val_score fits a copy of the estimator on each fold.
cv_scores = cross_val_score(opt_log_model_cv, X, Y, cv=kf, scoring="accuracy")

# Printing CV scores
print("CV scores:", cv_scores)

# Averaging CV scores (numpy is already imported as np at the top of the notebook)
print("Average score:", np.round(cv_scores.mean(), 2))

In [None]:
# What if we did just Elo?
# NOTE(review): the cells below build X from `non_elo_features`, i.e. they
# drop the Elo columns rather than keep only them -- confirm which of the two
# experiments is intended.

In [None]:
scaled_features_df

In [None]:
# Columns that carry Elo information ...
elo_features = [
    "team_elo_before",
    "team_opp_elo_before",
    "team_expected_win_probability",
    "team_point_diff_proj",
]

# ... and every remaining column of the scaled feature table
is_elo_column = scaled_features_df.columns.isin(elo_features)
non_elo_features = scaled_features_df.columns[~is_elo_column]

In [None]:
# Cross-validation inputs, restricted to the non-Elo columns
X, Y = scaled_features_df[non_elo_features], label

In [None]:
X

In [None]:
# Fresh, unfitted estimators.
# Baseline configuration (max_iter=200, otherwise defaults):
log_model = LogisticRegression(max_iter=200, verbose=2, random_state=42)
# Configuration found by the grid search:
opt_log_model_cv = LogisticRegression(
    C=0.01, penalty='l2', solver='liblinear', max_iter=1000, verbose=2, random_state=42
)

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 20 shuffled folds with a fixed seed so the fold assignment is reproducible
kf = KFold(n_splits=20, shuffle=True, random_state=42)

# Performing k-fold cross-validation.
# Use the unfitted estimator created for this purpose (opt_log_model_cv)
# instead of opt_log_model, which was already trained on the earlier
# train/test split; cross_val_score fits a copy of the estimator on each fold.
cv_scores = cross_val_score(opt_log_model_cv, X, Y, cv=kf, scoring="accuracy")

# Printing CV scores
print("CV scores:", cv_scores)

# Averaging CV scores (numpy is already imported as np at the top of the notebook)
print("Average score:", np.round(cv_scores.mean(), 2))