In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as sk_metrics
import tempfile
import os
# Show at most 100 rows when a DataFrame is printed or displayed.
pd.options.display.max_rows = 100
# Preset matplotlib figure sizes.
matplotlib.rcParams['figure.figsize'] = [9, 6]

print(tf.__version__)
# To make the results reproducible, set the random seed value.
# tf.random.set_seed() only seeds TensorFlow; tf.keras.utils.set_random_seed()
# seeds Python's `random`, NumPy and TensorFlow in one call, so any code that
# draws from the other generators is reproducible as well.
random_seed = 22
tf.keras.utils.set_random_seed(random_seed)

2024-11-30 12:55:21.467309: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-30 12:55:21.476497: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-30 12:55:21.575180: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-30 12:55:21.575291: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-30 12:55:21.587711: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

2.15.0


In [2]:
# Locations of the input CSV files. Only the train/validation file is read in
# this cell; the test path is just recorded here.
test_data_path = 'test_data_ver1.csv'
train_and_validation_data_path = 'train_data_ver3.csv'

# Columns that are removed from the frame before any modelling.
columns_to_drop = [
    'id',
    'home_team_abbr',
    'away_team_abbr',
    'is_night_game',
    'home_pitcher',
    'away_pitcher',
    'home_team_rest',
    'away_team_rest',
    'home_pitcher_rest',
    'away_pitcher_rest',
    'season',
    'home_batting_leverage_index_avg_skew',
    'away_batting_leverage_index_avg_skew',
    'home_pitcher_SO_batters_faced_skew',
    'away_pitcher_SO_batters_faced_skew',
]

# Read the CSV and drop the unwanted columns in one step, so re-running the
# cell always starts from the file on disk.
dataset = pd.read_csv(train_and_validation_data_path).drop(columns=columns_to_drop)
# Disabled alternative: keep only a hand-picked subset of columns instead.
#dataset = dataset[['home_team_win', 'home_batting_onbase_plus_slugging_10RA', 'away_batting_onbase_plus_slugging_10RA', 'home_team_wins_mean', 'away_team_wins_mean', 'away_pitching_SO_batters_faced_10RA', 'away_pitching_SO_batters_faced_10RA', 'home_batting_onbase_plus_slugging_mean', 'away_batting_onbase_plus_slugging_mean']]

In [3]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Rank every feature with Recursive Feature Elimination (RFE).
# Split the frame into the feature matrix and the target column.
X_train = dataset.drop('home_team_win', axis=1)
Y_train = dataset['home_team_win']

# Logistic regression is the estimator whose coefficients drive elimination.
model = LogisticRegression()

# Keep only ONE feature: with the default step of one feature per round this
# makes RFE eliminate the features one at a time, so `ranking_` is a complete
# ordering of all features rather than a tie of "selected" features at rank 1.
rfe = RFE(estimator=model, n_features_to_select=1)

# Fit RFE on the training data
rfe.fit(X_train, Y_train)

# Get the ranking of features (1 = most important, higher = less important)
feature_ranking = rfe.ranking_

# Create a DataFrame to show features with their rankings
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Ranking': feature_ranking
})

# Sort the features by importance (lowest rank = most important)
feature_importance_df = feature_importance_df.sort_values(by='Ranking')

# Print the features from most important to least important
print("Feature Importance Ranking (Most to Least Important):")
print(feature_importance_df)


Feature Importance Ranking (Most to Least Important):
                                    Feature  Ranking
43                      away_team_wins_skew        1
15      away_pitching_SO_batters_faced_10RA        2
50   home_batting_onbase_plus_slugging_mean        3
44            home_batting_batting_avg_mean        4
128               home_pitcher_wpa_def_mean        5
..                                      ...      ...
65             away_batting_onbase_perc_std      144
51    home_batting_onbase_plus_slugging_std      145
83      home_pitching_SO_batters_faced_skew      146
39                       home_team_wins_std      147
104      away_pitching_H_batters_faced_skew      148

[148 rows x 2 columns]


In [76]:
class LogisticRegressionModel(tf.keras.Model):
    """Logistic regression written as a Keras model.

    The whole model is a single one-unit ``Dense`` layer with a sigmoid
    activation; ``call`` simply forwards its input through that layer.
    """

    def __init__(self):
        super().__init__()
        # One output unit squashed by a sigmoid.
        self.dense = tf.keras.layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs):
        """Return the output of the dense sigmoid layer applied to ``inputs``."""
        outputs = self.dense(inputs)
        return outputs
  

In [77]:
def _fold_bounds(n_rows: int, num_chunks: int):
    """Return the (start, stop) row positions of each contiguous fold."""
    chunk_size = n_rows // num_chunks
    bounds = [(i * chunk_size, (i + 1) * chunk_size) for i in range(num_chunks)]
    # The last fold absorbs any remaining rows (if n_rows isn't perfectly divisible).
    bounds[-1] = (bounds[-1][0], n_rows)
    return bounds


def cross_validation(num_chunks: int, epochs: int = 5, batch_size: int = 1,
                     learning_rate: float = 0.01, data=None):
    """Run k-fold cross-validation of ``LogisticRegressionModel``.

    The rows are split, in their existing order, into ``num_chunks``
    contiguous folds (the last fold also takes the remainder rows). For each
    fold a fresh model is trained with SGD on all other folds and evaluated
    on the held-out fold; the loss and accuracy of every fold are printed.

    Args:
        num_chunks: Number of folds. Must be at least 2 and no larger than
            the number of rows.
        epochs: Training epochs per fold (default 5).
        batch_size: Mini-batch size passed to ``fit`` (default 1).
        learning_rate: SGD learning rate (default 0.01).
        data: DataFrame containing the feature columns and a
            ``'home_team_win'`` target column. Defaults to the notebook-level
            ``dataset``.

    Returns:
        A tuple ``(best_accuracy, best_loss, best_model)`` taken from the fold
        with the highest validation accuracy. Note that this is the maximum
        over folds, not the average across folds.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 2 or larger than the
            number of rows, since some fold or training split would be empty.
    """
    if data is None:
        data = dataset

    n_rows = len(data)
    if num_chunks < 2:
        raise ValueError(f"num_chunks must be at least 2, got {num_chunks}")
    if num_chunks > n_rows:
        raise ValueError(
            f"num_chunks ({num_chunks}) cannot exceed the number of rows ({n_rows})"
        )

    # None marks "no fold evaluated yet".
    best_accuracy, best_loss, best_model = None, None, None

    for start, stop in _fold_bounds(n_rows, num_chunks):
        # Split by row POSITION rather than by index label, so the split stays
        # correct even if the frame's index labels are not unique.
        validation_dataset = data.iloc[start:stop]
        train_parts = [part for part in (data.iloc[:start], data.iloc[stop:]) if len(part)]
        train_dataset = pd.concat(train_parts)

        # Split the features and the target values
        x_train = train_dataset.drop('home_team_win', axis=1)
        y_train = train_dataset['home_team_win']
        x_validation = validation_dataset.drop('home_team_win', axis=1)
        y_validation = validation_dataset['home_team_win']

        # Convert the features and the target values to TensorFlow tensors
        x_train = tf.constant(x_train.to_numpy(dtype='float64'), dtype=tf.float64)
        y_train = tf.constant(y_train.to_numpy(dtype='float64'), dtype=tf.float64)
        x_validation = tf.constant(x_validation.to_numpy(dtype='float64'), dtype=tf.float64)
        y_validation = tf.constant(y_validation.to_numpy(dtype='float64'), dtype=tf.float64)

        # Instantiate a fresh model so no weights leak between folds
        model = LogisticRegressionModel()

        # Compile the model
        model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
                      loss=tf.keras.losses.BinaryCrossentropy(),
                      metrics=['accuracy'])

        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
        loss, accuracy = model.evaluate(x_validation, y_validation)
        print(f"Loss: {loss}, Accuracy: {accuracy}")

        # Keep the model from the fold with the highest validation accuracy.
        if best_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_loss = loss
            best_model = model

    return best_accuracy, best_loss, best_model



In [None]:
# Run cross-validation with 5 folds and keep the values it returns.
best_accuracy, best_loss, best_model = cross_validation(num_chunks=5)
print(f"Best accuracy: {best_accuracy}, Best loss: {best_loss}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.6890743970870972, Accuracy: 0.553999125957489
Epoch 1/5
Epoch 2/5
Epoch 3/5