In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as sk_metrics
import tempfile
import os
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

# Let pandas display up to 100 rows before truncating a frame.
pd.set_option('display.max_rows', 100)
# Default size for every matplotlib figure created in this notebook.
matplotlib.rcParams['figure.figsize'] = [9, 6]

print(tf.__version__)
# Fixed seed, reused below as the estimator's random_state, so that
# repeated runs give the same results.
random_seed = 22
tf.random.set_seed(random_seed)

2024-12-07 23:31:33.192310: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-07 23:31:33.266192: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-07 23:31:33.616502: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-07 23:31:33.616549: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-07 23:31:33.680960: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

2.15.0


In [619]:
# Every statistic is used twice: once for the home team and once for the away
# team. List each statistic once and expand it into the home_/away_ columns.
paired_stat_names = (
    'batting_onbase_plus_slugging_10RA',
    'team_wins_mean',
    'team_wins_skew',
    'batting_onbase_plus_slugging_mean',
    'batting_onbase_plus_slugging_skew',
    'pitching_earned_run_avg_mean',
    'pitching_earned_run_avg_skew',
    'pitcher_earned_run_avg_10RA',
    'batting_wpa_bat_mean',
    'batting_wpa_bat_skew',
    'batting_onbase_perc_mean',
    'batting_onbase_perc_skew',
    'pitching_H_batters_faced_10RA',
)

# Resulting order: home_<stat>, away_<stat> for each statistic in turn.
chosed_features = [
    side + '_' + stat_name
    for stat_name in paired_stat_names
    for side in ('home', 'away')
]

In [620]:
# CSV inputs: the test file is read later for the final predictions, the
# train/validation file is read here.
test_data_path = 'test_data_ver1.csv'
train_and_validation_data_path = 'train_data_ver3.csv'

# Keep only the selected feature columns plus the 'home_team_win' label,
# in that order.
columns_to_keep = chosed_features + ['home_team_win']
dataset = pd.read_csv(train_and_validation_data_path)[columns_to_keep]

In [621]:
def cross_validation(num_chunks: int, degree: int = 3, C: float = 0.0001, max_iter: int = 10000):
    """Run k-fold cross-validation of a polynomial logistic regression on ``dataset``.

    The module-level ``dataset`` frame is cut, in row order, into ``num_chunks``
    contiguous chunks (the last chunk also takes any remainder rows). Each chunk
    is used once as the validation set while a model is trained on all the
    other rows. Per-fold training and validation accuracy are printed.

    Args:
        num_chunks: Number of folds. Must be at least 2 and at most
            ``len(dataset)`` so that every fold has training and validation rows.
        degree: Degree of the ``PolynomialFeatures`` expansion applied to the
            feature columns.
        C: Value passed to ``LogisticRegression(C=...)``. In scikit-learn this
            is the *inverse* of the regularization strength, so smaller values
            regularize more strongly.
        max_iter: Iteration cap passed to ``LogisticRegression``.

    Returns:
        A tuple ``(best_accuracy, best_loss, best_model)`` describing the fold
        with the highest validation accuracy: that accuracy, the log loss of
        the same model on the same validation chunk, and the fitted
        ``LogisticRegression``. The model is the one trained on that fold's
        training rows; it is not refit on the full dataset.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 2 or larger than the
            number of rows in ``dataset``.
    """
    num_rows = len(dataset)
    if num_chunks < 2 or num_chunks > num_rows:
        raise ValueError(
            f"num_chunks must be between 2 and the number of rows ({num_rows}), got {num_chunks}"
        )

    chunk_size = num_rows // num_chunks
    # Positional [start, stop) bounds of every validation chunk. The last
    # chunk is extended to the end so that remainder rows are not lost.
    fold_bounds = [(i * chunk_size, (i + 1) * chunk_size) for i in range(num_chunks)]
    fold_bounds[-1] = (fold_bounds[-1][0], num_rows)

    best_accuracy, best_loss, best_model = -1, -1, None

    for start, stop in fold_bounds:
        # Split by position rather than by index label so the split stays
        # correct even if the frame's index labels are not unique.
        validation_dataset = dataset.iloc[start:stop]
        train_dataset = pd.concat([dataset.iloc[:start], dataset.iloc[stop:]])

        X_train = train_dataset.drop('home_team_win', axis=1)
        y_train = train_dataset['home_team_win']
        X_validation = validation_dataset.drop('home_team_win', axis=1)
        y_validation = validation_dataset['home_team_win']

        # The expansion is fitted on the training rows only and then applied
        # to the validation rows. The features are expanded as-is (no scaling).
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_train_poly = poly.fit_transform(X_train)
        X_validation_poly = poly.transform(X_validation)

        # Perform logistic regression
        log_reg = LogisticRegression(C=C, max_iter=max_iter, random_state=random_seed)
        log_reg.fit(X_train_poly, y_train)

        # Make predictions
        y_train_pred = log_reg.predict(X_train_poly)
        y_validation_pred = log_reg.predict(X_validation_poly)

        # Evaluate the model
        train_acc = accuracy_score(y_train, y_train_pred)
        validation_acc = accuracy_score(y_validation, y_validation_pred)

        print("Training Accuracy:", train_acc)
        # This line reports the accuracy on the held-out validation chunk.
        print("Testing Accuracy:", validation_acc)

        # best_accuracy starts at -1, so the first fold always wins here.
        if validation_acc > best_accuracy:
            best_accuracy = validation_acc
            best_model = log_reg
            # Real log loss of the selected model. `labels` is passed so the
            # call also works if a validation chunk contains a single class.
            best_loss = sk_metrics.log_loss(
                y_validation,
                log_reg.predict_proba(X_validation_poly),
                labels=log_reg.classes_,
            )

    return best_accuracy, best_loss, best_model

In [622]:
# Run the cross-validation with 10 chunks; the returned model is reused
# further down to predict on the test data.
cv_outcome = cross_validation(10)
best_accuracy, best_loss, best_model = cv_outcome
print(best_accuracy)

Training Accuracy: 0.6327677943981528
Testing Accuracy: 0.5162748643761302
Training Accuracy: 0.6311615299668708
Testing Accuracy: 0.5379746835443038
Training Accuracy: 0.6317638791286015
Testing Accuracy: 0.5280289330922242
Training Accuracy: 0.6269450858347555
Testing Accuracy: 0.5497287522603979
Training Accuracy: 0.6315630960746913
Testing Accuracy: 0.5397830018083183
Training Accuracy: 0.6282501756851722
Testing Accuracy: 0.5415913200723327
Training Accuracy: 0.627346651942576
Testing Accuracy: 0.5488245931283906
Training Accuracy: 0.6283505672121273
Testing Accuracy: 0.566003616636528
Training Accuracy: 0.6297560485894991
Testing Accuracy: 0.5343580470162749
Training Accuracy: 0.6290938316254772
Testing Accuracy: 0.5292003593890386
0.566003616636528


In [623]:
# Predict on the test file with the model returned by cross_validation and
# write the predictions to a CSV file.
test_data_path = 'test_data_ver1.csv'

test_dataset = pd.read_csv(test_data_path)
test_dataset = test_dataset[chosed_features]

X_test = test_dataset

# Fit the polynomial expansion on the training features and only *transform*
# the test features: preprocessing is never fitted on test data, and
# scikit-learn checks that the test columns match the training columns.
# The degree must match the one used to train best_model (3 in cross_validation).
poly = PolynomialFeatures(degree=3, include_bias=False)
poly.fit(dataset[chosed_features])
X_test_poly = poly.transform(X_test)

# Make predictions
y_pred = best_model.predict(X_test_poly)

# Store the predictions as booleans (True where the predicted label equals 1).
results = pd.DataFrame({"home_team_win": y_pred == 1})
# NOTE(review): index=True writes the positional row index in an unnamed first
# column; confirm this matches the identifier the consumer of this file expects.
results.to_csv("predictions_with_ids.csv", index=True)

