In [1]:
import pandas as pd
from clean_data import clean_data

SOURCE_TYPE = 3

if SOURCE_TYPE == 1:
    test_data_path = 'data/2024_test_data.csv'
    train_and_validation_data_path = 'data/train_data.csv'
    std_out_threshold, std_dev_threshold = 0.2, 3
    train_and_validation_dataset = clean_data(train_and_validation_data_path, std_out_threshold, std_dev_threshold)
    test_dataset = clean_data(test_data_path, std_out_threshold, std_dev_threshold, delete_rows = False)
    
elif SOURCE_TYPE == 2:
    test_data_path = 'data/test_data_ver1.csv'
    train_and_validation_data_path = 'data/train_data_ver3.csv'
    train_and_validation_dataset = pd.read_csv(train_and_validation_data_path)
    test_dataset = pd.read_csv(test_data_path)

else:
    test_data_path = 'data/pruned_test_data.csv'
    train_and_validation_data_path = 'data/pruned_train_data.csv'
    train_and_validation_dataset = pd.read_csv(train_and_validation_data_path)
    test_dataset = pd.read_csv(test_data_path)

train_dataset = train_and_validation_dataset.sample(frac=0.85)
validation_dataset = train_and_validation_dataset.drop(train_dataset.index)

In [2]:
train_and_validation_dataset


Unnamed: 0,home_team_win,pitching_earned_run_avg_10RA_diff,pitching_SO_batters_faced_10RA_diff,pitcher_earned_run_avg_10RA_diff,pitcher_SO_batters_faced_10RA_diff,spread_mean_diff,batting_wpa_bat_mean_diff,pitching_earned_run_avg_mean_diff,pitching_SO_batters_faced_mean_diff,pitching_wpa_def_mean_diff,pitcher_earned_run_avg_mean_diff,pitcher_SO_batters_faced_mean_diff,pitcher_wpa_def_mean_diff
0,1,-0.400936,1.135311,-0.451786,-0.600618,0.232242,-0.017768,0.290098,-0.599530,-0.149244,0.147821,-0.615826,-0.436607
1,0,0.413540,-0.086135,0.591205,0.406982,-0.446507,-0.095579,0.873758,-1.108814,-0.647918,0.585652,0.418240,-0.774559
2,1,2.566319,0.332099,0.360122,0.520117,-0.945766,-0.765393,1.553253,-0.243544,-0.689751,0.360682,0.425323,-1.165884
3,1,0.149647,-0.133175,0.400020,-0.690280,-0.377971,0.303138,-0.184699,-0.701792,-0.514816,0.398023,-0.653129,-1.067826
4,0,0.391674,-0.698271,0.268371,-0.110808,0.153218,-0.780007,0.290276,1.269112,-0.509719,0.500372,0.111479,-0.538494
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11062,1,-0.732837,-0.301837,0.678235,-1.122570,-0.022919,-0.049515,0.946696,-1.579697,-0.783425,0.637842,-0.689791,-1.259165
11063,0,1.197505,-0.077005,1.102193,-0.798978,-1.475805,-0.918999,1.087993,-0.324736,-1.857501,1.094200,-0.821309,-2.277912
11064,1,-2.304107,0.969051,0.004379,-2.228084,-0.999681,-0.674214,1.147821,-0.018667,-0.635482,0.000800,-2.290331,-1.003008
11065,0,-0.044608,-0.542118,0.274104,0.260892,-0.807703,0.298269,0.179930,-0.468318,-0.017940,0.266375,0.266847,-0.518081


In [3]:
chosed_features = [
    'home_batting_onbase_plus_slugging_10RA', 'away_batting_onbase_plus_slugging_10RA', 
    'home_team_wins_mean', 'away_team_wins_mean', 
    'home_team_wins_skew', 'away_team_wins_skew',
    'home_batting_onbase_plus_slugging_mean', 'away_batting_onbase_plus_slugging_mean',
    'home_batting_onbase_plus_slugging_skew', 'away_batting_onbase_plus_slugging_skew', 
    'home_pitching_earned_run_avg_mean', 'away_pitching_earned_run_avg_mean', 
    'home_pitching_earned_run_avg_skew', 'away_pitching_earned_run_avg_skew', 
    'home_pitcher_earned_run_avg_10RA', 'away_pitcher_earned_run_avg_10RA',
    'home_batting_wpa_bat_mean', 'away_batting_wpa_bat_mean',
    'home_batting_wpa_bat_skew', 'away_batting_wpa_bat_skew',
    'home_batting_onbase_perc_mean', 'away_batting_onbase_perc_mean',
    'home_batting_onbase_perc_skew', 'away_batting_onbase_perc_skew',
    'home_pitching_H_batters_faced_10RA', 'away_pitching_H_batters_faced_10RA',
]

drop_features = [
    'home_team_rest', 'away_team_rest',
    'home_pitcher_rest','away_pitcher_rest',
    'is_night_game', 'home_team_errors_mean',
    'home_pitcher_BB_batters_faced_mean', 'away_pitcher_BB_batters_faced_mean',
]

TYPE_OF_CHOSE = 3
if TYPE_OF_CHOSE == 1:
    train_dataset = train_dataset[chosed_features + ['home_team_win']]
    validation_dataset = validation_dataset[chosed_features + ['home_team_win']]
    test_dataset = test_dataset[chosed_features]
elif TYPE_OF_CHOSE == 2:
    train_dataset = train_dataset.drop(drop_features, axis=1)
    validation_dataset = validation_dataset.drop(drop_features, axis=1)
    test_dataset = test_dataset.drop(drop_features, axis=1)

In [4]:
X_train = train_dataset.drop('home_team_win', axis=1)
y_train = train_dataset['home_team_win']
X_validation = validation_dataset.drop('home_team_win', axis=1)
y_validation = validation_dataset['home_team_win']

In [5]:
MODEL_TYPE = 1
"""
1: Logistic Regression
2: Random Forest
3: Gradient Boosting
4: XGBoost
5: LightGBM
6: CatBoost
7: SVR
8: KNN
9: Decision Tree
"""
BLEND_TYPE = 0
"""
0: single model
1: stacking
2: linear blending
3: any blending
4: adaboost
"""

'\n0: single model\n1: stacking\n2: linear blending\n3: any blending\n4: adaboost\n'

In [6]:
from models.logistic_regression import logistic_regression
from models.random_forest import random_forest
from models.gradient_boosting import gradient_boosting
from models.xgboost import xgboost
from models.lightgbm import lightgbm
from models.catboost import catboost
from models.svm import svm
from models.knn import knn
import models.xgb2
from models.decision_tree import decision_tree
from blending_models.stacking import stacking
from blending_models.linear_blending import linear_blending
from blending_models.any_blending import any_blending
from blending_models.adaboost import adaboost

In [7]:
if MODEL_TYPE == 1:
    accuracy, result = logistic_regression(X_train, y_train, X_validation, y_validation, test_dataset)
    print(f'logistic_regression: {accuracy}')
elif MODEL_TYPE == 2:
    accuracy, result = random_forest(X_train, y_train, X_validation, y_validation, test_dataset)
elif MODEL_TYPE == 3:
    accuracy, result = gradient_boosting(X_train, y_train, X_validation, y_validation, test_dataset)
elif MODEL_TYPE == 4:
    accuracy, result = xgboost(X_train, y_train, X_validation, y_validation, test_dataset)
elif MODEL_TYPE == 5:
    accuracy, result = lightgbm(X_train, y_train, X_validation, y_validation, test_dataset)
elif MODEL_TYPE == 6:
    accuracy, result = catboost(X_train, y_train, X_validation, y_validation, test_dataset)
elif MODEL_TYPE == 7:
    accuracy, result = svm(X_train, y_train, X_validation, y_validation, test_dataset)
elif MODEL_TYPE == 8:
    accuracy, result = knn(X_train, y_train, X_validation, y_validation, test_dataset)
elif MODEL_TYPE == 9:
    accuracy, result = decision_tree(X_train, y_train, X_validation, y_validation, test_dataset)
        

Fitting 3 folds for each of 12 candidates, totalling 36 fits
{'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}
logistic_regression: 0.5596385542168675


In [8]:
accuracy

0.5596385542168675

In [9]:
prediction = []
for i in result:
    if i > 0.5:
        prediction.append(1)
    else:
        prediction.append(0)

In [10]:


pred = pd.DataFrame({"home_team_win": result})
pred.to_csv("predictions_with_ids.csv", index=True, index_label='id')