In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/html2024-fall-final-project-stage-2/README.md
/kaggle/input/html2024-fall-final-project-stage-2/train_data.csv
/kaggle/input/html2024-fall-final-project-stage-2/2024_test_data.csv
/kaggle/input/html2024-fall-final-project-stage-2/2024_sample_submission.csv


In [28]:
# fundamental tool
import numpy as np
import pandas as pd

# model and data processing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [29]:
# read training data
train_data = pd.read_csv('/kaggle/input/html2024-fall-final-project-stage-2/train_data.csv')

# read testing data
test_data = pd.read_csv('/kaggle/input/html2024-fall-final-project-stage-2/2024_test_data.csv')

# read testing sample
sample_submission = pd.read_csv('/kaggle/input/html2024-fall-final-project-stage-2/2024_sample_submission.csv')

In [30]:
print("Training Data Shape:", train_data.shape)
print("Test Data Shape:", test_data.shape)
print("Sample Submission Shape:", sample_submission.shape)

Training Data Shape: (11067, 167)
Test Data Shape: (2428, 165)
Sample Submission Shape: (2428, 2)


In [31]:
print("Training Data Columns:\n", train_data.dtypes)
print("Test Data Columns:\n", test_data.dtypes)

Training Data Columns:
 id                                        int64
home_team_abbr                           object
away_team_abbr                           object
date                                     object
is_night_game                            object
                                         ...   
away_pitcher_leverage_index_avg_std     float64
away_pitcher_leverage_index_avg_skew    float64
away_pitcher_wpa_def_mean               float64
away_pitcher_wpa_def_std                float64
away_pitcher_wpa_def_skew               float64
Length: 167, dtype: object
Test Data Columns:
 id                                        int64
home_team_abbr                           object
away_team_abbr                           object
is_night_game                            object
home_pitcher                             object
                                         ...   
away_pitcher_leverage_index_avg_std     float64
away_pitcher_leverage_index_avg_skew    float64
away_pitcher_wpa_

In [32]:
print("Missing Values in Train Data:\n", train_data.isnull().sum())
print("Missing Values in Test Data:\n", test_data.isnull().sum())

Missing Values in Train Data:
 id                                         0
home_team_abbr                             0
away_team_abbr                             0
date                                       0
is_night_game                            553
                                        ... 
away_pitcher_leverage_index_avg_std     1974
away_pitcher_leverage_index_avg_skew    2646
away_pitcher_wpa_def_mean               1296
away_pitcher_wpa_def_std                1960
away_pitcher_wpa_def_skew               2647
Length: 167, dtype: int64
Missing Values in Test Data:
 id                                        0
home_team_abbr                            0
away_team_abbr                            0
is_night_game                           121
home_pitcher                            121
                                       ... 
away_pitcher_leverage_index_avg_std     368
away_pitcher_leverage_index_avg_skew    503
away_pitcher_wpa_def_mean               252
away_pitcher_wpa_def_s

In [33]:
columns_to_drop = ['id', 'date', 'home_pitcher', 'away_pitcher','home_team_season','season','away_team_season']

# drop some column
train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

print("Train Data Shape after Dropping Columns:", train_data.shape)
print("Test Data Shape after Dropping Columns:", test_data.shape)

Train Data Shape after Dropping Columns: (11067, 160)
Test Data Shape after Dropping Columns: (2428, 159)


In [34]:
# 1. missing rate checking
train_missing = train_data.isnull().sum()
test_missing = test_data.isnull().sum()

# 2. delete the column of high missing rate（>50%）
high_missing_columns = train_missing[train_missing > 0.5 * len(train_data)].index
train_data.drop(columns=high_missing_columns, inplace=True)
test_data.drop(columns=high_missing_columns, inplace=True)

# 3. fill numerical columns
numerical_columns = train_data.select_dtypes(include=['number']).columns
train_data[numerical_columns] = train_data[numerical_columns].fillna(train_data[numerical_columns].mean())
test_data[numerical_columns] = test_data[numerical_columns].fillna(train_data[numerical_columns].mean())

# 4. fill objective columns
categorical_columns = train_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    if col in test_data.columns:
        test_data[col].fillna(train_data[col].mode()[0], inplace=True)

print("Remaining missing values in training data:", train_data.isnull().sum().sum())
print("Remaining missing values in testing data:", test_data.isnull().sum().sum())


Remaining missing values in training data: 0
Remaining missing values in testing data: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna(train_data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(train_data[col].mode()[0], inplace=True)
  train_data[col].fillna(train_data[col].mode()[0], inplace=True)
  test_data[col].fillna(train_data[col]

In [35]:
# One-Hot Encoding 
categorical_columns = train_data.select_dtypes(include=['object']).columns
train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)

In [36]:
# create rest_day feature
train_data['rest_days_diff'] = train_data['home_team_rest'] - train_data['away_team_rest']
test_data['rest_days_diff'] = test_data['home_team_rest'] - test_data['away_team_rest']

# create batting_avg_diff feature
train_data['batting_avg_diff'] = train_data['home_batting_batting_avg_10RA'] - train_data['away_batting_batting_avg_10RA']
test_data['batting_avg_diff'] = test_data['home_batting_batting_avg_10RA'] - test_data['away_batting_batting_avg_10RA']

In [37]:
# take out th label
X = train_data.drop(columns=['home_team_win'])  # 'home_team_win' 是目標
y = train_data['home_team_win']

# watch the shape
print("Feature Data Shape:", X.shape)
print("Target Data Shape:", y.shape)

Feature Data Shape: (11067, 217)
Target Data Shape: (11067,)


In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# First Random Forest Model used to test the best max_depth
rf_model = RandomForestClassifier(
    n_estimators=1750,  # number of tree
    max_depth=20,    # max_depth
    random_state=11    # random seedd
)


In [39]:
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')

# 打印每一折的評估結果和平均準確率
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())
rf_model.fit(X, y)

y_pred_test = rf_model.predict(test_data)
print("Test Predictions (first 10):", y_pred_test[:10])

sample_submission['home_team_win'] = y_pred_test  # 替換目標列的值
sample_submission.to_csv('rf2_1750_10.csv', index=False)  # 保存為 CSV 文件

print("Submission file saved as 'rf2_1750_10.csv'.")

Cross-validation scores: [0.54426378 0.5501355  0.53637596 0.54631722 0.54225034]
Mean cross-validation score: 0.5438685585831607
Test Predictions (first 10): [ True  True  True False False  True False  True False False]
Submission file saved as 'rf2_1750_10.csv'.
