In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, TargetEncoder  # type: ignore
from sklearn.model_selection import KFold
from itertools import product

from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

### LGBM

#### Done and submitted: 
1. Every July is held out for test
2. dropped 'date' column
3. target encoding for categorical features
4. label encoding for 'is_night_game' since it is binary
5. boosting models works well
6. choosed LGBM because it is faster

#### Tried but failed:
1. SVC seems to fail (best score around 0.52)
2. ensembles of RF, SVC, CAT, LGBM together does not help ?
3. train with emphasis on later months(by simply duplicated them) does not help

In [4]:
# Load data
file_path = "data/task1/train_data.csv"
data = pd.read_csv(file_path)
data.columns = data.columns.str.strip()
data['date'] = pd.to_datetime(data['date'], errors='coerce')

TARGET = 'home_team_win'

train_data = data[data['date'].dt.month != 7]  
hold_out = data[data['date'].dt.month == 7]  

X_hold_out = hold_out.drop(columns=[TARGET])
y_hold_out = hold_out[TARGET]

X_train = train_data.drop(columns=[TARGET])
y_train = train_data[TARGET]

# transformation function
def transform(X_tr, y_tr, X_vl, remove_date=True, fill_na=True):
    X_train, y_train, X_val = X_tr.copy(), y_tr.copy(), X_vl.copy()

    X_train.drop(columns=['home_team_season', 'away_team_season'], inplace=True)
    X_val.drop(columns=['home_team_season', 'away_team_season'], inplace=True)

    if remove_date:
        if 'date' in X_train.columns:
            X_train.drop(columns=['date'], inplace=True)
        if 'date' in X_val.columns:
            X_val.drop(columns=['date'], inplace=True)

    # get categorical and numerical columns
    cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    cat_cols.remove('is_night_game')
    num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
    for i in ['id', 'season']: num_cols.remove(i)

    label_encoder = LabelEncoder()
    X_train['is_night_game'] = label_encoder.fit_transform(X_train['is_night_game'])
    X_val['is_night_game'] = label_encoder.transform(X_val['is_night_game'])
    
    encoder = TargetEncoder(random_state=42)
    X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols], y_train)
    X_val[cat_cols] = encoder.transform(X_val[cat_cols])

    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_val[num_cols] = scaler.transform(X_val[num_cols])

    if fill_na:
        X_train.fillna(X_train.median(), inplace=True)
        X_val.fillna(X_val.median(), inplace=True)

    return X_train, y_train, X_val

"""cross_validation performs cross validation after feature transformation and returns the average accuracy
make_iter gives all combinations of hyperparameters"""


def cross_validation(model, params, X, y, num_folds=5):
    kf_scores = []
    for train, val in KFold(n_splits=num_folds, shuffle=True).split(X):
        X_train, y_train = X.iloc[train], y.iloc[train]
        X_val, y_val = X.iloc[val], y.iloc[val]

        X_train, y_train, X_val = transform(X_train, y_train, X_val, remove_date=True, fill_na=True)

        model_ = model(**params)
        model_.fit(X_train, y_train)
        preds = model_.predict(X_val)
        kf_scores.append(np.mean(preds == y_val))
    return np.mean(kf_scores)

make_iter = lambda p: [dict(zip(p.keys(), combination)) for combination in product(*p.values())]

### Cross Validation

In [5]:
params_lgbm = {
    'n_estimators': [70, 75, 80],
    'max_depth': [8, 9, 10],
    'learning_rate': [0.037, 0.04, 0.043],
    'num_leaves': [15, 17, 20],
    'verbose': [-1]
}

num_folds = 5
result = []

for param in make_iter(params_lgbm):
    kf_scores = []
    for train, val in KFold(n_splits=num_folds, shuffle=True).split(train_data):
        sub_train_data = train_data.iloc[train]  
        val_data = train_data.iloc[val]

        X_train, y_train = sub_train_data.drop(columns=[TARGET]), sub_train_data[TARGET]
        X_val, y_val = val_data.drop(columns=[TARGET]), val_data[TARGET]

        X_train_t, y_train, X_val_t = transform(X_train, y_train, X_val, remove_date=True, fill_na=True)

        model = LGBMClassifier(**param)
        score = model.fit(X_train_t, y_train)
        preds = model.predict(X_val_t)
        kf_scores.append(np.mean(preds == y_val))

    print(f'{param}:', np.mean(kf_scores))
    result.append((np.mean(kf_scores), param))

{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.037, 'num_leaves': 15, 'verbose': -1}: 0.5474330698497948
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.037, 'num_leaves': 17, 'verbose': -1}: 0.5507517277954088
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.037, 'num_leaves': 20, 'verbose': -1}: 0.5583313691323001
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.04, 'num_leaves': 15, 'verbose': -1}: 0.547672351625657
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.04, 'num_leaves': 17, 'verbose': -1}: 0.5528851307175787
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.04, 'num_leaves': 20, 'verbose': -1}: 0.5486202189242352
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.043, 'num_leaves': 15, 'verbose': -1}: 0.5525272252293205
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.043, 'num_leaves': 17, 'verbose': -1}: 0.5498052634975685
{'n_estimators': 70, 'max_depth': 8, 'learning_rate': 0.043, 'num_leaves': 20, 'verbose': -1

KeyboardInterrupt: 

In [None]:
result.sort(key=lambda x: x[0], reverse=True)
for i in range(15):
    print(result[i])

### Best params:

{'n_estimators': 75, 'max_depth': 8, 'learning_rate': 0.04, 'num_leaves': 15, 'verbose': -1}

**Result**

Validation accuracy 0.5586 \
Test accuracy 0.5671


In [6]:
# best param
val_score_params = [
    (0.5585682658069079, {'n_estimators': 75, 'max_depth': 8, 'learning_rate': 0.037, 'num_leaves': 17, 'verbose': -1}),
    ]

for val_score, params in val_score_params:
    lgbm = LGBMClassifier(**params)
    X_train, y_train = train_data.drop(columns=[TARGET]), train_data[TARGET]
    X_train_t, y_train_t, X_hold_out_t = transform(X_train, y_train, X_hold_out, remove_date=True, fill_na=True)
    lgbm.fit(X_train_t, y_train_t)
    preds = lgbm.predict(X_hold_out_t)
    print(f'Validation accuracy', val_score)
    print(f'Test accuracy', np.mean(preds == y_hold_out))
    print('\n')

Validation accuracy 0.5585682658069079
Test accuracy 0.5670731707317073




### Try First Submission

In [8]:
# Load data
test_file_path = "data/task1/same_season_test_data.csv"
test_data = pd.read_csv(test_file_path)
test_data.columns = test_data.columns.str.strip()

X_test = test_data

X_train_final = data.drop(columns=[TARGET])
y_train_final = data[TARGET]

X_train_final_t, y_train_final_t, X_test_t = transform(X_train_final, y_train_final, X_test, remove_date=True, fill_na=True)

lgbm = LGBMClassifier(n_estimators=75, max_depth=8, learning_rate=0.037, num_leaves=17, verbose=-1)
lgbm.fit(X_train_final_t, y_train_final_t)
preds = lgbm.predict(X_test_t)
sub_df = pd.DataFrame({'id': test_data['id'], 'home_team_win': preds})
# sub_df.to_csv('submission.csv', index=False)