In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import brier_score_loss, log_loss, roc_auc_score
from matplotlib import pyplot as plt
import seaborn as sns

# Third-party imports for data handling, modelling, evaluation and plotting.

Data extraction

In [None]:
# Directory that holds every competition CSV read in this cell.
path = 'march-machine-learning-mania-2025'

# Detailed game results for the men's (M) and women's (W) competitions.
M_regular_results = pd.read_csv(f"{path}/MRegularSeasonDetailedResults.csv")
M_tourney_results = pd.read_csv(f"{path}/MNCAATourneyDetailedResults.csv")

W_regular_results = pd.read_csv(f"{path}/WRegularSeasonDetailedResults.csv")
W_tourney_results = pd.read_csv(f"{path}/WNCAATourneyDetailedResults.csv")

# Stack men's and women's rows; the original row labels are kept as-is.
regular_results = pd.concat([M_regular_results, W_regular_results])
tourney_results = pd.concat([M_tourney_results, W_tourney_results])

# Seeds and the sample submission are read through `path` like every other
# file, so the data directory only has to be changed in one place.
w_seed = pd.read_csv(f"{path}/WNCAATourneySeeds.csv")
m_seed = pd.read_csv(f"{path}/MNCAATourneySeeds.csv")
# No fillna here: a numeric placeholder is not a valid Season, Seed or TeamID,
# and missing seeds are better handled where the seed string is parsed.
seeds = pd.concat([m_seed, w_seed], axis=0)
submission_df = pd.read_csv(f"{path}/SampleSubmissionStage2.csv")

# Keep seasons from 2013 onwards. `.copy()` makes each result an independent
# frame so that adding columns later does not trigger SettingWithCopyWarning.
regular_results = regular_results.loc[regular_results["Season"] >= 2013].copy()
tourney_results = tourney_results.loc[tourney_results["Season"] >= 2013].copy()
seeds = seeds.loc[seeds["Season"] >= 2013].copy()
seeds


Unnamed: 0,Season,Seed,TeamID
1810,2013,W01,1231
1811,2013,W02,1274
1812,2013,W03,1266
1813,2013,W04,1393
1814,2013,W05,1424
...,...,...,...
1739,2025,Z12,3193
1740,2025,Z13,3251
1741,2025,Z14,3195
1742,2025,Z15,3117


In [43]:
submission_df.head()


Unnamed: 0,ID,Pred
0,2025_1101_1102,0.5
1,2025_1101_1103,0.5
2,2025_1101_1104,0.5
3,2025_1101_1105,0.5
4,2025_1101_1106,0.5


In [44]:
def extract_game_info(id_str):
    """Split a submission ID into its three integer components.

    Parameters
    ----------
    id_str : str
        Underscore-separated identifier such as ``"2025_1101_1102"``.

    Returns
    -------
    tuple of int
        ``(year, teamID1, teamID2)`` taken from the first three fields;
        any further fields are ignored.
    """
    fields = id_str.split('_')
    # Convert the fields left to right so errors surface in field order.
    return tuple(int(fields[position]) for position in range(3))

def extract_seed_value(seed_str):
    """Return the numeric seed encoded in a tournament seed string.

    The first character is skipped and the remainder is parsed as an
    integer (``"W01"`` -> 1, ``"Z12"`` -> 12). Trailing letters are removed
    before parsing, so play-in style seeds such as ``"X11b"`` yield 11
    instead of falling through to the default.

    Parameters
    ----------
    seed_str : str
        Seed identifier. Non-string values (e.g. NaN or None for a team
        without a seed) are accepted and mapped to the fallback.

    Returns
    -------
    int
        The parsed seed, or 16 for unselected teams and unparseable input.
    """
    # Missing seeds reach this function as NaN/None; use the fallback rather
    # than failing on the slice below.
    if not isinstance(seed_str, str):
        return 16

    # Drop the leading character, then any trailing letter suffix.
    numeric_part = seed_str[1:].rstrip(
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    )
    try:
        return int(numeric_part)
    # Set seed to 16 for unselected teams and errors
    except ValueError:
        return 16

# Reformat the data: split every ID into Season / TeamID1 / TeamID2 columns.
submission_df[['Season', 'TeamID1', 'TeamID2']] = submission_df['ID'].apply(extract_game_info).tolist()

# `seeds` is a filtered selection of a larger frame; build a new frame with
# the extra column instead of assigning into it (avoids SettingWithCopyWarning).
seeds = seeds.assign(SeedValue=seeds['Seed'].apply(extract_seed_value))

# Attach the seed of each side of the matchup. Both sides use the same
# merge -> rename -> drop steps, so they share one loop body.
for team_col, seed_col in (('TeamID1', 'SeedValue1'), ('TeamID2', 'SeedValue2')):
    submission_df = (
        pd.merge(submission_df, seeds[['Season', 'TeamID', 'SeedValue']],
                 left_on=['Season', team_col], right_on=['Season', 'TeamID'],
                 how='left',
                 # A team must have at most one seed per season; otherwise the
                 # left join would silently duplicate submission rows.
                 validate='many_to_one')
        .rename(columns={'SeedValue': seed_col})
        .drop(columns=['TeamID'])
    )

In [45]:
# Seed difference from TeamID1's point of view. A lower seed number is the
# better seed, so a negative value means TeamID1 is the better-seeded team.
submission_df['SeedDiff'] = submission_df['SeedValue1'] - submission_df['SeedValue2']

# Update 'Pred' column.
# Per the competition's submission format, Pred is the win probability of the
# first team in the ID (TeamID1). A better seed for TeamID1 (negative SeedDiff)
# must therefore push Pred above 0.5, hence the minus sign: each seed line of
# advantage is worth 0.03.
submission_df['Pred'] = 0.5 - (0.03 * submission_df['SeedDiff'])

# Keep only the submission columns; matchups involving a team without a seed
# have a missing Pred and fall back to 0.5.
submission_df = submission_df[['ID', 'Pred']].fillna(0.5)

# Preview the submission
submission_df

Unnamed: 0,ID,Pred
0,2025_1101_1102,0.5
1,2025_1101_1103,0.5
2,2025_1101_1104,0.5
3,2025_1101_1105,0.5
4,2025_1101_1106,0.5
...,...,...
131402,2025_3477_3479,0.5
131403,2025_3477_3480,0.5
131404,2025_3478_3479,0.5
131405,2025_3478_3480,0.5


In [46]:
# Write the baseline predictions as CSV inside the data directory; the row
# index is omitted from the file.
submission_df.to_csv(
    path_or_buf=f"{path}/baselinesubmission.csv",
    index=False,
)