In [1]:
# Make sure the required libraries are installed (IPython shell magic; ray is pinned to 2.10.0).
!pip install --upgrade scikit-learn autogluon ray==2.10.0 ipywidgets

# Check the installed scikit-learn version.
import sklearn
print("scikit-learn version:", sklearn.__version__)

# Import libraries.
import glob
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, brier_score_loss, mean_squared_error, roc_curve, auc
from sklearn.ensemble import RandomForestRegressor
from autogluon.tabular import TabularPredictor
# NOTE(review): autogluon is upgraded a second time here, after it has already
# been installed and imported above — this looks redundant; confirm before removing.
!pip install --upgrade autogluon
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
print("Successfully import")


class TournamentPredictor:
    def __init__(self, data_dir):
        """Create an empty predictor; nothing is read from disk here.

        Args:
            data_dir: Glob pattern handed to ``glob.glob`` by ``load_data()``
                to locate the competition CSV files.
        """
        self.data_path = data_dir

        # Tables filled in by load_data().
        self.data = None            # dict: file stem -> raw DataFrame
        self.games = None           # detailed regular-season + tournament games
        self.rank = None            # POM ordinal ranks
        self.efficiency1 = None     # efficiency features, regular season
        self.efficiency2 = None     # efficiency features, tournament
        self.rate1 = None           # shooting/rebound rates, regular season
        self.rate2 = None           # shooting/rebound rates, tournament
        self.situation = None       # top-25 record + major-conference flag
        self.seeds = None           # kept for backward compatibility; load_data() stores seeds in self.seed
        self.seed = None            # tournament seeds (attribute actually written by load_data())
        self.final = None           # one feature row per (Season, TeamID)
        self.train = None           # regular-season matchups with features
        self.sub = None             # submission matchups with features
        self.tourney_games = None   # tournament matchups with features
        self.result = None          # tournament results with a binary outcome column

        # Not assigned anywhere in the visible methods; kept so existing
        # attribute access keeps working.
        self.teams = None
        self.gb = None
        self.test = None
        self.col = None

        # Models.
        self.model1 = None  # score-difference regressor set by train_model()
        self.model2 = None
        self.calibration_model = None

        # Preprocessing helpers (constructed here, not used by the visible methods).
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()

    def load_data(self):
        files = glob.glob(self.data_path)
        self.data = {p.split('/')[-1].split('.')[0]: pd.read_csv(p, encoding='latin-1') for p in files}

        max_ = self.data['MMasseyOrdinals'][self.data['MMasseyOrdinals']['SystemName'] == "POM"]['RankingDayNum'].max()
        _2025 = self.data['MMasseyOrdinals'][(self.data['MMasseyOrdinals']['SystemName'] == "POM") & (self.data['MMasseyOrdinals']['Season'] == 2025)]['RankingDayNum'].max()
        __2025 = self.data['MMasseyOrdinals'][(self.data['MMasseyOrdinals']['SystemName'] == "POM") & (self.data['MMasseyOrdinals']['Season'] == 2025)][self.data['MMasseyOrdinals']['RankingDayNum'] == _2025]
        self.data['MMasseyOrdinals'] = self.data['MMasseyOrdinals'][self.data['MMasseyOrdinals']['SystemName'] == "POM"][self.data['MMasseyOrdinals']['RankingDayNum'] == max_]
        self.data['MMasseyOrdinals'] = self.data['MMasseyOrdinals'][['Season','TeamID','OrdinalRank']]
        __2025 = __2025[['Season','TeamID','OrdinalRank']]
        self.data['MMasseyOrdinals'] = pd.concat([self.data['MMasseyOrdinals'],__2025],axis = 0)
        self.rank = self.data['MMasseyOrdinals']

        season_cresults = pd.concat([self.data['MRegularSeasonCompactResults'], self.data['WRegularSeasonCompactResults']])
        season_dresults = pd.concat([self.data['MRegularSeasonDetailedResults'], self.data['WRegularSeasonDetailedResults']])
        tourney_cresults = pd.concat([self.data['MNCAATourneyCompactResults'], self.data['WNCAATourneyCompactResults']])
        tourney_dresults = pd.concat([self.data['MNCAATourneyDetailedResults'], self.data['WNCAATourneyDetailedResults']])
        season_cresults['ST'] = 'S'
        season_dresults['ST'] = 'S'
        tourney_cresults['ST'] = 'T'
        tourney_dresults['ST'] = 'T'
        

        self.games = pd.concat((season_dresults, tourney_dresults), axis=0, ignore_index=True)
        self.games['WLoc'] = self.games['WLoc'].map({'A': 1, 'H': 2, 'N': 3})
        print('games:',self.games)

        # 假设 self.games 是你的 DataFrame
        # 提取胜场数据
        win_data = self.games[['Season', 'WTeamID', 'WScore', 'LScore', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 
                               'LTeamID', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF','ST']]
        win_data = win_data.rename(columns={
            'WTeamID': 'TeamID',
            'WScore': 'Score',
            'LScore': 'OpponentScore',  # 对手得分（即失分）
            'LTeamID': 'OpponentTeamID',  # 对手球队ID
            'WFGM': 'FGM',  # 球队自身数据
            'WFGA': 'FGA',
            'WFGM3': 'FGM3',
            'WFGA3': 'FGA3',
            'WFTM': 'FTM',
            'WFTA': 'FTA',
            'WOR': 'OR',
            'WDR': 'DR',
            'WAst': 'Ast',
            'WTO': 'TO',
            'WStl': 'Stl',
            'WBlk': 'Blk',
            'WPF': 'PF',
            'LFGM': 'OFGM',  # 对手数据列名前加 'O'
            'LFGA': 'OFGA',
            'LFGM3': 'OFGM3',
            'LFGA3': 'OFGA3',
            'LFTM': 'OFTM',
            'LFTA': 'OFTA',
            'LOR': 'OOR',
            'LDR': 'ODR',
            'LAst': 'OAst',
            'LTO': 'OTO',
            'LStl': 'OStl',
            'LBlk': 'OBlk',
            'LPF': 'OPF'
        })
    
        # 提取负场数据
        loss_data = self.games[['Season', 'LTeamID', 'LScore', 'WScore', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
                                'WTeamID', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF','ST']]
        loss_data = loss_data.rename(columns={
            'LTeamID': 'TeamID',
            'LScore': 'Score',
            'WScore': 'OpponentScore',  # 对手得分（即失分）
            'WTeamID': 'OpponentTeamID',  # 对手球队ID
            'LFGM': 'FGM',  # 球队自身数据
            'LFGA': 'FGA',
            'LFGM3': 'FGM3',
            'LFGA3': 'FGA3',
            'LFTM': 'FTM',
            'LFTA': 'FTA',
            'LOR': 'OR',
            'LDR': 'DR',
            'LAst': 'Ast',
            'LTO': 'TO',
            'LStl': 'Stl',
            'LBlk': 'Blk',
            'LPF': 'PF',
            'WFGM': 'OFGM',  # 对手数据列名前加 'O'
            'WFGA': 'OFGA',
            'WFGM3': 'OFGM3',
            'WFGA3': 'OFGA3',
            'WFTM': 'OFTM',
            'WFTA': 'OFTA',
            'WOR': 'OOR',
            'WDR': 'ODR',
            'WAst': 'OAst',
            'WTO': 'OTO',
            'WStl': 'OStl',
            'WBlk': 'OBlk',
            'WPF': 'OPF'
        })
        
        # 合并胜场和负场数据
        all_data = pd.concat([win_data, loss_data])
        
        # 按 Season 和 TeamID 分组，计算球队自身数据的平均值
        team_avg_data = all_data.groupby(['Season', 'TeamID', 'ST'])[[
            'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF'
        ]].mean().reset_index()

        team_sum_data = all_data.groupby(['Season', 'TeamID', 'ST'])[[
            'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF'
        ]].sum().reset_index()

        game_count = all_data.groupby(['Season', 'TeamID', 'ST']).size().reset_index(name='Gamecount')
        
        # 按 Season 和 TeamID 分组，计算对手数据的平均值
        opponent_avg_data = all_data.groupby(['Season', 'TeamID','ST'])[[
            'OpponentScore', 'OFGM', 'OFGA', 'OFGM3', 'OFGA3', 'OFTM', 'OFTA', 'OOR', 'ODR', 'OAst', 'OTO', 'OStl', 'OBlk', 'OPF'
        ]].mean().reset_index()
        opponent_sum_data = all_data.groupby(['Season', 'TeamID','ST'])[[
            'OpponentScore', 'OFGM', 'OFGA', 'OFGM3', 'OFGA3', 'OFTM', 'OFTA', 'OOR', 'ODR', 'OAst', 'OTO', 'OStl', 'OBlk', 'OPF'
        ]].sum().reset_index()
    
        # 合并球队自身数据和对手数据
        average_data = pd.merge(team_avg_data, opponent_avg_data,  on=['Season', 'TeamID','ST'], how='left')
        sum_data = pd.merge(team_sum_data, opponent_sum_data, on=['Season', 'TeamID','ST'], how='left')
        sum_data = pd.merge(sum_data, game_count, on=['Season', 'TeamID','ST'], how='left')

        average_data['Off Possession'] = average_data['FGA']-average_data['OR']+average_data['TO']+0.44*average_data['FTA']
        average_data['Def Possession'] = average_data['OFGA']-average_data['OOR']+average_data['OTO']+0.44*average_data['OFTA']
        average_data['Off Efficiency'] = average_data['Score']/(0.96*(average_data['Off Possession']))*100
        average_data['Def Efficiency'] = average_data['OpponentScore']/(0.96*(average_data['Def Possession']))*100
        average_data['Net Efficiency'] = average_data['Off Efficiency'] -  average_data['Def Efficiency'] 
        average_data['3 Efficiency'] = 3*average_data['FGM3']/average_data['Off Possession']*100
        average_data['Point Diff'] = average_data['Score']-average_data['OpponentScore']
        self.efficiency1 = average_data[average_data['ST']=='S'][['Season','TeamID','Off Efficiency','Def Efficiency','Net Efficiency','3 Efficiency','Point Diff']]
        self.efficiency2 = average_data[average_data['ST']=='T'][['Season','TeamID','Off Efficiency','Def Efficiency','Net Efficiency','3 Efficiency','Point Diff']]
    
        sum_data['FG%'] = sum_data['FGM']/sum_data['FGA']
        sum_data['FG3%'] = sum_data['FGM3']/sum_data['FGA3']
        sum_data['FT%'] = sum_data['FTM']/sum_data['FTA']
        sum_data['TS%'] = sum_data['Score']/(2*(sum_data['FGA']+0.44*sum_data['FTA']))
        sum_data['eFG%'] = (sum_data['FGM']+sum_data['FGM3']*0.5)/sum_data['FGA']
        sum_data['DRB%'] = (sum_data['OR']+sum_data['DR'])/(sum_data['OR']+sum_data['DR']+sum_data['OOR']+sum_data['ODR'])
        self.rate1 = sum_data[sum_data['ST']=='S'][['Season','TeamID','TS%','DRB%']]
        self.rate2 = sum_data[sum_data['ST']=='T'][['Season','TeamID','TS%','DRB%']]

        def calculate_top25_win_rate(self):
                # 获取POM排名前25的球队
                top25_teams = self.data['MMasseyOrdinals'][self.data['MMasseyOrdinals']['OrdinalRank'] <= 25]
                top25_teams = top25_teams[['Season', 'TeamID']]
            
                # 筛选出所有涉及POM排名前25球队的比赛
                # 情况1：POM排名前25球队作为胜队
                top25_wins = pd.merge(self.games, top25_teams, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'], how='inner')
                # 情况2：POM排名前25球队作为负队
                top25_losses = pd.merge(self.games, top25_teams, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], how='inner')
            
                # 合并两种情况
                top25_games = pd.concat([top25_wins, top25_losses]).drop_duplicates()
            
                # 计算每支球队在每个赛季面对POM排名前25球队的胜场和负场
                wins = top25_games.groupby(['Season', 'WTeamID']).size().reset_index(name='Wins')
                losses = top25_games.groupby(['Season', 'LTeamID']).size().reset_index(name='Losses')
               
               # 合并 wins 和 losses
                win_rate = pd.merge(wins, losses, left_on=['Season', 'WTeamID'], right_on=['Season', 'LTeamID'], how='outer', suffixes=('_win', '_loss'))
                
                # 整合 WTeamID 和 LTeamID 为一个 TeamID 列
                win_rate['TeamID'] = win_rate['WTeamID'].combine_first(win_rate['LTeamID'])
                
                # 选择需要的列
                win_rate = win_rate[['Season', 'TeamID', 'Wins', 'Losses']].fillna(0)
            
                # 计算胜率
                win_rate['WinRate'] = win_rate.apply(
                lambda row: row['Wins'] / (row['Wins'] + row['Losses']) if (row['Wins'] + row['Losses']) != 0 else 0,axis=1)
                win_rate = win_rate.drop(columns=['Losses'])
                return win_rate
        
        def is_in_major_conference(self):
                # 读取联盟数据
                team_conferences = pd.concat([self.data['MTeamConferences'], self.data['WTeamConferences']])
                major_conferences = ['sec', 'big_ten', 'acc', 'big_twelve']  # BIG Ten的缩写是B10
        
                # 判断球队是否属于四大联盟
                team_conferences['IsMajor'] = team_conferences['ConfAbbrev'].isin(major_conferences)
                team_conferences = team_conferences[team_conferences['Season']>=2003][['Season', 'TeamID', 'IsMajor']].astype(int)
        
                return team_conferences
        self.situation = pd.merge(calculate_top25_win_rate(self),is_in_major_conference(self),on=['Season','TeamID'],how='outer').fillna(0)
        print(self.situation)

        seeds_df = pd.concat([self.data['MNCAATourneySeeds'], self.data['WNCAATourneySeeds']])
        seeds_df['Seed'] = seeds_df['Seed'].str[1:3].astype(int)
        self.seed = seeds_df[['Season','TeamID','Seed']].fillna(32)
        self.seed = self.seed[self.seed['Season']>=2003]
        print(self.seed)

        final = pd.merge(self.rank,self.efficiency1,on=['Season','TeamID'],how='outer')
        final.loc[final['TeamID'] >= 2000, 'OrdinalRank'] = final.loc[final['TeamID'] >= 2000, 'OrdinalRank'].fillna(100)
        final = pd.merge(final,self.seed,on = ['Season','TeamID'],how = 'outer')
        final['Seed'] = final['Seed'].fillna(64)
        final = pd.merge(final,self.rate1,on=['Season','TeamID'])
        final = pd.merge(final,self.situation,on=['Season','TeamID'])
        final['TeamID'] = final['TeamID'].astype(int)
        self.final = final
        EloRating_men = pd.read_csv('/kaggle/input/elorating-rank/EloRating_mens_03.csv')
        EloRating_women = pd.read_csv('/kaggle/input/elorating-rank/EloRating_womens_03.csv')
        EloRating =  pd.concat([EloRating_men, EloRating_women], axis=0)
        self.final = pd.merge(self.final,EloRating,on=['Season','TeamID'])
        self.final.drop(columns='TeamName', inplace=True)
        self.final['Rating'].fillna(30)
        print('final:',self.final)

        self.sub = self.data['SampleSubmissionStage2']
        self.sub['Season'] = self.sub['ID'].map(lambda x: x.split('_')[0]).astype(int)
        self.sub['Team1'] = self.sub['ID'].map(lambda x: x.split('_')[1]).astype(int)
        self.sub['Team2'] = self.sub['ID'].map(lambda x: x.split('_')[2]).astype(int)
        print(self.sub)

        self.train = pd.concat([self.data['MRegularSeasonCompactResults'], self.data['WRegularSeasonCompactResults']])
        self.train['Team1'] = self.train[['WTeamID', 'LTeamID']].min(axis=1)
        self.train['Team2'] = self.train[['WTeamID', 'LTeamID']].max(axis=1)
        self.train['score_diff'] = self.train['WScore']-self.train['LScore']
        self.train['Score_Diff'] = self.train.apply(lambda r: -r['score_diff'] if r['Team1'] == r['LTeamID'] else r['score_diff'], axis=1).astype(float)
        self.train = self.train[self.train['Season']>=2003][['Season','Team1','Team2','Score_Diff']]
        print(self.train)

        
        self.train = pd.merge(self.train,self.final,left_on=['Season','Team1'],right_on=['Season','TeamID'])
        self.train = pd.merge(self.train,self.final,left_on=['Season','Team2'],right_on=['Season','TeamID'], suffixes=('_team1' , '_team2'))
        self.train = self.train.drop(columns=['TeamID_team1', 'TeamID_team2'])
        self.train['Off Efficiency_diff'] = self.train['Off Efficiency_team1']-self.train['Off Efficiency_team2']
        self.train['Def Efficiency_diff'] = self.train['Def Efficiency_team1']-self.train['Def Efficiency_team2']
        self.train['Net Efficiency_diff'] = self.train['Net Efficiency_team1']-self.train['Net Efficiency_team2']
        self.train['3 Efficiency_diff'] = self.train['3 Efficiency_team1']-self.train['3 Efficiency_team2']
        self.train['DRB%_diff'] = self.train['DRB%_team1']-self.train['DRB%_team2']
        self.train['Seed_diff'] = self.train['Seed_team1']-self.train['Seed_team2']
        print(self.train)
        print("Data loading and preprocessing completed.")
        self.sub = pd.merge(self.sub,self.final,left_on=['Season','Team1'],right_on=['Season','TeamID'])
        self.sub = pd.merge(self.sub,self.final,left_on=['Season','Team2'],right_on=['Season','TeamID'], suffixes=('_team1' , '_team2'))
        self.sub = self.sub.drop(columns=['TeamID_team1', 'TeamID_team2'])
        self.sub['Off Efficiency_diff'] = self.sub['Off Efficiency_team1']-self.sub['Off Efficiency_team2']
        self.sub['Def Efficiency_diff'] = self.sub['Def Efficiency_team1']-self.sub['Def Efficiency_team2']
        self.sub['Net Efficiency_diff'] = self.sub['Net Efficiency_team1']-self.sub['Net Efficiency_team2']
        self.sub['3 Efficiency_diff'] = self.sub['3 Efficiency_team1']-self.sub['3 Efficiency_team2']
        self.sub['DRB%_diff'] = self.sub['DRB%_team1']-self.sub['DRB%_team2']
        self.sub['Seed_diff'] = self.sub['Seed_team1']-self.sub['Seed_team2']
        train_data = self.train[self.train['Season']<=2024]
        print(self.sub)

               # 提取锦标赛数据
        tourney_games = pd.merge(self.efficiency2,self.rate2,on=['Season','TeamID'])
        tourney_games = pd.merge(tourney_games,self.seed,on=['Season','TeamID'])
        self.tourney_games = pd.concat([self.data['MNCAATourneyCompactResults'], self.data['WNCAATourneyCompactResults']])
        self.tourney_games['WLoc'] = self.tourney_games['WLoc'].map({'A': 1, 'H': 2, 'N': 3})
        
        # 处理锦标赛数据，生成特征
        self.tourney_games['Team1'] = self.tourney_games[['WTeamID', 'LTeamID']].min(axis=1)
        self.tourney_games['Team2'] = self.tourney_games[['WTeamID', 'LTeamID']].max(axis=1)
        self.tourney_games['score_diff'] = self.tourney_games['WScore'] - self.tourney_games['LScore']
        self.tourney_games['Score_Diff'] = self.tourney_games.apply(
            lambda r: -r['score_diff'] if r['Team1'] == r['LTeamID'] else r['score_diff'], axis=1
        )
        self.tourney_games = self.tourney_games[self.tourney_games['Season'] >= 2003][['Season', 'Team1', 'Team2', 'Score_Diff']]
        
        # 合并锦标赛数据与特征数据
        self.tourney_games = pd.merge(
            self.tourney_games, tourney_games, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID']
        )
        self.tourney_games = pd.merge(
            self.tourney_games, tourney_games, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'], suffixes=('_team1', '_team2')
        )
        self.tourney_games = self.tourney_games.drop(columns=['TeamID_team1', 'TeamID_team2'])
        
        # 生成特征差异
        self.tourney_games['Off Efficiency_diff'] = self.tourney_games['Off Efficiency_team1'] - self.tourney_games['Off Efficiency_team2']
        self.tourney_games['Def Efficiency_diff'] = self.tourney_games['Def Efficiency_team1'] - self.tourney_games['Def Efficiency_team2']
        self.tourney_games['Net Efficiency_diff'] = self.tourney_games['Net Efficiency_team1'] - self.tourney_games['Net Efficiency_team2']
        self.tourney_games['3 Efficiency_diff'] = self.tourney_games['3 Efficiency_team1'] - self.tourney_games['3 Efficiency_team2']
        self.tourney_games['DRB%_diff'] = self.tourney_games['DRB%_team1'] - self.tourney_games['DRB%_team2']
        self.tourney_games['Seed_diff'] = self.tourney_games['Seed_team1'] - self.tourney_games['Seed_team2']
        print(set(self.train.columns.tolist())-set(self.tourney_games.columns.tolist()))
        add_tourney = list(set(self.train.columns.tolist())-set(self.tourney_games.columns.tolist()))
        add_tourney.append('Season')
        add_tourney.append('Team1')
        add_tourney.append('Team2')
        self.tourney_games = pd.merge(self.tourney_games,self.train[add_tourney],on=['Season','Team1','Team2'])
        self.result =  tourney_cresults[tourney_cresults['Season']>=2003]
        self.result['result'] = (self.result['WTeamID'] < self.result['LTeamID']).astype(int)
        print(self.tourney_games)
        print("Tournament data loaded and processed.")
        


    def train_model(self, time_limit=120, model_path=None):
        """Fit the AutoGluon regressor that predicts ``Score_Diff``.

        Trains on the rows of ``self.train`` with ``Season <= 2024`` and stores
        the fitted predictor in ``self.model1``.

        Args:
            time_limit: Training budget in seconds passed to ``fit``.
            model_path: Directory for AutoGluon's artifacts. ``None`` (default)
                lets AutoGluon pick its own timestamped directory, as before.
        """
        label = 'Score_Diff'
        self.model1 = TabularPredictor(
            label=label,
            path=model_path,
            eval_metric='mse',
            problem_type='regression',
        )
        self.model1.fit(
            train_data=self.train[self.train['Season'] <= 2024],
            presets='best_quality',
            num_bag_folds=10,
            num_stack_levels=5,
            time_limit=time_limit,
        )
        # TabularPredictor.save() takes a `silent` flag, not a path: the old
        # save('model1') only set silent to a truthy value and still wrote to
        # the predictor's own directory. Spell that out explicitly.
        self.model1.save(silent=True)


    def predict_win_probability(self):
        # 预测分差
        test_data = self.sub
        diff_predictions = self.model1.predict(test_data)
        
        # 使用 sigmoid 函数将分差转换为胜率
        win_probability = 1 / (1 + np.exp(-diff_predictions))
        return win_probability



    def predict_tournament(self):
        # 预测胜率
        win_probability = self.predict_win_probability()
        
        
        # 将结果保存到 submission 文件
        self.sub['Pred'] = win_probability
        self.sub['ID'] = self.sub['Season'].astype(str) + '_' + self.sub['Team1'].astype(str) + '_' + self.sub['Team2'].astype(str) 
        self.sub[['ID', 'Pred']].to_csv('submission.csv', index=False)
        print("Tournament predictions saved to submission.csv.")

    def run_all(self):
        self.load_data()
        self.train_model()
        print("train_model完毕")
        self.predict_win_probability()
        self.predict_tournament()
        print("文件已保存")
            

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting autogluon
  Downloading autogluon-1.2-py3-none-any.whl.metadata (11 kB)
Collecting ray==2.10.0
  Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (13 kB)
Collecting autogluon.core==1.2 (from autogluon.core[all]==1.2->autogluon)
  Downloading autogluon.core-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.2 (from autogluon)
  Downloading autogluon.features-1.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.2 (from autogluon.tabular[all]==1.2->autogluon)
  Downloading autogluon.tabular-1.2-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.2 (from autogluon)
  Downloading autogluon.multimodal-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.timeseries==1.2 (from autogluon.timeseries[all]==1.2->autogluon)
  Downloading autogluon.timeseries-1.2-p

In [2]:
# Entry point of the notebook cell: build a predictor and run the full pipeline.
if __name__ == "__main__":
    # Glob pattern (not a plain directory): it is passed to glob.glob() by load_data().
    data_dir = '/kaggle/input/march-machine-learning-mania-2025/**'  # Or a local dir
    predictor = TournamentPredictor(data_dir)
    # Loads the data, trains the model and writes submission.csv.
    predictor.run_all()
    

games:         Season  DayNum  WTeamID  WScore  LTeamID  LScore  WLoc  NumOT  WFGM  \
0         2003      10     1104      68     1328      62     3      0    27   
1         2003      10     1272      70     1393      63     3      0    26   
2         2003      11     1266      73     1437      61     3      0    24   
3         2003      11     1296      56     1457      50     3      0    18   
4         2003      11     1400      77     1208      71     3      0    30   
...        ...     ...      ...     ...      ...     ...   ...    ...   ...   
202861    2024     147     3163      80     3425      73     1      0    28   
202862    2024     147     3234      94     3261      87     2      0    32   
202863    2024     151     3234      71     3163      69     3      0    27   
202864    2024     151     3376      78     3301      59     3      0    33   
202865    2024     153     3376      87     3234      75     3      0    35   

        WFGA  ...  LFTM  LFTA  LOR  LDR  LAs

No path specified. Models will be saved in: "AutogluonModels/ag-20250319_161421"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024
CPU Count:          4
Memory Avail:       29.74 GB / 31.35 GB (94.9%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=5, num_bag_folds=10, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation d

                    ID  Pred  Season  Team1  Team2  OrdinalRank_team1  \
0       2025_1101_1102   0.5    2025   1101   1102              219.0   
1       2025_1101_1103   0.5    2025   1101   1103              219.0   
2       2025_1101_1104   0.5    2025   1101   1104              219.0   
3       2025_1101_1105   0.5    2025   1101   1105              219.0   
4       2025_1101_1106   0.5    2025   1101   1106              219.0   
...                ...   ...     ...    ...    ...                ...   
131402  2025_3477_3479   0.5    2025   3477   3479              100.0   
131403  2025_3477_3480   0.5    2025   3477   3480              100.0   
131404  2025_3478_3479   0.5    2025   3478   3479              100.0   
131405  2025_3478_3480   0.5    2025   3478   3480              100.0   
131406  2025_3479_3480   0.5    2025   3479   3480              100.0   

        Off Efficiency_team1  Def Efficiency_team1  Net Efficiency_team1  \
0                  99.124908            105.386

	Running DyStack sub-fit in a ray process to avoid memory leakage. Enabling ray logging (enable_ray_logging=True). Specify `ds_args={'enable_ray_logging': False}` if you experience logging issues.
2025-03-19 16:14:25,389	INFO worker.py:1743 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
		Context path: "/kaggle/working/AutogluonModels/ag-20250319_161421/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=223)[0m Running DyStack sub-fit ...
[36m(_dystack pid=223)[0m Beginning AutoGluon training ... Time limit = 24s
[36m(_dystack pid=223)[0m AutoGluon will save models to "/kaggle/working/AutogluonModels/ag-20250319_161421/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=223)[0m Train Data Rows:    168618
[36m(_dystack pid=223)[0m Train Data Columns: 35
[36m(_dystack pid=223)[0m Label Column:       Score_Diff
[36m(_dystack pid=223)[0m Problem Type:       regression
[36m(_dystack pid=223)[0m Preprocessing data ...
[36m(_dystack pid=223)[0m Using Fe

train_model完毕
Tournament predictions saved to submission.csv.
文件已保存
