In [None]:
import itertools, gc

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost
import lightgbm
import catboost

import warnings
warnings.simplefilter('ignore')

INPUT_PATH = '../input/ncaaw-march-mania-2021/WDataFiles_Stage2'

In [None]:
def make_key(seed, col):
    """Build a '<Season>_<value of col>' string key for every row of a frame.

    Args:
        seed: DataFrame that has a 'Season' column and a column named `col`.
        col: Name of the column whose values form the second half of the key.

    Returns:
        A Series of strings aligned with the index of `seed`.
    """
    season_text = seed['Season'].astype(str)
    id_text = seed[col].astype(str)
    return season_text + '_' + id_text

def make_slot():
    """Build a lookup of the earliest tournament round in which two seeds can meet.

    Reads 'WNCAATourneySlots.csv' from INPUT_PATH, follows every first-round
    slot forward through rounds 2..6, and then pairs up every two distinct
    seeds that share a slot somewhere along their paths.

    Returns:
        DataFrame with columns ['T_Seed', 'O_Seed', 'Slot', 'Round'], one row
        per ordered pair of distinct seeds, holding the minimum 'Slot' and the
        minimum 'Round' over all slots the two seeds have in common.
    """
    bracket = pd.read_csv('{}/WNCAATourneySlots.csv'.format(INPUT_PATH))
    # The second character of the slot name is the round number.
    bracket['Round'] = bracket['Slot'].str[1].astype(int)

    # One row per first-round game; column `1` holds its slot name.
    paths = bracket[bracket['Round'] == 1].rename(columns={'Slot': 1}).drop(columns='Round')

    # In later rounds the "seeds" are the names of the feeding slots.
    later = bracket[bracket['Round'] > 1]
    next_via_strong = later.set_index('StrongSeed')['Slot']
    next_via_weak = later.set_index('WeakSeed')['Slot']

    for rnd in range(2, 7):
        previous = paths[rnd - 1]
        reached = previous.map(next_via_strong)
        # A slot feeds the next round either as the strong or as the weak side.
        paths[rnd] = reached.where(reached.notna(), previous.map(next_via_weak))

    # Re-key by seed: both seeds of a first-round game share the same path.
    paths = paths.set_index(list(range(1, 7)))
    by_seed = (
        pd.concat([paths['StrongSeed'].rename('Seed'), paths['WeakSeed'].rename('Seed')])
        .to_frame()
        .reset_index()
        .sort_values('Seed')
        .set_index('Seed')
    )

    # Long format: one row per (seed, round) with the slot played in that round.
    stack = by_seed.stack().to_frame().reset_index()
    stack.columns = ['Seed', 'Round', 'Slot']

    seeds = stack['Seed'].unique()
    pairs = pd.DataFrame(list(itertools.product(seeds, seeds)), columns=['T_Seed', 'O_Seed'])
    pairs = pairs[pairs['T_Seed'] != pairs['O_Seed']]

    team_side = pd.merge(pairs, stack.rename(columns={'Seed': 'T_Seed'}), on='T_Seed', how='left')
    team_side = team_side.drop(columns='Round')
    # Keep only the slots that also lie on the opponent's path.
    meetings = pd.merge(
        team_side, stack.rename(columns={'Seed': 'O_Seed'}), on=['O_Seed', 'Slot'], how='left'
    ).dropna()
    meetings['Round'] = meetings['Round'].astype(int)
    return meetings.groupby(['T_Seed', 'O_Seed'], as_index=False).min()

# Season calendar, indexed by season year.
season = pd.read_csv('{}/WSeasons.csv'.format(INPUT_PATH), parse_dates=['DayZero']).set_index('Season')
print('season:', season.shape)
display(season.head())

# Tournament seeds, indexed by the '<Season>_<TeamID>' key; SeedNo is the
# numeric part of the seed label (everything after the first character).
seed = pd.read_csv('{}/WNCAATourneySeeds.csv'.format(INPUT_PATH))
seed = seed.assign(
    Key=make_key(seed, 'TeamID'),
    SeedNo=seed['Seed'].str[1:].astype(int),
).set_index('Key')
print('seed:', seed.shape)
display(seed.head())

# Seed-pair -> earliest possible meeting round.
slot = make_slot()
print('slot:', slot.shape)
display(slot.head())

In [None]:
def convert_compact(f):
    """Turn winner/loser compact results into one row per team per game.

    Each input game produces two output rows: one from the winner's point of
    view (Judge == 1) and one from the loser's (Judge == 0).

    The input frame is left untouched; earlier versions sorted it in place and
    added helper columns to it.

    Args:
        f: DataFrame with columns 'Season', 'DayNum', 'WTeamID', 'LTeamID',
            'WScore', 'LScore', 'WLoc' and 'NumOT'.

    Returns:
        DataFrame sorted by (Season, DayNum, TeamID, OpponentID) with columns
        ['Season', 'DayNum', 'TeamID', 'OpponentID', 'Score', 'ScoreDiff',
        'Loc', 'Judge', 'NumOT']. 'Loc' is 1 for 'H', 0 for 'N' and -1 for 'A'.
    """
    # sort_values returns a new frame, so the caller's data is never modified.
    games = f.sort_values(['Season', 'DayNum'])

    # The loser's location mirrors the winner's; anything that is not H/A is 'N'.
    loser_loc = games['WLoc'].map({'H': 'A', 'A': 'H'}).fillna('N')

    sides = []
    for own, other, loc, judge in (('W', 'L', games['WLoc'], 1), ('L', 'W', loser_loc, 0)):
        own_score = games['{}Score'.format(own)]
        other_score = games['{}Score'.format(other)]
        sides.append(pd.DataFrame({
            'TeamID': games['{}TeamID'.format(own)],
            'OpponentID': games['{}TeamID'.format(other)],
            'Score': own_score,
            'ScoreDiff': own_score - other_score,
            'Loc': loc,
            'Judge': judge,
            'Season': games['Season'],
            'DayNum': games['DayNum'],
            'NumOT': games['NumOT'],
        }))

    new_f = pd.concat(sides)
    new_f['Loc'] = new_f['Loc'].map({'H': 1, 'N': 0, 'A': -1})

    # Sorting through the index moves the key columns to the front.
    new_keys = ['Season', 'DayNum', 'TeamID', 'OpponentID']
    return new_f.set_index(new_keys).sort_index().reset_index()

def convert_detail(f):
    """Turn winner/loser detailed results into one row per team per game.

    Only the team's own box-score columns listed in `bases` are kept; each
    input game yields one row for the winner and one for the loser.

    The input frame is left untouched. Earlier versions sorted it and dropped
    'WScore', 'LScore', 'WLoc' and 'NumOT' from it in place, which made a
    second call on the same frame fail with a KeyError.

    Args:
        f: DataFrame with 'Season', 'DayNum', 'WTeamID', 'LTeamID' and the
            'W<stat>' / 'L<stat>' columns for every stat in `bases`.

    Returns:
        DataFrame sorted by (Season, DayNum, TeamID, OpponentID) with columns
        ['Season', 'DayNum', 'TeamID', 'OpponentID'] followed by the stats.
    """
    games = f.sort_values(['Season', 'DayNum'])

    bases = ['FGM', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'Blk']
    to_cols = ['TeamID', 'OpponentID'] + bases

    sides = []
    for own, other in (('W', 'L'), ('L', 'W')):
        from_cols = ['{}TeamID'.format(own), '{}TeamID'.format(other)]
        from_cols += ['{}{}'.format(own, base) for base in bases]

        side = games[from_cols].copy()
        side.columns = to_cols
        side[['Season', 'DayNum']] = games[['Season', 'DayNum']]
        sides.append(side)

    # Sorting through the index moves the key columns to the front.
    new_keys = ['Season', 'DayNum', 'TeamID', 'OpponentID']
    return pd.concat(sides).set_index(new_keys).sort_index().reset_index()

def add_match_features(f, seed, slot):
    """Attach seed labels, seed numbers, their difference and the bracket round.

    Teams without an entry in `seed` get the placeholder label 'A17' and seed
    number 17. Seed pairs that do not appear in `slot` get Round 0.

    Args:
        f: DataFrame with 'Season', 'TeamID' and 'OpponentID'. The seed columns
            are added to this frame in place before the merge.
        seed: DataFrame indexed by '<Season>_<TeamID>' with 'Seed' and 'SeedNo'.
        slot: DataFrame with 'T_Seed', 'O_Seed' and 'Round'.

    Returns:
        A new DataFrame: `f` plus 'T_Seed', 'O_Seed', 'T_SeedNo', 'O_SeedNo',
        'M_SeedNoDiff' (opponent seed number minus team seed number) and 'Round'.
    """
    keyed_sides = (('T', make_key(f, 'TeamID')), ('O', make_key(f, 'OpponentID')))

    # Labels first, then numbers, so the column order stays T/O label, T/O number.
    for prefix, key in keyed_sides:
        f['{}_Seed'.format(prefix)] = key.map(seed['Seed']).fillna('A17')
    for prefix, key in keyed_sides:
        f['{}_SeedNo'.format(prefix)] = key.map(seed['SeedNo']).fillna(17).astype(int)
    f['M_SeedNoDiff'] = f['O_SeedNo'] - f['T_SeedNo']

    round_lookup = slot[['T_Seed', 'O_Seed', 'Round']]
    merged = f.merge(round_lookup, on=['T_Seed', 'O_Seed'], how='left')
    merged['Round'] = merged['Round'].fillna(0).astype(int)
    return merged

def make_strong(f, period, size, target_round):
    """Rank the historically strongest teams for each season.

    For every season from `f['Season'].min() + period - 1` up to
    `f['Season'].max()`, the games of seasons [season - period, season - 1]
    are examined. Teams that won at least one game in a round
    >= `target_round` qualify; they are ordered by the mean of
    Judge * Round over all their games in the window, and the top `size`
    teams receive 'Strong' ranks size, size - 1, ... (best team highest).

    When fewer than `size` teams qualify, the ranks still start at `size`
    and count down. Earlier versions raised a ValueError in that case because
    the rank vector always had `size` entries.

    NOTE(review): seasons later than `f['Season'].max()` get no rows here, so
    lookups for such seasons find nothing — confirm this is intended for the
    prediction season.

    Args:
        f: DataFrame with 'Season', 'TeamID', 'Judge' and 'Round'.
        period: Number used to build the look-back window (see above).
        size: Maximum number of teams ranked per season.
        target_round: Minimum round a team must have won a game in.

    Returns:
        DataFrame indexed by 'Key' ('<Season>_<TeamID>') with columns
        ['TeamID', 'StrongPoint', 'Season', 'Strong'].
    """
    first_season = f['Season'].min() + period - 1
    last_season = f['Season'].max()

    strongs = []
    for season in range(first_season, last_season + 1):
        window = f[f['Season'].between(season - period, season - 1)]

        # Teams with at least one win in a late enough round.
        deep_games = window[window['Round'] >= target_round]
        win_rate = deep_games.groupby('TeamID')['Judge'].mean()
        qualified = win_rate[win_rate > 0.0].index

        s = window[window['TeamID'].isin(qualified)].copy()
        s['StrongPoint'] = s['Judge'] * s['Round']
        s = s[['TeamID', 'StrongPoint']].groupby('TeamID', as_index=False).mean()
        s = s.sort_values('StrongPoint', ascending=False).head(size).copy()
        s['Season'] = season
        # Sized from the rows actually kept, not from `size`.
        s['Strong'] = size - np.arange(len(s))

        strongs.append(s)

    strong = pd.concat(strongs)
    strong['Key'] = make_key(strong, 'TeamID')
    strong.set_index('Key', inplace=True)

    return strong

def load_and_convert(key, seed, slot):
    """Load one competition's compact and detailed results as per-team rows.

    Args:
        key: File-name infix, used as 'W<key>CompactResults.csv' and
            'W<key>DetailedResults.csv' under INPUT_PATH.
        seed: Seed table passed on to add_match_features.
        slot: Seed-pair round table passed on to add_match_features.

    Returns:
        The converted compact results with match features, left-joined with
        the converted detailed results on
        ['Season', 'DayNum', 'TeamID', 'OpponentID'].
    """
    compact_path = '{}/W{}CompactResults.csv'.format(INPUT_PATH, key)
    detail_path = '{}/W{}DetailedResults.csv'.format(INPUT_PATH, key)

    compact = add_match_features(convert_compact(pd.read_csv(compact_path)), seed, slot)
    detail = convert_detail(pd.read_csv(detail_path))

    join_keys = ['Season', 'DayNum', 'TeamID', 'OpponentID']
    return compact.merge(detail, on=join_keys, how='left')

# Tournament games, one row per team per game.
tourney = load_and_convert(key='NCAATourney', seed=seed, slot=slot)
print('tourney:', tourney.shape)
display(tourney.head())

# Regular-season games; their bracket round is overwritten with 0.
regular = load_and_convert(key='RegularSeason', seed=seed, slot=slot)
regular.loc[:, 'Round'] = 0
print('regular:', regular.shape)
display(regular.head())

# Per-season strength ranks derived from past tournament results.
strong = make_strong(tourney, period=8, size=5, target_round=4)
print('strong:', strong.shape)
display(strong.head())

In [None]:
def add_team_features(m, strong):
    """Add the 'Strong' rank of the team and of the opponent to `m` in place.

    Args:
        m: DataFrame with 'Season', 'TeamID' and 'OpponentID'.
        strong: DataFrame indexed by '<Season>_<TeamID>' with a 'Strong' column.

    Returns:
        The same `m` object, now with 'T_Strong' and 'O_Strong'; teams with no
        entry in `strong` get 0.
    """
    ranks = strong['Strong']
    for prefix, id_col in (('T', 'TeamID'), ('O', 'OpponentID')):
        m['{}_Strong'.format(prefix)] = make_key(m, id_col).map(ranks).fillna(0)
    return m

def make_match(f, seed, slot, strong, tt):
    """Build the model feature table for a set of match-ups.

    Args:
        f: DataFrame with 'Season', 'TeamID', 'OpponentID' (and 'Judge' when
            `tt` is 'train').
        seed: Seed table for add_match_features.
        slot: Seed-pair round table for add_match_features.
        strong: Strength table for add_team_features.
        tt: 'train' copies the 'Judge' label from `f`; any other value
            leaves the label out.

    Returns:
        DataFrame with the key columns, the match and team features and,
        for 'train', the 'Judge' column.
    """
    m = f[['Season', 'TeamID', 'OpponentID']].copy()
    m = add_team_features(add_match_features(m, seed, slot), strong)

    if tt == 'train':
        # Assigned by index label, so `f` and `m` must share the same index.
        m['Judge'] = f['Judge']

    return m

# Labelled tournament match-ups.
tourney_match = make_match(tourney, seed, slot, strong, tt='train')
print('tourney_match:', tourney_match.shape)
display(tourney_match.head())

# Labelled regular-season match-ups, restricted to games where both teams
# have a seed number of 4 or better.
regular_match = make_match(regular, seed, slot, strong, tt='train')
regular_match = regular_match.loc[
    lambda d: (d['T_SeedNo'] <= 4) & (d['O_SeedNo'] <= 4)
]
print('regular_match:', regular_match.shape)
display(regular_match.head())

# Submission rows: the ID is split on '_' into Season, TeamID, OpponentID.
submission = pd.read_csv('{}/WSampleSubmissionStage2.csv'.format(INPUT_PATH))
sub_key = submission['ID'].str.split('_', expand=True).astype(int)
sub_key.columns = ['Season', 'TeamID', 'OpponentID']
sub_feature = make_match(sub_key, seed, slot, strong, tt='predict')
print('sub_feature:', sub_feature.shape)
display(sub_feature.head())

In [None]:
def make_data(f, tt, scaler=None):
    """Split a match table into a scaled feature matrix and a label vector.

    'Season', 'TeamID' and 'OpponentID' become the index; 'Round', 'T_Seed'
    and 'O_Seed' are dropped; every remaining column except 'Judge' is a
    feature.

    Args:
        f: Match table as produced by make_match. It is not modified.
        tt: 'predict' means there is no 'Judge' column; any other value
            extracts 'Judge' as the label.
        scaler: Object with a `transform` method. When None, a new
            StandardScaler is fitted on the features of `f`.

    Returns:
        Tuple (x, y, scaler): x is whatever `scaler.transform` returns,
        y is the label array (None for 'predict'), scaler is the scaler used.
    """
    features = f.set_index(['Season', 'TeamID', 'OpponentID'])
    features = features.drop(columns=['Round', 'T_Seed', 'O_Seed'])

    y = None
    if tt != 'predict':
        y = features['Judge'].values
        features = features.drop(columns='Judge')

    if scaler is None:
        scaler = StandardScaler().fit(features)

    return scaler.transform(features), y, scaler

def learns(params, data, models):
    """Fit every model on the training split and score it on train and test.

    Args:
        params: Dict keyed by model name; `params[name]['random_state']` seeds
            the shuffling done here for that model.
        data: Dict with 'train', 'test' and 'pred' match tables (see make_data).
        models: Iterable of (name, estimator) pairs; each estimator must offer
            `fit` and `predict_proba`.

    Returns:
        Dict with keys 'train', 'test', 'pred'. Each holds a 'pred' DataFrame
        with one column of class-1 probabilities per model. 'train' and 'test'
        also hold a 'score' DataFrame (columns 'model_name', 'score') with the
        log loss per model; for 'pred' the 'score' entry stays an empty list.
    """
    # The scaler is fitted on the training features only and reused for the
    # test and prediction splits.
    train_x, train_y, scaler = make_data(data['train'], 'train')
    test_x, test_y, scaler = make_data(data['test'], 'test', scaler)
    pred_x, _, _ = make_data(data['pred'], 'predict', scaler)

    result = {'train': {}, 'test': {}, 'pred': {}}
    for tt in ['train', 'test', 'pred']:
        result[tt]['pred'] = pd.DataFrame()
        result[tt]['score'] = []

    print('  ', train_x.shape, train_y.shape)

    for name, model in models:
        state = params[name]['random_state']

        # Shuffle features and labels with the same seed on frames of equal
        # length; this relies on `sample` drawing the same permutation for both.
        x = pd.DataFrame(train_x).sample(frac=1.0, random_state=state).values
        y = pd.DataFrame(train_y.reshape(-1, 1)).sample(frac=1.0, random_state=state).values[:, 0]

        if name in ['xgb', 'lgbm', 'catboost']:
            # The boosting models get an extra 1% of the shuffled rows appended
            # (again sampled with the same seed for x and y) and are fitted with
            # the test split as evaluation set.
            # NOTE(review): the test split therefore influences early stopping,
            # so the test scores below are not independent — confirm intended.
            x = np.concatenate([x, pd.DataFrame(x).sample(frac=0.01, random_state=state).values])
            y = np.concatenate([y, pd.DataFrame(y.reshape(-1, 1)).sample(frac=0.01, random_state=state).values[:, 0]])
            model.fit(x, y, eval_set=[(test_x, test_y)], verbose=0)
        else:
            model.fit(x, y)

        # Predictions are made on the unshuffled matrices, so rows line up
        # with the input tables.
        result['train']['pred'][name] = model.predict_proba(train_x)[:, 1]
        result['train']['score'].append((name, log_loss(train_y, result['train']['pred'][name].values)))

        result['test']['pred'][name] = model.predict_proba(test_x)[:, 1]
        result['test']['score'].append((name, log_loss(test_y, result['test']['pred'][name].values)))

        result['pred']['pred'][name] = model.predict_proba(pred_x)[:, 1]

        print('    ', result['train']['score'][-1], result['test']['score'][-1])

    result['train']['score'] = pd.DataFrame(result['train']['score'], columns=['model_name', 'score'])
    result['test']['score'] = pd.DataFrame(result['test']['score'], columns=['model_name', 'score'])

    return result

def make_models(names, params):
    """Instantiate the requested estimators from their parameter dicts.

    Args:
        names: Iterable of model names. Recognised names are 'extra_trees',
            'random_forest', 'xgb', 'lgbm' and 'catboost'; anything else is
            skipped silently.
        params: Dict mapping each recognised name to constructor keyword
            arguments.

    Returns:
        List of (name, estimator) tuples in the order of `names`.
    """
    # Lambdas keep construction lazy: a library is only touched when its
    # model is actually requested.
    factories = {
        'extra_trees': lambda kw: ExtraTreesClassifier(**kw),
        'random_forest': lambda kw: RandomForestClassifier(**kw),
        'xgb': lambda kw: xgboost.XGBClassifier(**kw),
        'lgbm': lambda kw: lightgbm.LGBMClassifier(**kw),
        'catboost': lambda kw: catboost.CatBoostClassifier(verbose=0, **kw),
    }
    return [(name, factories[name](params[name])) for name in names if name in factories]

def learn_seasons(models, params, test_seasons, target_seasons, tourney_seasons, regular_seasons, ignores, mix_size=0):
    """Train on the selected seasons, blend the best models and post-process.

    Reads the notebook-level tables `tourney_match`, `regular_match` and
    `sub_feature`.

    Args:
        models: (name, estimator) pairs handed to learns.
        params: Per-model parameter dicts handed to learns.
        test_seasons: Tournament seasons used as the test split.
        target_seasons: Seasons of `sub_feature` to predict.
        tourney_seasons: Tournament seasons used for training.
        regular_seasons: Regular seasons used for training.
        ignores: Iterable of (season, rounds); training rows of that season
            whose 'Round' is in `rounds` are removed.
        mix_size: Number of best models (lowest test log loss) to average;
            values <= 0 mean all models.

    Returns:
        Tuple (data, result): `data` maps 'train'/'test'/'pred' to the match
        tables with an added 'Pred' column, sorted by Season, TeamID,
        OpponentID; `result` is the dict from learns, with
        `result['test']['last_score']` set to the log loss of the blend.
    """
    splits = ['train', 'test', 'pred']

    train = pd.concat([
        tourney_match[tourney_match['Season'].isin(tourney_seasons)],
        regular_match[regular_match['Season'].isin(regular_seasons)],
    ])
    for season, rounds in ignores:
        excluded = (train['Season'] == season) & train['Round'].isin(rounds)
        train = train[~excluded]

    data = {
        'train': train,
        'test': tourney_match[tourney_match['Season'].isin(test_seasons)].copy(),
        'pred': sub_feature[sub_feature['Season'].isin(target_seasons)].copy(),
    }
    for tt in splits:
        data[tt] = data[tt].reset_index(drop=True)

    result = learns(params, data, models)

    # Models ordered from best to worst test log loss.
    rank = result['test']['score'].groupby('model_name').mean().sort_values('score')
    if mix_size <= 0:
        mix_size = rank.shape[0]
    blended_models = rank.index[0:mix_size]

    for tt in splits:
        model_preds = result[tt]['pred']
        model_preds.reset_index(drop=True, inplace=True)

        frame = data[tt]
        frame['Pred'] = model_preds[blended_models].mean(axis=1)

        # Hard overrides, applied in this order (later rules win):
        # a seed-number gap of 10 or more, then round-2 games with a 1 seed.
        seed_gap = frame['M_SeedNoDiff']
        in_round_two = frame['Round'] == 2
        frame.loc[seed_gap >= 10, 'Pred'] = 1.0
        frame.loc[seed_gap <= -10, 'Pred'] = 0.0
        frame.loc[in_round_two & (frame['T_SeedNo'] == 1), 'Pred'] = 1.0
        frame.loc[in_round_two & (frame['O_SeedNo'] == 1), 'Pred'] = 0.0

        frame = frame.sort_values(['Season', 'TeamID', 'OpponentID']).reset_index(drop=True)
        data[tt] = frame

        if tt == 'test':
            blend_loss = log_loss(frame['Judge'].astype(np.float64), frame['Pred'].astype(np.float64))
            result[tt]['last_score'] = blend_loss
            print('mean:', result[tt]['last_score'])

    return data, result

def learn_states(file_name, random_states, mix_size, sub):
    """Repeat the full training once per random state and average the results.

    Reads the notebook-level names `params`, `test_seasons`, `target_seasons`,
    `tourney_seasons`, `regular_seasons` and `ignores`, and writes the current
    random state into every entry of `params`.

    Args:
        file_name: Not used by this function.
        random_states: Iterable of seeds; one complete training run per seed.
        mix_size: Passed to learn_seasons.
        sub: Submission DataFrame; receives a 'Pred' column in place.

    Returns:
        `sub`, with 'Pred' set to the row-wise mean of all runs' predictions.
    """
    model_names = ['extra_trees', 'random_forest', 'xgb', 'lgbm', 'catboost']
    run_preds = []
    run_scores = []

    for state in random_states:
        for name in model_names:
            params[name]['random_state'] = state

        models = make_models(model_names, params)
        data, result = learn_seasons(
            models, params, test_seasons, target_seasons,
            tourney_seasons, regular_seasons, ignores, mix_size,
        )

        run_preds.append(data['pred']['Pred'])
        run_scores.append(result['test']['last_score'])

        # Free the per-run tables before the next run starts.
        del data, result
        gc.collect()

    # One column per run, numbered in run order.
    last_pred = pd.DataFrame({i: p for i, p in enumerate(run_preds)})
    sub['Pred'] = last_pred.mean(axis=1)

    print('last score:', np.mean(run_scores))

    return sub

In [None]:
# Constructor keyword arguments for every model, keyed by the model names that
# make_models understands. learn_states adds a 'random_state' entry to each
# dict before the models are built.
params = { 
    # ExtraTreesClassifier
    # NOTE(review): `min_impurity_split` was deprecated and later removed from
    # scikit-learn's tree ensembles — confirm the installed version accepts it.
    'extra_trees': {
        'n_estimators': 1400,
        'max_depth': 22,
        'max_features': 2,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'min_impurity_split': 0.21,
        'criterion': 'entropy',
    },
    # RandomForestClassifier
    'random_forest': {
        'n_estimators': 1200,
        'max_depth': 21,
        'max_features': 2,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'criterion': 'gini',
    },
    # xgboost.XGBClassifier
    # NOTE(review): whether `early_stopping_rounds` is honoured as a
    # constructor argument depends on the xgboost version — confirm.
    'xgb': {
        'n_estimators': 20000,
        'early_stopping_rounds': 512,
        'learning_rate': 0.045,
        'max_depth': 3,
        'max_delta_step': 0.7,
    },
    # lightgbm.LGBMClassifier
    'lgbm': {
        'n_estimators': 20000,
        'early_stopping_rounds': 512,
        'learning_rate': 0.034,
        'feature_fraction': 0.8,
    },
    # catboost.CatBoostClassifier (make_models additionally passes verbose=0)
    'catboost': {
        'n_estimators': 20000,
        'early_stopping_rounds': 512,
        'learning_rate': 0.03,
        'max_depth': 6,
        'l2_leaf_reg': 3,
        'subsample': 0.85,
        'use_best_model': True,
    },
# Disabled alternative configuration (near-default parameters), kept for reference:
#     'extra_trees': {
#         'max_depth': 12,
#     },
#     'random_forest': {
#     },
#     'xgb': {
#     },
#     'lgbm': {
#     },
#     'catboost': {
#     },
}



# One submission per fold. In each fold two of the four recent seasons are
# moved from the test split into the training seasons.
subs = []
for move_seasons in [[2016, 2017], [2018, 2019], [2016, 2018], [2017, 2019]]:
    target_seasons = [2021]
    test_seasons = [s for s in [2016, 2017, 2018, 2019] if s not in move_seasons]
    tourney_seasons = [
        1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
    ] + list(move_seasons)
    regular_seasons = [
        1998, 1999, 2000, 2001, 2004, 2006, 2008, 2010, 2013, 2014, 2015,
    ] + list(move_seasons)

    # (season, rounds) pairs whose training rows are excluded.
    ignores = [
        (1998, [2, 3, 6]), (1999, [2, 3, 5]), (2001, [3]), (2003, [1, 4]), (2007, [1, 3, 4]), (2009, [2, 3, 5]),
        (2010, [1, 3]), (2011, [2, 4]), (2012, [1, 4])
    ]

    random_states = range(48, 64)

    print('====', test_seasons, '====')

    subs.append(learn_states('subs_state8_mix1.csv', random_states, 1, submission.copy()))

In [None]:
# Average the per-fold predictions for every ID and write the submission file.
sub_mixs = pd.concat(subs).groupby('ID').mean()
sub_mixs.to_csv('submission.csv')
sub_mixs.head()