# Women's Full Season

This notebook loads each full season of women's games (including the postseason) and calculates full-season team ratings from them.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
import warnings
import optuna
from sklearn.preprocessing import MinMaxScaler
optuna.logging.set_verbosity(optuna.logging.WARNING)

def get_rankings(season):
    """
    Get full year (including postseason) rankings for one season.

    Reads the season's game file, builds a design matrix holding +1 in the
    `Team` column and -1 in the `Opponent` column of every game plus a
    'Home Field Advantage' column copied from `Location`, tunes an
    L2-regularised logistic regression (no intercept) with Optuna, refits it
    on every game and reports the fitted coefficients as team ratings.

    Parameters
    ----------
    season : int or str
        Season identifier; interpolated into the input file name and written
        to the `Season` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns `Season`, `Team`, `Rating`, sorted by `Rating` descending,
        with a 1-based index. The 'Home Field Advantage' coefficient is
        excluded.

    Notes
    -----
    The input CSV must provide the columns `Team`, `Opponent`, `Location`,
    `Result`, `Adjusted Score Differential` and `Date`.
    NOTE(review): `Result` is presumably 1 when `Team` won and `Location` a
    numeric home/away indicator -- confirm against the scraping notebook.
    """
    print(f'season: {season}')
    # Forward slashes resolve on Windows, macOS and Linux alike; the previous
    # backslash-separated path only worked on Windows.
    df = pd.read_csv(
        '../data/unprocessed/womens_sports_reference/'
        f'womens_full_year_sports_reference_{season}.csv'
    )

    # fill_value=0 keeps a team that shows up in only one of the two columns
    # (a plain `-` would turn its whole column into NaN and break the fit).
    X = pd.get_dummies(df['Team']).astype('int8').sub(
        pd.get_dummies(df['Opponent']).astype('int8'),
        fill_value=0
    )

    X['Home Field Advantage'] = df['Location'].copy()

    def get_range(minimum, maximum):
        """
        Returns a (minimum, maximum) pair that MinMaxScaler accepts
        """
        # MinMaxScaler requires minimum < maximum; both can be suggested as 1.0
        if minimum >= maximum:
            maximum = minimum + 0.0001

        return minimum, maximum

    def get_gkf_data(X, y, w, groups, cv=3):
        """
        Converts training data to list of folds

        Each fold is a tuple (X_train, X_test, y_train, y_test, w_train);
        rows sharing a value in `groups` never straddle train and test.
        """
        np.random.seed(22)
        gkf = GroupKFold(n_splits=cv)

        data = []
        for train_index, test_index in gkf.split(X, y, groups=groups):
            X_train = X[train_index]
            X_test = X[test_index]

            y_train = y[train_index]
            y_test = y[test_index]

            # sample weights (only needed for fitting, so no test split)
            w_train = w[train_index]

            data.append((X_train, X_test, y_train, y_test, w_train))

        return data

    # Folds are grouped by game date; the weight column stays 2-D because
    # MinMaxScaler expects a 2-D input.
    cv_data = get_gkf_data(
        X.to_numpy(),
        df['Result'].to_numpy(),
        df[['Adjusted Score Differential']].to_numpy(),
        df['Date'].to_numpy()
    )

    def objective(trial, cv_data=cv_data):
        """
        Out-of-fold log loss for one sampled set of hyperparameters
        """
        # model tuning
        C = trial.suggest_float('C', 0.1, 10, log=True)
        mod = LogisticRegression(penalty='l2', C=C, fit_intercept=False)
        minimum = trial.suggest_float('minimum', 0.1, 1.0, step=0.1)
        maximum = trial.suggest_float('maximum', 1.0, 8.0, step=0.5)
        minimum, maximum = get_range(minimum, maximum)

        # cross validation
        y_actuals = []
        y_preds = []
        for X_train, X_test, y_train, y_test, w_train in cv_data:
            y_actuals.append(y_test)

            # rescale the score differentials of the training games into
            # sample weights within [minimum, maximum]
            weights = MinMaxScaler(feature_range=(minimum, maximum)).fit_transform(w_train).reshape(-1)

            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')  # prevent convergence warnings
                mod.fit(X_train, y_train, sample_weight=weights)

            y_preds.append(mod.predict_proba(X_test)[:, 1])

        return log_loss(np.hstack(y_actuals), np.hstack(y_preds))

    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=22))
    study.optimize(objective, n_trials=100, show_progress_bar=True)

    minimum, maximum = get_range(study.best_params['minimum'], study.best_params['maximum'])

    # refit on every game of the season with the best hyperparameters
    weight = MinMaxScaler(
        feature_range=(minimum, maximum)
    ).fit_transform(df[['Adjusted Score Differential']]).reshape(-1)

    mod = LogisticRegression(penalty='l2', C=study.best_params['C'], fit_intercept=False)

    mod.fit(X, df['Result'], sample_weight=weight)

    # one coefficient per design-matrix column; the team coefficients are the ratings
    df_ratings = pd.DataFrame(
        {
            'Team': X.columns,
            'Rating': mod.coef_[0]
        }
    ).sort_values(by=['Rating'], ascending=False, ignore_index=True)

    # the home field coefficient is not a team, so it is left out of the table
    df_ratings_display = df_ratings.loc[df_ratings['Team'] != 'Home Field Advantage', :].reset_index(drop=True)
    df_ratings_display.index += 1

    df_ratings_display.insert(0, 'Season', season)

    return df_ratings_display

In [2]:
# Fit the ratings one season at a time (2008 through 2023) and stack the
# per-season tables into a single frame with a fresh 0-based index.
df = pd.concat(map(get_rankings, range(2008, 2024)), ignore_index=True)

df

season: 2008


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2009


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2010


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2011


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2012


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2013


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2014


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2015


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2016


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2017


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2018


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2019


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2020


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2021


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2022


  0%|          | 0/100 [00:00<?, ?it/s]

season: 2023


  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,Season,Team,Rating
0,2008,Connecticut,5.437914
1,2008,Tennessee,5.221798
2,2008,Stanford,4.717653
3,2008,North Carolina,4.605777
4,2008,Maryland,4.193176
...,...,...,...
5512,2023,Texas Southern,-3.813642
5513,2023,Navy,-4.039280
5514,2023,Mississippi Valley State,-4.365660
5515,2023,Saint Peter's,-4.708909


In [4]:
# Per-team rolling mean over up to four consecutive rows of that team
# (rows are in season order); dropping the 'Team' level leaves the original
# row labels so the values line up with `df` again.
past_4_ratings = (
    df
    .groupby(['Team'])
    ['Rating']
    .rolling(window=4, min_periods=1)
    .mean()
    .droplevel(0)
)

df = (
    df
    .assign(**{'Past 4 Years Ratings': past_4_ratings})
    .rename(columns={'Rating': 'Past Year Rating'})
    # shift by a year so ratings are from past instead of the current rating
    .assign(Season=lambda frame: frame['Season'] + 1)
    # 2012 is the first season preceded by four rated seasons (2008-2011)
    .loc[lambda frame: frame['Season'] >= 2012, :]
    .reset_index(drop=True)
)

df

Unnamed: 0,Season,Team,Past Year Rating,Past 4 Years Ratings
0,2012,Connecticut,5.099069,5.576978
1,2012,Stanford,4.731230,4.639040
2,2012,Baylor,4.546418,3.658880
3,2012,Texas A&M,4.457265,3.738744
4,2012,Tennessee,4.306993,4.172733
...,...,...,...,...
4531,2024,Texas Southern,-3.813642,-1.357948
4532,2024,Navy,-4.039280,-2.028372
4533,2024,Mississippi Valley State,-4.365660,-3.570037
4534,2024,Saint Peter's,-4.708909,-2.127877


In [10]:
# Persist the lagged ratings without the row index
df.to_csv(
    '../data/preprocessed/womens_my_rankings_full_season/womens_my_rankings_full_season.csv',
    index=False,
)

'Done'

'Done'