In [1]:
import pandas as pd
import numpy as np
from fancyimpute import KNN
from sklearn import metrics
import csv

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Relative paths to the NCAA basketball odds workbooks, one .xlsx file per season.
# Each variable is named after the second year in its file name
# (e.g. odds2018 -> ..._2017-18.xlsx).
odds2018 = '../odds_data/ncaa_basketball_2017-18.xlsx'
odds2017 = '../odds_data/ncaa_basketball_2016-17.xlsx'
odds2016 = '../odds_data/ncaa_basketball_2015-16.xlsx'
odds2015 = '../odds_data/ncaa_basketball_2014-15.xlsx'
odds2014 = '../odds_data/ncaa_basketball_2013-14.xlsx'

In [3]:
# Read every season workbook into its own DataFrame, using row 0 as the header.
# The paths are visited in the same newest-to-oldest order as the target names.
(odds2018_df,
 odds2017_df,
 odds2016_df,
 odds2015_df,
 odds2014_df) = [
    pd.read_excel(workbook_path, header=0)
    for workbook_path in (odds2018, odds2017, odds2016, odds2015, odds2014)
]

In [4]:
# Season frames, newest first. The order must line up with `seasons`, because
# set_up_odds_data pairs the two lists positionally with zip().
odds_dfs = [odds2018_df, odds2017_df, odds2016_df, odds2015_df, odds2014_df]

In [5]:
# Season labels, newest first: 2018, 2017, 2016, 2015, 2014 (same order as odds_dfs).
seasons = list(range(2018, 2013, -1))

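Processing steps applied to each season's odds data:

1. Update team names
2. Reformat the date
3. Add the season
4. Impute missing data
5. Combine each matchup's two team rows into a single row
6. Add the outcome
7. Concatenate the season DataFrames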

**Team Names**

In [6]:
# CSV read by odds_teams_dict: its 'Teams' column holds the team names that
# update_team_names looks up, and its 'school' column holds the formatted names
# they are replaced with.
odds_teams_lookup_filepath = '../odds_teams_lookup.csv'

In [7]:
def odds_teams_dict(filepath):
    '''
    Build a lookup of team name -> formatted school name.

    Reads the CSV at ``filepath`` and pairs every value in its 'Teams'
    column with the value on the same row of its 'school' column.
    If a 'Teams' value appears more than once, the last row wins.
    '''
    lookup = pd.read_csv(filepath)[['Teams', 'school']]
    return dict(zip(lookup['Teams'].tolist(), lookup['school'].tolist()))

In [8]:
def update_team_names(df):
    '''
    Return a copy of ``df`` whose 'Team' column holds formatted school names.

    Names are translated with the lookup that odds_teams_dict builds from
    odds_teams_lookup_filepath. Names that are missing from the lookup
    become NaN.

    The input frame is left untouched. Mapping in place would make a second
    run over the same raw frame look up the already-formatted names, turning
    every name that is not also a key of the lookup into NaN.
    '''
    name_lookup = odds_teams_dict(odds_teams_lookup_filepath)
    # assign() returns a new frame and keeps 'Team' at its original position.
    return df.assign(Team=df['Team'].map(name_lookup))

**Date Formatting**

In [9]:
def date(row):
    '''
    Rewrite row['Date'] as a 'season-MM-DD' string to prepare for unique ID generation.

    The incoming value is converted to a string. A three-character value is
    read as M + DD (the month gets a leading zero); any other length is read
    as the first two characters for the month and the last two for the day.
    Also sets row['month'] and row['day'].

    NOTE(review): row['season'] is used as the year for every month, so the
    year part is the season label rather than a calendar year - confirm that
    this matches the key format used by the data this is joined with.
    '''
    raw = str(row['Date'])
    row['Date'] = raw
    month = ('0' + raw[:1]) if len(raw) == 3 else raw[:2]
    day = raw[-2:]
    row['month'] = month
    row['day'] = day
    row['Date'] = '{}-{}-{}'.format(str(row['season']), month, day)
    return row

**Impute Data**

In [10]:
def string_split(df):
    '''
    Helper for impute_data: separate the text columns from the rest.

    Returns a tuple ``(string_df, rest)`` where ``string_df`` holds the
    'VH', 'Team' and 'Date' columns and ``rest`` is ``df`` without them.
    '''
    text_cols = ['VH', 'Team', 'Date']
    return df[text_cols], df.drop(text_cols, axis=1)

In [11]:
def string_to_nan(row):
    '''Helper for impute_data: convert a row to numeric, turning unparseable values into NaN.'''
    return pd.to_numeric(row, errors='coerce')

In [12]:
def impute_data(df):
    '''
    Fill missing / non-numeric values in the numeric columns using KNN imputation.

    Input: DataFrame with 'VH', 'Team' and 'Date' columns plus columns that
           are expected to be numeric.
    Output: DataFrame with the imputed columns first, followed by the
            unchanged 'VH', 'Team' and 'Date' columns. The row index of the
            input is preserved.
    '''
    # Split out string columns into a separate df
    string_df, numeric_df = string_split(df)

    # Save the column names so the imputed array can be labelled again
    numeric_cols = numeric_df.columns.tolist()

    # Convert strings to NaNs so they are treated as missing values
    numeric_df = numeric_df.apply(string_to_nan, axis=1)

    # Impute NaNs with a KNN imputer (k=3). Older fancyimpute releases expose
    # the imputation as `complete`; releases that dropped it use
    # `fit_transform`, so prefer the original call and fall back when absent.
    imputer = KNN(k=3, verbose=False)
    fill = getattr(imputer, 'complete', None) or imputer.fit_transform
    X_filled = fill(numeric_df.values)

    # Rebuild the frame on the ORIGINAL index. The merge below joins on index,
    # so a fresh 0..n-1 index would misalign (or drop) the string columns
    # whenever the input does not use a default RangeIndex.
    imputed_df = pd.DataFrame(X_filled, columns=numeric_cols, index=numeric_df.index)
    return pd.merge(imputed_df, string_df, how='left', left_index=True, right_index=True)

**Feature Engineering and Matchups**

In [13]:
def prob(row):
    '''
    Add 'p', a probability calculated from the 'ML' value of the row.

    Negative ML: int(ML) / int(ML - 100).
    Positive ML: 100 / int(ML + 100).
    When ML is neither below nor above zero (0, or a value such as NaN for
    which both comparisons are false) the row is returned without a 'p' entry.
    '''
    moneyline = row['ML']
    if moneyline < 0:
        row['p'] = int(moneyline) / int(moneyline - 100)
        return row
    if moneyline > 0:
        row['p'] = 100 / int(moneyline + 100)
    return row

In [14]:
def spread(row):
    '''
    Add 'spread', an integer derived linearly from the probability in row['p'].

    p <= 0.5: int(25 * p - 12); otherwise: int(-25 * p + 13).
    int() truncates toward zero.

    NOTE(review): for p between 0 and 1 both branches give a value <= 0, so
    rows with p below 0.5 also receive a non-positive spread - confirm that
    this is the intended sign convention.
    '''
    win_prob = row['p']
    slope, intercept = (25, -12) if win_prob <= .5 else (-25, 13)
    row['spread'] = int(slope * win_prob + intercept)
    return row

In [15]:
def outcome(row):
    '''
    Add the vegas prediction, the actual spread and the actual win flag.

    vegas:         1 when row['ML'] is negative, else 0.
    actual_spread: row['Final'] minus row['Final_OP'].
    W:             1 when actual_spread is positive, else 0.
    '''
    row['vegas'] = 1 if row['ML'] < 0 else 0

    margin = row['Final'] - row['Final_OP']
    row['actual_spread'] = margin
    row['W'] = 1 if margin > 0 else 0

    return row

In [16]:
def matchups(df):
    '''
    Combine the visitor row and the home row of each game into one row.

    Rows flagged 'V' and rows flagged 'H' in the 'VH' column are numbered
    separately in their order of appearance; the n-th 'H' row is then joined
    with the n-th 'V' row, whose columns get an '_OP' suffix. Rows whose
    'VH' value is neither 'V' nor 'H' are not part of the result.
    Finally the outcome and spread features are added.

    NOTE(review): the pairing relies purely on row order - it assumes the
    n-th visitor row and the n-th home row belong to the same game.
    '''
    # Remove the columns that are not used from here on
    games = df.drop(['1st', '2H', '2nd'], axis=1)

    # Probability of winning for every team row
    games = games.apply(prob, axis=1)

    # Numeric flag for the VH column, used as the grouping key for counting
    games['VHohe'] = games['VH'].map({'V': 1, 'H': 0})

    # Running number within each VH group; serves as the merge ID
    games['count'] = games.groupby('VHohe').cumcount() + 1

    # Home rows keep their column names, visitor rows get the '_OP' suffix
    # on everything except the merge ID
    home = games[games['VH'] == 'H']
    visitors = games[games['VH'] == 'V'].rename(
        columns=lambda col: col if col == 'count' else '{}_OP'.format(col)
    )

    # One row per home team, with the matching visitor attached
    paired = pd.merge(home, visitors, how='left', on='count')

    # Remove the columns that are not needed in the combined row
    not_needed = ['Rot', 'VH', 'VH_OP', 'Date_OP', 'Rot_OP', 'Open', 'Close',
                  'Open_OP', 'Close_OP', 'season_OP']
    paired = paired.drop(not_needed, axis=1)

    # Outcome features first, then the spread
    paired = paired.apply(outcome, axis=1)
    return paired.apply(spread, axis=1)

**Master Function**

In [17]:
def set_up_odds_data(df_list, seasons_list):
    '''
    Run the full preparation pipeline on every season and stack the results.

    Input:
        df_list      - list of raw odds DataFrames, one per season
        seasons_list - list of season labels, in the same order as df_list
                       (the two lists are paired positionally with zip)
    Output:
        One DataFrame with a fresh 0..n-1 index holding the matchup rows of
        all seasons; an empty DataFrame when there is nothing to process.

    The frames in df_list are not modified.
    '''
    season_frames = []
    for df, season in zip(df_list, seasons_list):
        # Work on a copy: adding 'season' (and renaming teams) on the caller's
        # frame would change the raw data and break a second run of this cell.
        df = df.copy()
        df = update_team_names(df)
        # 'season' has to exist before date() runs, which reads row['season']
        df['season'] = season
        df = df.apply(date, axis=1)
        df = df.drop(['month', 'day'], axis=1)
        df = impute_data(df)
        df = matchups(df)
        season_frames.append(df)

    # pd.concat raises on an empty list, so keep returning an empty frame
    if not season_frames:
        return pd.DataFrame()

    # Concatenate once instead of appending inside the loop: DataFrame.append
    # copied the growing result on every iteration and no longer exists in
    # pandas 2.x.
    return pd.concat(season_frames, ignore_index=True)

In [18]:
# Run the pipeline over every frame in odds_dfs.
# NOTE(review): despite the '18' in its name, this variable receives the rows
# of all seasons that are passed in - consider a more descriptive name.
odds18_test = set_up_odds_data(odds_dfs, seasons)

In [19]:
# Show the last rows of the pipeline result as a quick sanity check.
odds18_test.tail()

Unnamed: 0,Final,ML,season,Team,Date,p,VHohe,count,Final_OP,ML_OP,Team_OP,p_OP,VHohe_OP,vegas,actual_spread,W,spread
3464,69.0,-180.0,2018.0,north-texas,2018-03-28,0.642857,0.0,3465,55.0,150.0,san-francisco,0.4,1.0,1,14.0,1,-3
3465,51.0,-220.0,2018.0,liberty,2018-03-28,0.6875,0.0,3466,67.0,180.0,illinois-chicago,0.357143,1.0,1,-16.0,0,-4
3466,99.0,-650.0,2018.0,northern-colorado,2018-03-28,0.866667,0.0,3467,80.0,450.0,sam-houston-state,0.181818,1.0,1,19.0,1,-8
3467,88.0,-220.0,2018.0,north-texas,2018-03-30,0.6875,0.0,3468,77.0,180.0,san-francisco,0.357143,1.0,1,11.0,1,-4
3468,76.0,-550.0,2018.0,northern-colorado,2018-03-30,0.846154,0.0,3469,71.0,400.0,illinois-chicago,0.2,1.0,1,5.0,1,-8


In [20]:
# Check dtypes and non-null counts. Missing values in Team / Team_OP are
# presumably names that were not found in the team lookup - verify.
odds18_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3469 entries, 0 to 3468
Data columns (total 17 columns):
Final            3469 non-null float64
ML               3469 non-null float64
season           3469 non-null float64
Team             3465 non-null object
Date             3469 non-null object
p                3469 non-null float64
VHohe            3469 non-null float64
count            3469 non-null int64
Final_OP         3469 non-null float64
ML_OP            3469 non-null float64
Team_OP          3464 non-null object
p_OP             3469 non-null float64
VHohe_OP         3469 non-null float64
vegas            3469 non-null int64
actual_spread    3469 non-null float64
W                3469 non-null int64
spread           3469 non-null int64
dtypes: float64(10), int64(4), object(3)
memory usage: 460.8+ KB
