In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from fancyimpute import KNN
from sklearn import metrics
import re
import csv

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Location of the per-season odds workbooks.  Keeping the directory in one
# place means relocating the data only requires editing ODDS_DATA_DIR.
ODDS_DATA_DIR = '/Users/sec/galvanize/bracket_buster/odds_data'
ODDS_FILE_TEMPLATE = ODDS_DATA_DIR + '/ncaa_basketball_{}.xlsx'

# Each variable is named after the year the season ends in.
odds2018 = ODDS_FILE_TEMPLATE.format('2017-18')
odds2017 = ODDS_FILE_TEMPLATE.format('2016-17')
odds2016 = ODDS_FILE_TEMPLATE.format('2015-16')
odds2015 = ODDS_FILE_TEMPLATE.format('2014-15')
odds2014 = ODDS_FILE_TEMPLATE.format('2013-14')
# odds2013 = ODDS_FILE_TEMPLATE.format('2012-13')

In [3]:
# Load every season workbook into its own DataFrame, taking the column
# names from the first row of each sheet (header=0).
odds2018_df, odds2017_df, odds2016_df, odds2015_df, odds2014_df = [
    pd.read_excel(season_path, header=0)
    for season_path in (odds2018, odds2017, odds2016, odds2015, odds2014)
]
# odds2013_df = pd.read_excel(odds2013, header=0)

In [4]:
# Inspect column dtypes and non-null counts of the 2018 season frame.
odds2018_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 11 columns):
Date     8154 non-null int64
Rot      8154 non-null int64
VH       8154 non-null object
Team     8154 non-null object
1st      8154 non-null int64
2nd      8154 non-null int64
Final    8154 non-null int64
Open     8085 non-null object
Close    8137 non-null object
ML       7847 non-null object
2H       8100 non-null object
dtypes: int64(5), object(6)
memory usage: 700.8+ KB


## Update Team Names

In [5]:
# teams_df = odds2018_df.Team.value_counts()
# teams_df = pd.DataFrame(teams_df)
# teams_df.to_csv('new_odds_teams.csv')

### Matched up team names in a lookup CSV

In [6]:
# CSV read by odds_teams_dict(); it must provide 'Teams' and 'school' columns.
odds_teams_lookup_filepath = '../odds_teams_lookup.csv'

In [7]:
def odds_teams_dict(filepath):
    '''
    Build a lookup from odds-file team names to formatted school names.

    Reads the CSV at ``filepath`` and maps every value of its 'Teams' column
    to the 'school' value found on the same row.  When a 'Teams' value occurs
    on several rows, the last row wins.
    '''
    lookup = pd.read_csv(filepath)[['Teams', 'school']]
    return dict(zip(lookup['Teams'].tolist(), lookup['school'].tolist()))

In [8]:
def update_team_names(df):
    '''
    Rewrite df['Team'] using the lookup built from odds_teams_lookup_filepath.

    The column is replaced on the frame that was passed in, and that same
    frame is returned.  Names missing from the lookup come back as NaN
    (standard Series.map behaviour with a dict).
    '''
    name_lookup = odds_teams_dict(odds_teams_lookup_filepath)
    df['Team'] = df['Team'].map(name_lookup)
    return df

In [9]:
# Season frames, newest first; the order lines up with the default
# season_list of set_up_odds_data().
odds_dfs = [odds2018_df, odds2017_df, odds2016_df, odds2015_df, odds2014_df]

### Impute NaNs

In [10]:
def string_split(df):
    '''
    Helper for impute_data: separate the string columns from the rest.

    Returns a pair ``(string_df, other_df)`` where ``string_df`` holds only
    the 'VH' and 'Team' columns and ``other_df`` is ``df`` without them.
    '''
    text_columns = ['VH', 'Team']
    return df[text_columns], df.drop(text_columns, axis=1)

In [11]:
def string_to_nan(row):
    '''
    Helper for impute_data: coerce a row to numeric values.

    Entries that cannot be parsed as numbers become NaN.
    '''
    return pd.to_numeric(row, errors='coerce')

In [12]:
def impute_data(df):
    '''
    Fill missing or non-numeric values with fancyimpute's KNN solver (k=3).

    Input: DataFrame holding the string columns 'VH' and 'Team' plus columns
        that are expected to be numeric.
    Output: new DataFrame with the imputed numeric columns first, followed by
        'VH' and 'Team', carrying the same index as the input.
    '''

    # Split out string columns into a separate df so only numbers are imputed
    string_df, numeric_df = string_split(df)

    # Save column names; the imputer returns a bare array
    numeric_cols = numeric_df.columns.tolist()

    # Convert strings to NaNs
    numeric_df = numeric_df.apply(string_to_nan, axis=1)

    # Impute NaNs in the numeric part
    X_filled = KNN(k=3, verbose=False).complete(numeric_df.values)

    # Rebuild the frame on the ORIGINAL index.  Without this the imputed
    # values get a fresh 0..n-1 index and the index-based merge below
    # misaligns (or drops) the string columns for any non-default index.
    imputed_df = pd.DataFrame(X_filled, columns=numeric_cols, index=numeric_df.index)
    return pd.merge(imputed_df, string_df, how='left', left_index=True, right_index=True)

In [13]:
# odds2018_df = impute_data(odds2018_df)

In [14]:
# odds2018_df.iloc[5510: 5520]

## Feature Engineer

In [15]:
def prob(row):
    '''
    Calc probability from ML and store it on the row as 'p'.

    ML < 0:  p = ML / (ML - 100)
    ML > 0:  p = 100 / (ML + 100)
    Otherwise (ML is 0 or NaN) 'p' is set to NaN so that every returned row
    has a 'p' entry and downstream code never hits a missing key.

    Returns the same row with 'p' added.
    '''
    ml = row['ML']
    if ml < 0:
        # int() truncates any fractional part, matching the previous behaviour
        row['p'] = int(ml) / int(ml - 100)
    elif ml > 0:
        row['p'] = 100 / int(ml + 100)
    else:
        # Comparisons with NaN are False, so NaN moneylines land here too
        row['p'] = float('nan')
    return row

In [16]:
def spread(row):
    '''
    Derive an integer 'spread' from the row's probability 'p'.

    p <= .5 uses  25 * p - 12; larger p uses  -25 * p + 13.  The result is
    truncated toward zero with int() and stored on the row, which is returned.

    NOTE(review): for p in [0, 1] both branches yield a value <= 0, so
    underdogs also receive a non-positive spread - confirm this is intended.
    '''
    win_prob = row['p']
    slope, intercept = (25, -12) if win_prob <= .5 else (-25, 13)
    row['spread'] = int(slope * win_prob + intercept)
    return row

In [17]:
def outcome(row):
    '''
    Add the vegas pick, the actual margin and the actual result to a row.

    vegas          1 when ML is negative, else 0
    actual_spread  Final minus Final_v
    W              1 when actual_spread is positive, else 0
    '''
    row['vegas'] = 1 if row['ML'] < 0 else 0

    margin = row['Final'] - row['Final_v']
    row['actual_spread'] = margin
    row['W'] = 1 if margin > 0 else 0

    return row

In [18]:
def date(row):
    '''
    Updates date format to prepare for unique ID generation.

    row['Date'] arrives as a number in MMDD form (e.g. 1110 for Nov 10, 101
    for Jan 1).  Adds integer 'month' and 'day' entries and replaces 'Date'
    with the string '<Season>-<month>-<day>' (no zero padding).

    Splitting with divmod instead of slicing the decimal string keeps
    three-digit dates correct: 101 is month 1 / day 1, whereas the first two
    characters of '101' would have given month 10.
    '''
    month, day = divmod(int(row['Date']), 100)
    row['Date'] = '{}-{}-{}'.format(str(row['Season']), str(month), str(day))
    row['month'] = month
    row['day'] = day
    return row

In [19]:
def matchups(df, season):
    '''
    Collapse the one-row-per-team odds frame into one row per game.

    The i-th row marked 'H' is paired with the i-th row marked 'V' (in file
    order); visitor columns get a '_v' suffix.  Rows whose VH value is neither
    'V' nor 'H' do not appear in the result.  The paired frame is tagged with
    ``season`` and then run through outcome(), spread() and date().
    '''
    # Period scores are not needed
    games = df.drop(['1st', '2H', '2nd'], axis=1)

    # Probability of winning, derived from ML
    games = games.apply(prob, axis=1)

    # Numeric VH flag, then a running counter within each flag value;
    # the counter is the key that pairs a home row with its visitor row
    games['VHohe'] = games['VH'].map({'V': 1, 'H': 0})
    games['count'] = games.groupby('VHohe').cumcount() + 1

    home = games[games['VH'] == 'H']
    visitors = games[games['VH'] == 'V'].rename(
        columns=lambda col: col if col == 'count' else '{}_v'.format(col)
    )

    # One row per game: home columns followed by '_v' visitor columns
    paired = pd.merge(home, visitors, how='left', on='count')

    # Columns that are not needed after pairing
    unused = ['Rot', 'VH', 'VH_v', 'Date_v', 'Rot_v',
              'Open', 'Close', 'Open_v', 'Close_v']
    paired = paired.drop(unused, axis=1)

    paired['Season'] = season

    # Row-wise feature steps, in the order they depend on each other
    for step in (outcome, spread, date):
        paired = paired.apply(step, axis=1)

    return paired

In [20]:
# odds2018_df = impute_data(odds2018_df)
# odds2018_df = matchups(odds2018_df, 2018)
# odds2018_df = odds2018_df.apply(outcome, axis=1)
# odds2017_df = odds2017_df.apply(prob, axis=1)
# odds2016_df = odds2016_df.apply(prob, axis=1)
# odds2015_df = odds2015_df.apply(prob, axis=1)
# odds2014_df = odds2014_df.apply(prob, axis=1)
# odds2013_df = odds2013_df.apply(prob, axis=1)

In [21]:
# Preview the first rows of the 2018 season frame.
odds2018_df.head()

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,Final,Open,Close,ML,2H
0,1110,517,V,TexasA&M,45,43,88,141.0,143.0,260,77.5
1,1110,518,H,WestVirginia,38,27,65,6.0,7.0,-330,5.5
2,1110,519,V,Elon,25,43,68,155.5,156.5,1875,79.0
3,1110,520,H,Duke,45,52,97,18.5,19.5,-3750,7.0
4,1110,521,V,Delaware,49,27,76,143.5,140.5,400,74.0


In [22]:
# odds2018_df.Team_v.value_counts()

In [23]:
# odds2018_df.head()

In [24]:
# odds2018_df = odds2018_df.apply(outcome, axis=1)

In [25]:
# odds2018_df.head()

In [26]:
# actual = odds2018_df.W
# vegas = odds2018_df.vegas

In [27]:
# metrics.accuracy_score(actual, vegas)

# Update all dfs, then combine them

In [28]:
# odds2018_df.head()

In [29]:
def odds_merge_id(row):
    '''
    Add an 'ID' entry of the form '<team>,<team>,<Date>' to the row.

    The two team names ('Team' and 'Team_v') are sorted first, so the ID does
    not depend on which of them is the home side.
    '''
    first_team, second_team = sorted([row['Team'], row['Team_v']])
    matchup = ",".join([first_team, second_team])
    row['ID'] = '{},{}'.format(matchup, row['Date'])
    return row

In [30]:
# odds2018_df = odds2018_df.apply(odds_merge_id, axis=1)

In [31]:
# odds2018_df.head()

In [32]:
def set_up_odds_data(df_list, season_list=(2018, 2017, 2016, 2015, 2014)):
    '''
    Run every season frame through the full preparation pipeline and stack
    the results into a single DataFrame.

    df_list      raw season frames
    season_list  season label for each frame, in the same order as df_list;
                 pairs are formed with zip(), so surplus entries on either
                 side are ignored

    Returns the concatenated frame with a fresh 0..n-1 index, or an empty
    DataFrame when there is nothing to process.
    '''
    season_frames = []
    for df, season in zip(df_list, season_list):
        df = impute_data(df)
        # matchups() already applies outcome(), so it is not repeated here
        df = matchups(df, season)
        df = df.apply(odds_merge_id, axis=1)
        season_frames.append(df)

    # pd.concat raises on an empty list; keep returning an empty frame instead
    if not season_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame with DataFrame.append,
    # which copies on every iteration and no longer exists in pandas 2.x
    return pd.concat(season_frames, ignore_index=True)

In [33]:
# Prepare every season frame and stack them into one combined DataFrame.
odds_df = set_up_odds_data(odds_dfs)

In [34]:
# Row count per season in the combined frame.
odds_df.Season.value_counts()

2015    3487
2016    3482
2018    3469
2017    3455
2014    3369
Name: Season, dtype: int64

In [35]:
# Preview the first rows of the combined frame.
odds_df.head()

Unnamed: 0,Date,Final,ML,Team,p,VHohe,count,Final_v,ML_v,Team_v,p_v,VHohe_v,Season,vegas,actual_spread,W,spread,month,day
0,2018-11-10,65.0,-330.0,WestVirginia,0.767442,0.0,1,88.0,260.0,TexasA&M,0.277778,1.0,2018,1,-23.0,0,-6,11,10
1,2018-11-10,97.0,-3750.0,Duke,0.974026,0.0,2,68.0,1875.0,Elon,0.050633,1.0,2018,1,29.0,1,-11,11,10
2,2018-11-10,63.0,-550.0,Richmond,0.846154,0.0,3,76.0,400.0,Delaware,0.2,1.0,2018,1,-13.0,0,-8,11,10
3,2018-11-10,57.0,-250.0,OldDominion,0.714286,0.0,4,54.0,200.0,Towson,0.333333,1.0,2018,1,3.0,1,-4,11,10
4,2018-11-10,84.0,-950.0,LoyolaChicago,0.904762,0.0,5,80.0,625.0,WrightState,0.137931,1.0,2018,1,4.0,1,-9,11,10


In [37]:
# NOTE(review): set_up_odds_data() already applies odds_merge_id to each
# season frame, so this pass recomputes the same 'ID' values.
odds_df = odds_df.apply(odds_merge_id, axis=1)

In [1]:
# NOTE(review): needs odds_df from the cells above.  The recorded NameError
# carries execution count 1, i.e. presumably a freshly restarted kernel in
# which the earlier cells had not been re-run.
odds_df.ID.head()

NameError: name 'odds_df' is not defined