# Goals

- Analyze the different ways to take a set of features and come up with a spread for the most effective method.

In [2]:
import CFBScrapy as cfb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import altair as alt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

pd.set_option('max_colwidth', 40)
pd.set_option('display.max_colwidth', 999)
pd.set_option('display.max_columns', 500)
pd.options.mode.chained_assignment = None
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [63]:
import time

betting_df = pd.DataFrame()

for year in range(2010, 2020):
    run_again = True
    while run_again == True:
        try:
            temp_df = cfb.get_betting_lines(year)
            betting_df = pd.concat([temp_df, betting_df])
            run_again = False
            time.sleep(5)
        except:
            print(f'{year} is running again')
            time.sleep(5)
    
betting_df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  betting_df = pd.concat([temp_df, betting_df])


Unnamed: 0,awayScore,awayTeam,formattedSpread,homeTeam,id,overUnder,provider,spread
0,10.0,Montana State,Texas Tech -28,Texas Tech,401112139,60.0,consensus,-28.0
1,28.0,Virginia Tech,Virginia Tech -4.5,Boston College,401112435,57.0,Caesars,4.5
2,28.0,Virginia Tech,Virginia Tech -4.5,Boston College,401112435,57.5,consensus,4.5
3,28.0,Virginia Tech,Virginia Tech -4.5,Boston College,401112435,58.0,numberfire,4.5
4,28.0,Virginia Tech,Virginia Tech -4.5,Boston College,401112435,56.5,teamrankings,4.5


In [64]:
betting_df = betting_df.loc[(betting_df['provider'] == 'consensus') & (betting_df['spread'].notna()), ['id','homeTeam','awayTeam','provider','spread']]
betting_df['expected_mov'] = betting_df['spread'].astype(float)*-1
betting_df.head()

Unnamed: 0,id,homeTeam,awayTeam,provider,spread,expected_mov
0,401112139,Texas Tech,Montana State,consensus,-28.0,28.0
2,401112435,Boston College,Virginia Tech,consensus,4.5,-4.5
5,401117495,New Mexico,Sam Houston State,consensus,-6.5,6.5
7,401110774,Auburn,Tulane,consensus,-16.5,16.5
10,401110775,Florida,UT Martin,consensus,-44.5,44.5


In [5]:
final_feature_df = pd.read_csv('data/final_feature_df.csv', index_col=0)
final_feature_df.head()

Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov
0,401110723,Florida,Miami,24.0,20.0,1,0.449245,0.479037,7.473268,5.862458,0.328755,0.400157,5.471815,5.396511,2.304169,1.580285,0.359037,0.412144,5.978995,6.058389,0.362297,0.319819,5.643894,4.196408,1.916273,1.267915,4.0
1,401114164,Hawai'i,Arizona,45.0,38.0,1,0.450221,0.414831,7.405083,4.956263,0.425115,0.424116,6.642327,5.498319,2.313986,2.437262,0.428806,0.412259,8.103085,5.857999,0.38835,0.400334,6.709464,4.743486,2.060399,1.824204,7.0
2,401117854,Cincinnati,UCLA,24.0,14.0,1,0.463586,0.454475,7.550123,5.747169,0.339119,0.317832,5.720966,4.095549,2.39657,1.347374,0.435692,0.421305,6.461063,4.727073,0.485208,0.454091,8.802197,5.265831,1.829067,2.52398,10.0
3,401111653,Clemson,Georgia Tech,52.0,14.0,1,0.475367,0.498426,7.007454,7.129284,0.257366,0.315316,3.600908,3.261917,2.747362,0.677888,0.406822,0.508947,10.922148,6.170982,0.47329,0.489401,7.405585,5.078938,2.633646,2.261018,38.0
4,401114236,Tulane,Florida International,42.0,14.0,1,0.334242,0.359626,6.580606,4.904085,0.368003,0.388728,7.074442,4.873563,1.531288,1.96482,0.465469,0.340401,8.517822,5.233876,0.40399,0.461811,6.554684,5.548609,2.553723,1.895304,28.0


In [65]:
betting_df.rename(columns={'id':'game_id'}, inplace=True)
betting_df = betting_df.merge(final_feature_df[['game_id','home_mov']])
betting_df.head()

Unnamed: 0,game_id,homeTeam,awayTeam,provider,spread,expected_mov,home_mov
0,401112435,Boston College,Virginia Tech,consensus,4.5,-4.5,7.0
1,401110774,Auburn,Tulane,consensus,-16.5,16.5,18.0
2,401110775,Florida,UT Martin,consensus,-44.5,44.5,45.0
3,401117858,Arkansas State,SMU,consensus,-1.5,1.5,-7.0
4,401112106,Kansas State,Nicholls,consensus,-20.5,20.5,35.0


In [6]:
x_cols = ['off_pass_success_rate_home', 'off_rush_success_rate_home',
     'off_pass_ypp_home', 'off_rush_ypp_home',
     'pass_success_rate_allowed_home', 'rush_success_rate_allowed_home',
     'pass_ypp_allowed_home', 'rush_ypp_allowed_home', 'av_ppd_home',
     'av_ppd_allowed_home', 'off_pass_success_rate_away',
     'off_rush_success_rate_away', 'off_pass_ypp_away', 'off_rush_ypp_away',
     'pass_success_rate_allowed_away', 'rush_success_rate_allowed_away',
     'pass_ypp_allowed_away', 'rush_ypp_allowed_away', 'av_ppd_away',
     'av_ppd_allowed_away']
y_cols = ['home_mov']

le = LabelEncoder()


x = final_feature_df.loc[:, x_cols]
y = final_feature_df.loc[:, y_cols]
y = le.fit_transform(y)
final_feature_df['encoded_mov'] = y
final_feature_df.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov
0,401110723,Florida,Miami,24.0,20.0,1,0.449245,0.479037,7.473268,5.862458,0.328755,0.400157,5.471815,5.396511,2.304169,1.580285,0.359037,0.412144,5.978995,6.058389,0.362297,0.319819,5.643894,4.196408,1.916273,1.267915,4.0,64
1,401114164,Hawai'i,Arizona,45.0,38.0,1,0.450221,0.414831,7.405083,4.956263,0.425115,0.424116,6.642327,5.498319,2.313986,2.437262,0.428806,0.412259,8.103085,5.857999,0.38835,0.400334,6.709464,4.743486,2.060399,1.824204,7.0,67
2,401117854,Cincinnati,UCLA,24.0,14.0,1,0.463586,0.454475,7.550123,5.747169,0.339119,0.317832,5.720966,4.095549,2.39657,1.347374,0.435692,0.421305,6.461063,4.727073,0.485208,0.454091,8.802197,5.265831,1.829067,2.52398,10.0,70
3,401111653,Clemson,Georgia Tech,52.0,14.0,1,0.475367,0.498426,7.007454,7.129284,0.257366,0.315316,3.600908,3.261917,2.747362,0.677888,0.406822,0.508947,10.922148,6.170982,0.47329,0.489401,7.405585,5.078938,2.633646,2.261018,38.0,98
4,401114236,Tulane,Florida International,42.0,14.0,1,0.334242,0.359626,6.580606,4.904085,0.368003,0.388728,7.074442,4.873563,1.531288,1.96482,0.465469,0.340401,8.517822,5.233876,0.40399,0.461811,6.554684,5.548609,2.553723,1.895304,28.0,88


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=420)

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)


I tested these with a number of rounds, but this is the one that minimized eval and training log loss

In [8]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'multi:softmax', 'eval_metric':'mlogloss', 'num_class':135}

evallist = [(dtest, 'eval'), (dtrain, 'train')]

numround=2

softmax_model = xgb.train(param, dtrain, numround, evallist)

[0]	eval-mlogloss:4.67592	train-mlogloss:4.23321
[1]	eval-mlogloss:4.58345	train-mlogloss:3.84268


In [9]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'multi:softprob', 'eval_metric':'mlogloss', 'num_class':135}

evallist = [(dtest, 'eval'), (dtrain, 'train')]

numround=2

softprob_model = xgb.train(param, dtrain, numround, evallist)

[0]	eval-mlogloss:4.67592	train-mlogloss:4.23321
[1]	eval-mlogloss:4.58345	train-mlogloss:3.84268


In [10]:
dreal = xgb.DMatrix(x, label=y)

# Softmax Predictions
- USe softmax to evaluate the most likely outcome, treat that as the spread

In [122]:
ref_mov = final_feature_df[['home_mov','encoded_mov']].drop_duplicates()
ref_mov.head()

Unnamed: 0,home_mov,encoded_mov
0,4.0,64
1,7.0,67
2,10.0,70
3,38.0,98
4,28.0,88


In [123]:
softmax_df = final_feature_df.copy()
softmax_pred = pd.DataFrame(softmax_model.predict(dreal))
softmax_pred.columns=['encoded_predicted_mov']
softmax_pred = softmax_pred.merge(ref_mov, left_on=['encoded_predicted_mov'], right_on=['encoded_mov'])
softmax_df['predicted_mov'] = softmax_pred['home_mov']
softmax_df.head()

Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov,predicted_mov
0,401110723,Florida,Miami,24.0,20.0,1,0.449245,0.479037,7.473268,5.862458,0.328755,0.400157,5.471815,5.396511,2.304169,1.580285,0.359037,0.412144,5.978995,6.058389,0.362297,0.319819,5.643894,4.196408,1.916273,1.267915,4.0,64,3.0
1,401114164,Hawai'i,Arizona,45.0,38.0,1,0.450221,0.414831,7.405083,4.956263,0.425115,0.424116,6.642327,5.498319,2.313986,2.437262,0.428806,0.412259,8.103085,5.857999,0.38835,0.400334,6.709464,4.743486,2.060399,1.824204,7.0,67,3.0
2,401117854,Cincinnati,UCLA,24.0,14.0,1,0.463586,0.454475,7.550123,5.747169,0.339119,0.317832,5.720966,4.095549,2.39657,1.347374,0.435692,0.421305,6.461063,4.727073,0.485208,0.454091,8.802197,5.265831,1.829067,2.52398,10.0,70,3.0
3,401111653,Clemson,Georgia Tech,52.0,14.0,1,0.475367,0.498426,7.007454,7.129284,0.257366,0.315316,3.600908,3.261917,2.747362,0.677888,0.406822,0.508947,10.922148,6.170982,0.47329,0.489401,7.405585,5.078938,2.633646,2.261018,38.0,98,3.0
4,401114236,Tulane,Florida International,42.0,14.0,1,0.334242,0.359626,6.580606,4.904085,0.368003,0.388728,7.074442,4.873563,1.531288,1.96482,0.465469,0.340401,8.517822,5.233876,0.40399,0.461811,6.554684,5.548609,2.553723,1.895304,28.0,88,3.0


# SoftProb Predictions
- Determine the probability of every outcome, treat the point where 50% of outcomes are below and above as the "spread"

In [12]:
softprob_probs = pd.DataFrame(softprob_model.predict(dreal))
softprob_probs['game_id'] = final_feature_df['game_id']
softprob_probs.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,game_id
0,0.001706,0.001706,0.002994,0.001708,0.001956,0.001707,0.003197,0.001713,0.001707,0.001809,0.00171,0.001708,0.001735,0.001735,0.001713,0.001804,0.002377,0.001762,0.001707,0.003254,0.003236,0.001736,0.001797,0.001867,0.001786,0.003919,0.002719,0.001751,0.003751,0.017576,0.002233,0.003851,0.002024,0.004268,0.002021,0.002,0.002956,0.006201,0.003233,0.023857,0.003631,0.003986,0.002347,0.005679,0.003849,0.013576,0.002512,0.015789,0.003917,0.00294,0.009427,0.015005,0.002574,0.008217,0.00771,0.010311,0.010709,0.007325,0.010784,0.014891,0.025908,0.010175,0.003328,0.222143,0.021915,0.025074,0.012656,0.034525,0.008586,0.003236,0.023648,0.011046,0.004656,0.005006,0.014873,0.006791,0.006786,0.01464,0.010285,0.005516,0.005696,0.023484,0.005858,0.003253,0.011899,0.006764,0.002321,0.004772,0.005206,0.01255,0.005845,0.005146,0.005458,0.007919,0.005897,0.027847,0.001882,0.002436,0.006591,0.003672,0.001831,0.002301,0.015147,0.002024,0.001886,0.002162,0.003259,0.001964,0.004046,0.005302,0.001738,0.002981,0.003867,0.002326,0.001708,0.002004,0.001769,0.001709,0.001707,0.001709,0.001707,0.001707,0.001708,0.001707,0.001707,0.001707,0.001709,0.001707,0.001707,0.001707,0.00171,0.001706,0.001707,0.001707,0.001706,401110723
1,0.001615,0.001615,0.001617,0.001618,0.001853,0.001616,0.001614,0.010696,0.001616,0.001713,0.00162,0.001618,0.001643,0.001643,0.001849,0.001709,0.002251,0.001669,0.001616,0.003082,0.001734,0.001645,0.001702,0.003092,0.002407,0.001856,0.004641,0.001658,0.001655,0.003135,0.002115,0.007247,0.009002,0.015296,0.001914,0.001894,0.006145,0.005873,0.009252,0.00422,0.008428,0.009516,0.01231,0.012003,0.02223,0.016494,0.004842,0.014955,0.00371,0.002057,0.00437,0.005323,0.002438,0.018305,0.154342,0.010855,0.009735,0.017664,0.02472,0.014105,0.007008,0.009638,0.013056,0.047929,0.020758,0.007041,0.011988,0.114148,0.004005,0.003065,0.022399,0.007901,0.002406,0.004641,0.007642,0.005879,0.006428,0.003793,0.009742,0.005225,0.003939,0.02334,0.005548,0.014979,0.004915,0.0078,0.002198,0.002338,0.004995,0.007058,0.002337,0.004874,0.005169,0.002723,0.005586,0.013089,0.001783,0.002307,0.003693,0.002375,0.001735,0.00218,0.002463,0.001917,0.001787,0.002048,0.001821,0.001861,0.002083,0.002231,0.001646,0.001669,0.001826,0.001656,0.001618,0.001898,0.005552,0.001618,0.001617,0.001619,0.001616,0.001616,0.001618,0.001617,0.001616,0.001616,0.001619,0.001616,0.001616,0.001616,0.00162,0.001615,0.001616,0.001616,0.001615,401114164
2,0.001046,0.001046,0.001048,0.001048,0.001049,0.001047,0.001046,0.001051,0.001047,0.00111,0.001049,0.001048,0.001064,0.001064,0.001051,0.001107,0.001225,0.001081,0.001047,0.001304,0.001123,0.001065,0.001102,0.001145,0.001095,0.001202,0.001668,0.001074,0.001072,0.001165,0.00137,0.002362,0.001241,0.002618,0.001301,0.001227,0.001813,0.003804,0.00122,0.001253,0.001895,0.001517,0.00144,0.003484,0.002361,0.001606,0.001352,0.003836,0.002403,0.001332,0.002831,0.003448,0.001579,0.00129,0.00473,0.006067,0.003095,0.015094,0.01601,0.001487,0.004539,0.003307,0.002042,0.031042,0.013444,0.005214,0.003307,0.021179,0.019293,0.001985,0.022425,0.029606,0.019597,0.004923,0.042929,0.003808,0.003249,0.01605,0.006006,0.012103,0.019158,0.056036,0.006616,0.001995,0.01861,0.095156,0.001424,0.006073,0.044444,0.007699,0.003585,0.003257,0.003348,0.001324,0.003618,0.006868,0.001155,0.001494,0.004043,0.00315,0.001123,0.011075,0.041197,0.015652,0.003407,0.11789,0.025502,0.008829,0.005631,0.019846,0.016586,0.001081,0.017517,0.004967,0.001048,0.00123,0.005112,0.001048,0.001047,0.001048,0.001047,0.001047,0.001048,0.001047,0.001047,0.001047,0.001049,0.001047,0.001047,0.001047,0.001049,0.001046,0.001047,0.001047,0.001046,401117854
3,0.000958,0.000958,0.000959,0.011269,0.001099,0.000958,0.004713,0.000962,0.001955,0.001347,0.00201,0.001181,0.000974,0.001739,0.000962,0.002017,0.001335,0.002013,0.000958,0.00149,0.001447,0.000975,0.001009,0.001833,0.001607,0.0011,0.001941,0.000983,0.000981,0.002187,0.001254,0.002162,0.001104,0.002397,0.002099,0.003212,0.003643,0.003482,0.001117,0.001147,0.001735,0.001817,0.001318,0.003189,0.007024,0.001155,0.001237,0.003511,0.002199,0.001337,0.010046,0.003156,0.001445,0.002568,0.00756,0.003003,0.015373,0.013816,0.014655,0.00423,0.019406,0.013424,0.003843,0.028414,0.013428,0.013356,0.003027,0.045055,0.017659,0.001817,0.020527,0.011076,0.017938,0.004506,0.006504,0.008159,0.004835,0.003156,0.005498,0.011078,0.017536,0.051292,0.021047,0.001841,0.007429,0.087101,0.001303,0.005559,0.008158,0.007047,0.006289,0.050841,0.008026,0.007002,0.003311,0.029131,0.001057,0.001368,0.093272,0.004222,0.001028,0.010138,0.008505,0.001816,0.026626,0.00164,0.015671,0.001103,0.009483,0.018166,0.001433,0.002013,0.016034,0.00678,0.002009,0.001125,0.004679,0.000959,0.000958,0.006045,0.002007,0.000958,0.000959,0.000959,0.000958,0.000958,0.006832,0.001996,0.000958,0.000958,0.00096,0.000958,0.000958,0.000958,0.000958,401111653
4,0.001352,0.001352,0.001353,0.001353,0.001355,0.001352,0.001351,0.001358,0.001352,0.001434,0.001551,0.001354,0.001754,0.001656,0.001358,0.00143,0.001583,0.001396,0.001352,0.001684,0.00145,0.001376,0.001424,0.002494,0.002014,0.001553,0.002739,0.007673,0.001385,0.001504,0.0066,0.001593,0.01072,0.012798,0.004916,0.004534,0.005142,0.004914,0.002414,0.001619,0.005587,0.007962,0.002633,0.010043,0.007689,0.013069,0.001746,0.004956,0.003104,0.001721,0.003657,0.057666,0.008649,0.013802,0.129136,0.009082,0.008145,0.0097,0.059075,0.00343,0.022533,0.052542,0.010924,0.040102,0.017368,0.003571,0.05406,0.01288,0.003351,0.00661,0.018741,0.00661,0.004322,0.003883,0.038628,0.004919,0.008206,0.005671,0.017351,0.002242,0.008428,0.019528,0.002384,0.004545,0.010484,0.002219,0.008832,0.007846,0.00418,0.005905,0.002472,0.004078,0.002311,0.001711,0.002089,0.007276,0.001492,0.00193,0.005223,0.001987,0.001451,0.001824,0.012107,0.001604,0.001495,0.003907,0.001523,0.001557,0.001743,0.001867,0.001378,0.001397,0.011284,0.001385,0.001354,0.01604,0.005663,0.001354,0.001353,0.001354,0.001352,0.001352,0.001354,0.001353,0.001352,0.001352,0.001355,0.001352,0.001352,0.001352,0.001547,0.001352,0.001352,0.001352,0.001352,401114236


In [13]:
#pivot to a series
#find the cumulative sum of the probabilities
#find the point where it switches from < 50 to > 50

columns=[i for i in range(0,135)]
probs_melted = softprob_probs.melt(id_vars=['game_id'], value_vars=columns).sort_values(by=['game_id', 'variable'])
probs_melted['cumsum_prob'] = probs_melted.groupby(['game_id']).cumsum()
probs_melted.head()


Unnamed: 0,game_id,variable,value,cumsum_prob
6367,303370193,0,0.001395,0.001395
12738,303370193,1,0.001395,0.002791
19109,303370193,2,0.001397,0.004188
25480,303370193,3,0.001397,0.005585
31851,303370193,4,0.005697,0.011283


Going to have to think of a good way to handle situations like this. 

In [15]:
probs_melted.rename(columns={'variable':'encoded_mov', 'value':'probability'}, inplace=True)
probs_decoded = probs_melted.merge(ref_mov).sort_values(by=['game_id','encoded_mov'])
probs_decoded.head()

Unnamed: 0,game_id,encoded_mov,probability,cumsum_prob,home_mov
0,303370193,0,0.001395,0.001395,-78.0
6371,303370193,1,0.001395,0.002791,-63.0
12742,303370193,2,0.001397,0.004188,-61.0
19113,303370193,3,0.001397,0.005585,-60.0
25484,303370193,4,0.005697,0.011283,-59.0


In [35]:
below_50 = probs_decoded.loc[probs_decoded['cumsum_prob'] < .5].sort_values(by=['cumsum_prob'], ascending=False).groupby(['game_id']).head(1)
above_50 = probs_decoded.loc[probs_decoded['cumsum_prob'] >= .5].sort_values(by=['cumsum_prob']).groupby(['game_id']).head(1)
above_50.head()

Unnamed: 0,game_id,encoded_mov,probability,cumsum_prob,home_mov
346853,400756936,54,0.097696,0.500002,-7.0
421164,322450333,66,0.003893,0.500003,6.0
341244,400869134,53,0.005091,0.500004,-8.0
445560,401114153,69,0.008057,0.500015,9.0
513711,400869685,80,0.003061,0.500016,20.0


In [39]:
closest_margins = below_50.append(above_50)
closest_margins.sort_values(by=['game_id']).head()

Unnamed: 0,game_id,encoded_mov,probability,cumsum_prob,home_mov
369518,303370193,58,0.072768,0.506904,-3.0
363147,303370193,57,0.015258,0.434136,-4.0
465084,303380158,73,0.004099,0.37766,13.0
471455,303380158,74,0.380109,0.75777,14.0
420488,303380259,66,0.026665,0.496187,6.0


In [41]:
closest_margins['diff_from_50'] = abs(.50 - closest_margins['cumsum_prob'])
pred_prob_spread = closest_margins.sort_values(by=['game_id', 'diff_from_50']).groupby(['game_id']).head(1)
pred_prob_spread.head()

Unnamed: 0,game_id,encoded_mov,probability,cumsum_prob,home_mov,diff_from_50
369518,303370193,58,0.072768,0.506904,-3.0,0.006904
465084,303380158,73,0.004099,0.37766,13.0,0.12234
420488,303380259,66,0.026665,0.496187,6.0,0.003813
414118,303382579,65,0.009416,0.50463,5.0,0.00463
592507,312440275,93,0.002899,0.492117,33.0,0.007883


In [43]:
closest_margins.loc[closest_margins['game_id'] == 303380158]

Unnamed: 0,game_id,encoded_mov,probability,cumsum_prob,home_mov,diff_from_50
465084,303380158,73,0.004099,0.37766,13.0,0.12234
471455,303380158,74,0.380109,0.75777,14.0,0.25777


So the above shows what could potentially be a flaw in my methodology.  
The score is far more likely to fall on 14, but technically its more likely to be under 14 based on my model, 14 is also a key number which gives me significant pause  
Not sure which one is a better way to go about it, so for now I'm going to go with my original methodology with that in mind

In [47]:
pred_prob_spread.rename(columns={'home_mov':'pred_home_mov'}, inplace=True)
softprob_df = final_feature_df.merge(pred_prob_spread[['game_id','pred_home_mov']])
softprob_df[['home_mov','pred_home_mov']].sample(10)

Unnamed: 0,home_mov,pred_home_mov
6339,14.0,3.0
6240,3.0,12.0
5488,22.0,-1.0
1647,3.0,6.0
1559,7.0,3.0
1067,-6.0,6.0
2573,7.0,13.0
4935,3.0,2.0
3269,-7.0,-7.0
1176,7.0,7.0


# Wizard Of Odds
- Use the table here: https://wizardofodds.com/games/sports-betting/nfl/

If you want to estimate the probability of winning of an underdog of more than 14.5 points, the formula is e^(-0.14324*s)/(1+e^(-0.14324*s)), where s is the point spread.  


In [81]:
spread_map = pd.read_csv('data/WOO_Calculator.csv')
spread_map.head()

Unnamed: 0,SPREAD,GAMES,WINS,ACTUAL PROBABILITY,ESTIMATED PROBABILITY,Fair Line
0,1.0,139,69,49.60%,46.40%,115
1,1.5,88,43,48.90%,44.60%,124
2,2.0,126,51,40.50%,42.90%,133
3,2.5,224,98,43.80%,41.10%,143
4,3.0,517,235,45.50%,39.40%,154


In [84]:
spread_map['probability'] = spread_map['ESTIMATED PROBABILITY'].str.replace('%','').astype(float) / 100
spread_map.head()

Unnamed: 0,SPREAD,GAMES,WINS,ACTUAL PROBABILITY,ESTIMATED PROBABILITY,Fair Line,probability
0,1.0,139,69,49.60%,46.40%,115,0.464
1,1.5,88,43,48.90%,44.60%,124,0.446
2,2.0,126,51,40.50%,42.90%,133,0.429
3,2.5,224,98,43.80%,41.10%,143,0.411
4,3.0,517,235,45.50%,39.40%,154,0.394


In [85]:
y_cols = ['home_win']

x = final_feature_df.loc[:, x_cols]
y = final_feature_df.loc[:, y_cols]
y.sample(10)

Unnamed: 0,home_win
5218,0
4670,1
3086,1
5691,1
5579,0
4938,1
3824,0
3733,1
1608,0
390,1


In [86]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=420)

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [95]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic', 'eval_metric':'logloss'}

evallist = [(dtest, 'eval'), (dtrain, 'train')]

numround=10

binary_model = xgb.train(param, dtrain, numround, evallist)

[0]	eval-logloss:0.63962	train-logloss:0.63411
[1]	eval-logloss:0.60317	train-logloss:0.60073
[2]	eval-logloss:0.59002	train-logloss:0.58267
[3]	eval-logloss:0.58643	train-logloss:0.56903
[4]	eval-logloss:0.58482	train-logloss:0.56153
[5]	eval-logloss:0.58702	train-logloss:0.55490
[6]	eval-logloss:0.58492	train-logloss:0.54896
[7]	eval-logloss:0.58296	train-logloss:0.54271
[8]	eval-logloss:0.58506	train-logloss:0.53829
[9]	eval-logloss:0.58046	train-logloss:0.53351


In [96]:
dreal = xgb.DMatrix(x, label=y)

In [97]:
binary_df = final_feature_df.copy()
binary_df['home_win_prob'] = binary_model.predict(dreal)
binary_df.head()

Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov,home_win_prob
0,401110723,Florida,Miami,24.0,20.0,1,0.449245,0.479037,7.473268,5.862458,0.328755,0.400157,5.471815,5.396511,2.304169,1.580285,0.359037,0.412144,5.978995,6.058389,0.362297,0.319819,5.643894,4.196408,1.916273,1.267915,4.0,64,0.655474
1,401114164,Hawai'i,Arizona,45.0,38.0,1,0.450221,0.414831,7.405083,4.956263,0.425115,0.424116,6.642327,5.498319,2.313986,2.437262,0.428806,0.412259,8.103085,5.857999,0.38835,0.400334,6.709464,4.743486,2.060399,1.824204,7.0,67,0.710105
2,401117854,Cincinnati,UCLA,24.0,14.0,1,0.463586,0.454475,7.550123,5.747169,0.339119,0.317832,5.720966,4.095549,2.39657,1.347374,0.435692,0.421305,6.461063,4.727073,0.485208,0.454091,8.802197,5.265831,1.829067,2.52398,10.0,70,0.950693
3,401111653,Clemson,Georgia Tech,52.0,14.0,1,0.475367,0.498426,7.007454,7.129284,0.257366,0.315316,3.600908,3.261917,2.747362,0.677888,0.406822,0.508947,10.922148,6.170982,0.47329,0.489401,7.405585,5.078938,2.633646,2.261018,38.0,98,0.963734
4,401114236,Tulane,Florida International,42.0,14.0,1,0.334242,0.359626,6.580606,4.904085,0.368003,0.388728,7.074442,4.873563,1.531288,1.96482,0.465469,0.340401,8.517822,5.233876,0.40399,0.461811,6.554684,5.548609,2.553723,1.895304,28.0,88,0.401766


In [104]:
from merge_closest import merge_closest

matched_df = merge_closest(binary_df,
                              spread_map,
                              'home_win_prob',
                              'probability',
                              ['SPREAD'])
matched_df[['home_win_prob','SPREAD']].sample(10)

Unnamed: 0,home_win_prob,SPREAD
2580,0.579295,1.0
1196,0.217872,9.0
1284,0.700068,1.0
5712,0.135129,13.0
4827,0.247211,8.0
5290,0.501178,1.0
2389,0.228211,8.5
1108,0.900963,1.0
1480,0.794883,1.0
3065,0.832755,1.0


Still not working right because it doesn't go above 50%. Have to convert home win probs and away win probs seperately I guess

In [106]:
matched_df['away_win_prob'] = 1 - matched_df['home_win_prob']
matched_df.head()

Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov,home_win_prob,SPREAD,away_win_prob
0,401110723,Florida,Miami,24.0,20.0,1,0.449245,0.479037,7.473268,5.862458,0.328755,0.400157,5.471815,5.396511,2.304169,1.580285,0.359037,0.412144,5.978995,6.058389,0.362297,0.319819,5.643894,4.196408,1.916273,1.267915,4.0,64,0.655474,1.0,0.344526
1,401114164,Hawai'i,Arizona,45.0,38.0,1,0.450221,0.414831,7.405083,4.956263,0.425115,0.424116,6.642327,5.498319,2.313986,2.437262,0.428806,0.412259,8.103085,5.857999,0.38835,0.400334,6.709464,4.743486,2.060399,1.824204,7.0,67,0.710105,1.0,0.289895
2,401117854,Cincinnati,UCLA,24.0,14.0,1,0.463586,0.454475,7.550123,5.747169,0.339119,0.317832,5.720966,4.095549,2.39657,1.347374,0.435692,0.421305,6.461063,4.727073,0.485208,0.454091,8.802197,5.265831,1.829067,2.52398,10.0,70,0.950693,1.0,0.049307
3,401111653,Clemson,Georgia Tech,52.0,14.0,1,0.475367,0.498426,7.007454,7.129284,0.257366,0.315316,3.600908,3.261917,2.747362,0.677888,0.406822,0.508947,10.922148,6.170982,0.47329,0.489401,7.405585,5.078938,2.633646,2.261018,38.0,98,0.963734,1.0,0.036266
4,401114236,Tulane,Florida International,42.0,14.0,1,0.334242,0.359626,6.580606,4.904085,0.368003,0.388728,7.074442,4.873563,1.531288,1.96482,0.465469,0.340401,8.517822,5.233876,0.40399,0.461811,6.554684,5.548609,2.553723,1.895304,28.0,88,0.401766,3.0,0.598234


In [118]:
favorite_df = matched_df.loc[matched_df['home_win_prob'] >= .5]
dog_df = matched_df.loc[matched_df['home_win_prob'] < .5]

favorite_df.drop(columns=['SPREAD'], inplace=True)
dog_df.drop(columns=['SPREAD'], inplace=True)

favorite_df = merge_closest(favorite_df,
                              spread_map,
                              'away_win_prob',
                              'probability',
                              ['SPREAD'])

dog_df = merge_closest(dog_df,
                       spread_map,
                       'home_win_prob',
                       'probability',
                       ['SPREAD'])

#normally - in the spread denotes favorite, but I'm trying to pick MOV which is the opposite.
dog_df['SPREAD'] = dog_df['SPREAD']*-1
favorite_df[['home_win_prob','away_win_prob','SPREAD']].sample(10)

Unnamed: 0,home_win_prob,away_win_prob,SPREAD
1027,0.965222,0.034778,
3115,0.719899,0.280101,7.0
1762,0.830124,0.169876,11.5
472,0.852738,0.147262,12.5
4422,0.652906,0.347094,4.5
414,0.801335,0.198665,10.0
2509,0.833965,0.166035,11.5
2486,0.852738,0.147262,12.5
5458,0.762385,0.237615,8.5
6264,0.603787,0.396213,3.0


In [119]:
matched_df = favorite_df.append(dog_df)
matched_df = matched_df.loc[matched_df['SPREAD'].notna()]
matched_df.head()

Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov,home_win_prob,away_win_prob,SPREAD
0,401110723,Florida,Miami,24.0,20.0,1,0.449245,0.479037,7.473268,5.862458,0.328755,0.400157,5.471815,5.396511,2.304169,1.580285,0.359037,0.412144,5.978995,6.058389,0.362297,0.319819,5.643894,4.196408,1.916273,1.267915,4.0,64,0.655474,0.344526,4.5
1,401114164,Hawai'i,Arizona,45.0,38.0,1,0.450221,0.414831,7.405083,4.956263,0.425115,0.424116,6.642327,5.498319,2.313986,2.437262,0.428806,0.412259,8.103085,5.857999,0.38835,0.400334,6.709464,4.743486,2.060399,1.824204,7.0,67,0.710105,0.289895,6.5
5,401110731,Texas A&M,Texas State,41.0,7.0,1,0.464104,0.477007,6.953341,5.759984,0.386395,0.379668,7.576415,4.246205,2.417156,1.502577,0.36743,0.38468,6.341303,4.605838,0.341975,0.342094,5.874698,4.830533,1.318306,1.847403,34.0,94,0.810258,0.189742,10.5
6,401114153,Arizona State,Kent State,30.0,7.0,1,0.425472,0.400438,7.646079,4.566486,0.458641,0.441105,7.4716,4.82005,2.161378,1.962068,0.313628,0.385262,4.737434,5.195217,0.466157,0.426435,7.78547,5.867991,1.341082,2.803108,23.0,83,0.816002,0.183998,10.5
9,401112212,Michigan State,Tulsa,28.0,7.0,1,0.369541,0.35492,5.862964,4.673392,0.379469,0.309767,5.888315,3.418731,1.521891,1.296608,0.371869,0.433658,5.98322,4.748316,0.431691,0.439137,6.733501,5.762083,1.938347,2.042911,21.0,81,0.772622,0.227378,9.0


# Logistic Regression Smoothing of Probs

- Use what I believe to be the approach they used on the wizard of odds page, convert the known spreads to probability and work backwards

In [125]:
betting_df.head()

Unnamed: 0,game_id,homeTeam,awayTeam,provider,spread,expected_mov,home_mov
0,401112435,Boston College,Virginia Tech,consensus,4.5,-4.5,7.0
1,401110774,Auburn,Tulane,consensus,-16.5,16.5,18.0
2,401110775,Florida,UT Martin,consensus,-44.5,44.5,45.0
3,401117858,Arkansas State,SMU,consensus,-1.5,1.5,-7.0
4,401112106,Kansas State,Nicholls,consensus,-20.5,20.5,35.0


In [131]:
betting_df['home_win'] = (betting_df['home_mov'] > 1).replace({True:1, False:0})
betting_df.sample(5)

Unnamed: 0,game_id,homeTeam,awayTeam,provider,spread,expected_mov,home_mov,home_win
4476,332850041,Connecticut,South Florida,consensus,-6.0,6.0,-3.0,0
2858,400869401,Old Dominion,Marshall,consensus,-7.0,7.0,24.0,1
14,401110777,Kentucky,Eastern Michigan,consensus,-15.5,15.5,21.0,1
918,401012891,Michigan,Nebraska,consensus,-18.0,18.0,46.0,1
2347,400869285,Akron,Appalachian State,consensus,7.0,-7.0,-7.0,0


In [132]:
x_cols = ['expected_mov']
y_cols = ['home_win']

x = betting_df.loc[:, x_cols]
y = betting_df.loc[:, y_cols]
y.sample(5)

Unnamed: 0,home_win
1889,1
2816,1
732,0
4193,1
1311,1


In [133]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=420)

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [137]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic', 'eval_metric':'logloss'}

evallist = [(dtest, 'eval'), (dtrain, 'train')]

numround=5

spread_converter_model= xgb.train(param, dtrain, numround, evallist)

[0]	eval-logloss:0.54348	train-logloss:0.50179
[1]	eval-logloss:0.52809	train-logloss:0.48447
[2]	eval-logloss:0.52689	train-logloss:0.48075
[3]	eval-logloss:0.52409	train-logloss:0.47980
[4]	eval-logloss:0.52099	train-logloss:0.47828


In [139]:
dreal = xgb.DMatrix(x, label=y)

In [140]:
betting_df['estimated_home_win_prob'] = spread_converter_model.predict(dreal)
betting_df.head()

Unnamed: 0,game_id,homeTeam,awayTeam,provider,spread,expected_mov,home_mov,home_win,estimated_home_win_prob
0,401112435,Boston College,Virginia Tech,consensus,4.5,-4.5,7.0,1,0.460868
1,401110774,Auburn,Tulane,consensus,-16.5,16.5,18.0,1,0.90109
2,401110775,Florida,UT Martin,consensus,-44.5,44.5,45.0,1,0.983935
3,401117858,Arkansas State,SMU,consensus,-1.5,1.5,-7.0,0,0.41811
4,401112106,Kansas State,Nicholls,consensus,-20.5,20.5,35.0,1,0.90109


In [146]:
ref_spread = betting_df[['expected_mov', 'estimated_home_win_prob']].drop_duplicates().sort_values(by='expected_mov')
ref_spread.sample(10)

Unnamed: 0,expected_mov,estimated_home_win_prob
61,-2.0,0.41811
718,-20.5,0.086373
19,7.0,0.694962
613,51.5,0.983935
2166,-33.5,0.038159
116,19.0,0.90109
21,-9.0,0.229235
279,-11.5,0.174007
2030,-39.5,0.038159
245,-19.0,0.086373


In [148]:
binary_df = merge_closest(binary_df,
                          ref_spread,
                          'home_win_prob',
                          'estimated_home_win_prob',
                          ['expected_mov'])
binary_df.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov,home_win_prob,expected_mov
0,401110723,Florida,Miami,24.0,20.0,1,0.449245,0.479037,7.473268,5.862458,0.328755,0.400157,5.471815,5.396511,2.304169,1.580285,0.359037,0.412144,5.978995,6.058389,0.362297,0.319819,5.643894,4.196408,1.916273,1.267915,4.0,64,0.655474,4.5
1,401114164,Hawai'i,Arizona,45.0,38.0,1,0.450221,0.414831,7.405083,4.956263,0.425115,0.424116,6.642327,5.498319,2.313986,2.437262,0.428806,0.412259,8.103085,5.857999,0.38835,0.400334,6.709464,4.743486,2.060399,1.824204,7.0,67,0.710105,10.5
2,401117854,Cincinnati,UCLA,24.0,14.0,1,0.463586,0.454475,7.550123,5.747169,0.339119,0.317832,5.720966,4.095549,2.39657,1.347374,0.435692,0.421305,6.461063,4.727073,0.485208,0.454091,8.802197,5.265831,1.829067,2.52398,10.0,70,0.950693,22.0
3,401111653,Clemson,Georgia Tech,52.0,14.0,1,0.475367,0.498426,7.007454,7.129284,0.257366,0.315316,3.600908,3.261917,2.747362,0.677888,0.406822,0.508947,10.922148,6.170982,0.47329,0.489401,7.405585,5.078938,2.633646,2.261018,38.0,98,0.963734,22.0
4,401114236,Tulane,Florida International,42.0,14.0,1,0.334242,0.359626,6.580606,4.904085,0.368003,0.388728,7.074442,4.873563,1.531288,1.96482,0.465469,0.340401,8.517822,5.233876,0.40399,0.461811,6.554684,5.548609,2.553723,1.895304,28.0,88,0.401766,-8.0


In [152]:
binary_df.loc[binary_df['expected_mov'].isna()]

Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov,home_win_prob,expected_mov
602,401112255,Rutgers,Ohio State,21.0,56.0,0,0.217221,0.305819,3.261607,4.104259,0.44775,0.4429,7.381559,5.194748,0.528328,2.266709,0.527467,0.475877,8.200621,6.30385,0.305963,0.360081,4.383792,4.66242,3.492056,0.940038,-35.0,26,0.034537,
3227,400763432,Kansas,Baylor,7.0,66.0,0,0.385792,0.303694,6.064943,3.676214,0.474479,0.491259,7.649727,5.999433,1.286135,2.465641,0.482848,0.48694,9.34853,5.81559,0.412338,0.394307,7.478094,4.124899,3.268915,1.87105,-59.0,4,0.033161,
3821,400547782,Florida International,Louisville,3.0,34.0,0,0.319957,0.27123,5.811712,3.208955,0.439572,0.455039,8.228565,5.592315,0.523327,2.088403,0.483484,0.442401,7.932556,4.636121,0.387633,0.346798,6.187971,3.990644,1.844194,1.229528,-31.0,30,0.036445,
4523,332640193,Miami (OH),Cincinnati,0.0,14.0,0,0.379777,0.31688,6.627702,4.163824,0.508267,0.518282,8.198607,5.935673,1.188362,2.018407,0.474498,0.456429,8.724735,5.407581,0.375099,0.376803,6.268472,4.264315,2.175093,1.021194,-14.0,47,0.020161,
4629,332780038,Colorado,Oregon,16.0,57.0,0,0.365892,0.36136,5.915526,4.104288,0.48183,0.478032,7.811197,5.853613,0.812174,2.421148,0.499317,0.501639,7.978223,6.93661,0.358063,0.42261,5.41538,4.433322,2.240467,0.996956,-41.0,20,0.038044,
4800,333060113,UMass,Northern Illinois,19.0,63.0,0,0.335148,0.336604,5.062228,3.460185,0.516163,0.48792,8.420224,5.60839,0.512678,1.810664,0.491343,0.50049,7.773779,6.323952,0.388748,0.404134,6.73247,4.212662,2.277887,1.242402,-44.0,17,0.037433,
4842,333090193,Miami (OH),Bowling Green,3.0,45.0,0,0.360525,0.304339,5.870148,3.867462,0.487121,0.515727,7.770815,5.433245,1.13664,2.066475,0.448583,0.461545,8.024001,4.981233,0.378347,0.412132,5.713864,5.314287,1.833767,1.302032,-42.0,19,0.020161,
4913,333202247,Georgia State,Louisiana,21.0,35.0,0,0.351615,0.287714,6.571515,3.840712,0.55914,0.474001,8.649941,5.153034,0.852294,2.403545,0.532367,0.493697,9.922915,5.696785,0.455193,0.429677,7.830475,4.810748,2.353879,1.520588,-14.0,47,0.033161,
5156,322592305,Kansas,TCU,6.0,20.0,0,0.373209,0.375724,5.020557,4.09816,0.50668,0.49167,9.163887,6.011964,1.043412,2.246184,0.520877,0.491082,9.417127,5.573873,0.352922,0.321986,5.688915,3.872561,2.324726,0.965598,-14.0,47,0.038044,
5265,322730167,New Mexico,Boise State,29.0,32.0,0,0.315651,0.374191,4.800004,4.124118,0.554448,0.48707,8.481517,5.760264,0.889358,2.666354,0.534375,0.376815,7.845238,4.736019,0.322224,0.334119,6.04665,4.013368,2.059997,0.905733,-3.0,58,0.030119,


Not really sure why these came back as nan's, but ultimately not all that important for evaluating the method as a whole, Ignoring them, but if this method is best I'll have to dig more

In [153]:
binary_df = binary_df.loc[binary_df['expected_mov'].notna()]
binary_df.loc[binary_df['expected_mov'].isna()]

Unnamed: 0,game_id,home_team,away_team,home_points,away_points,home_win,off_pass_success_rate_home,off_rush_success_rate_home,off_pass_ypp_home,off_rush_ypp_home,pass_success_rate_allowed_home,rush_success_rate_allowed_home,pass_ypp_allowed_home,rush_ypp_allowed_home,av_ppd_home,av_ppd_allowed_home,off_pass_success_rate_away,off_rush_success_rate_away,off_pass_ypp_away,off_rush_ypp_away,pass_success_rate_allowed_away,rush_success_rate_allowed_away,pass_ypp_allowed_away,rush_ypp_allowed_away,av_ppd_away,av_ppd_allowed_away,home_mov,encoded_mov,home_win_prob,expected_mov


# Evaluation of all 3 and thoughts

In [156]:
from sklearn.metrics import mean_squared_error
from math import sqrt

print(f'The bookmakers spread RMSE is: {sqrt(mean_squared_error(betting_df["home_mov"], betting_df["expected_mov"].astype(float)))}')
print(f'The softprob method RMSE is: {sqrt(mean_squared_error(softprob_df["home_mov"], softprob_df["pred_home_mov"].astype(float)))}')
print(f'The softmax method RMSE is: {sqrt(mean_squared_error(softmax_df["home_mov"], softmax_df["predicted_mov"].astype(float)))}')
print(f'The wizard of odds table RMSE is: {sqrt(mean_squared_error(matched_df["home_mov"], matched_df["SPREAD"].astype(float)))}')
print(f'The Log Reg Smoothing RMSE is: {sqrt(mean_squared_error(binary_df["home_mov"], binary_df["expected_mov"].astype(float)))}')

The bookmakers spread RMSE is: 15.912081634513425
The softprob method RMSE is: 18.416760091506045
The softmax method RMSE is: 26.579821175795125
The wizard of odds table RMSE is: 18.171492657930198
The Log Reg Smoothing RMSE is: 18.700261282891745


-  Looks like the wizard of odds way has the best RMSE at the outset, however, it doesn't have the ability to predict spreads larger than 14 which is common in CFB
- I think the softprob way has a lot of promise since I can probably reduce the RMSE this somewhat using the idea I mentioned in the softprob section, but the softmax method was far and away the worst at predicting score which is all I was really curious about for starters.
- I should keep brainstorming how to handle the differences in probabilities from the softprob section and improve it I think it has more promise
- https://wizardofodds.com/games/sports-betting/nfl/ looking at it again, this site seems to suggest that you just create a logistic regression model using the actual spread to convert it to a probability? I probably should've thought of that...adding that method now  
* That method shows promise also, but also fell behind the softprob method


Making some charts now to see if there's anything that isn't obvious from the numbers


In [167]:
#using the json to transformer to make the nb small enough to upload to github
alt.data_transformers.enable('json')


DataTransformerRegistry.enable('json')

In [168]:
spread_chart = alt.Chart(betting_df).mark_circle().encode(
    x='expected_mov',
    y='home_mov'
).properties(
    title='Spread'
)

softprob_chart = alt.Chart(softprob_df).mark_circle().encode(
    x='pred_home_mov',
    y='home_mov'
).properties(
    title='Softprob'
)

softmax = alt.Chart(softmax_df).mark_circle().encode(
    x='predicted_mov',
    y='home_mov'
).properties(
    title='Softmax'
)

woo = alt.Chart(matched_df).mark_circle().encode(
    x='SPREAD',
    y='home_mov'
).properties(
    title='WOO'
)

log_reg = alt.Chart(binary_df).mark_circle().encode(
    x='expected_mov',
    y='home_mov'
).properties(
    title='Smoothing'
)

spread_chart = spread_chart + spread_chart.transform_loess('expected_mov', 'home_mov').mark_line(color='red')
softprob_chart = softprob_chart + softprob_chart.transform_loess('pred_home_mov', 'home_mov').mark_line(color='red')
softmax = softmax + softmax.transform_loess('predicted_mov', 'home_mov').mark_line(color='red')
woo = woo + woo.transform_loess('SPREAD', 'home_mov').mark_line(color='red')
log_reg = log_reg + log_reg.transform_loess('expected_mov', 'home_mov').mark_line(color='red')

total_chart = spread_chart | softprob_chart | softmax | woo | log_reg
total_chart

Again I think I'm partial to the softprob method based on those charts. I think it can be cleaned up somewhat, but its most promising from a brief once over