In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

final_data = '/Users/pranav/nfl_betting_system/final_data.json'

In [2]:
df = pd.read_json(final_data)

In [3]:
df.columns

Index(['score_home', 'score_away', 'spread_favorite', 'point_diff',
       'spread_covered', 'over_under_line', 'total_points',
       'over_under_result', 'home_team_fav', 'home_win%', 'home_team_ppg',
       'home_team_oppg', 'home_p_diff', 'away_win%', 'away_team_ppg',
       'away_team_oppg', 'away_p_diff', 'home_win_exp', 'away_win_exp'],
      dtype='object')

### divide our current dataframe into two: one concerned with the Over/Under Line and the other concerned with the Spread

1) the Over/Under Line only cares about whether the teams combined points exceed the mark set by the bookkeepers, so who wins is actually irrelevant

- our O/U dataframe will house statistics related to point totals: ppg, oppg, final score, O/U line and result for home/away

2) the spread however, cares about the point differential AND the winner

- our spread dataframe will house statistics related to point differential: spread, average point diff, ppg, oppg, win %, and pythagorean expected win % for each team

In [4]:
df_ou = df[['home_team_ppg', 'home_team_oppg', 'away_team_ppg', 'away_team_oppg', 'over_under_line', 'total_points']]
df_spread = df[['score_home', 'score_away', 'spread_favorite', 'spread_covered', 
                'home_team_fav', 'home_win%', 'home_team_ppg', 'home_team_oppg', 
                'home_p_diff', 'away_win%', 'away_team_ppg', 'away_team_oppg', 
                'away_p_diff', 'home_win_exp', 'away_win_exp', 'point_diff']]

In [5]:
df_ou

Unnamed: 0,home_team_ppg,home_team_oppg,away_team_ppg,away_team_oppg,over_under_line,total_points
0,16.00,21.00,14.00,41.00,38.5,63
1,16.00,0.00,27.00,7.00,36.5,75
2,16.00,13.00,16.00,20.00,38.5,45
3,36.00,41.00,36.00,28.00,47.0,56
4,14.00,10.00,20.00,17.00,43.0,25
...,...,...,...,...,...,...
4778,25.87,18.80,17.27,18.60,36.0,40
4779,26.40,13.20,18.60,31.33,45.5,51
4780,21.60,27.80,23.40,22.47,44.5,51
4781,25.60,24.80,30.20,19.27,47.0,47


In [10]:
def load_dataset(path):
    df = pd.read_json(path)
    df_ou = df[['home_team_ppg', 'home_team_oppg', 'away_team_ppg', 'away_team_oppg', 'over_under_line']]
    df_ou['total_points'] = df['score_home']+df['score_away']
    df_spread = df[['score_home', 'score_away', 'spread_favorite', 'spread_covered', 
                    'home_team_fav', 'home_win%', 'home_team_ppg', 'home_team_oppg', 
                    'home_p_diff', 'away_win%', 'away_team_ppg', 'away_team_oppg', 
                    'away_p_diff', 'home_win_exp', 'away_win_exp']]
    dataset1 = df_ou.values
    X = dataset1[:, :-1]
    y = dataset1[:,-1]
    
    y = y.reshape((len(y), 1))
    
    return X, y

def preprocess(data_path):
    X, y = load_dataset(data_path)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    print('Training Sizes:', X_train.shape,'and', y_train.shape)
    print('Testing Sizes:', X_test.shape, 'and', y_test.shape)
    
    return X_train, X_test, y_train, y_test

In [11]:
X_train, X_test, y_train, y_test = preprocess(final_data)

Training Sizes: (3348, 5) and (3348, 1)
Testing Sizes: (1435, 5) and (1435, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [12]:
with open('/Users/pranav/nfl_betting_system/nn_ou_train_data.pkl', 'wb') as f:
    pickle.dump((X_train, y_train), f)
with open('/Users/pranav/nfl_betting_system/nn_ou_test_data.pkl', 'wb') as f:
    pickle.dump((X_test, y_test), f)