## Preprocessing and Feature Engineering

### Importing Data Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import r2_score
from data_preprocessing import *

%matplotlib inline

- #### Importing Clean DataFrame

In [2]:
# Read the cleaned betting data from a CSV file; the path is relative to the
# notebook's working directory.
# NOTE(review): the previews below show an 'Unnamed: 0' column, which usually
# means the index was written into the CSV -- consider `index_col=0` if no
# later cell relies on that column. TODO confirm.
betting_df = pd.read_csv(filepath_or_buffer='../Data/betting_df')

## Creating Different Feature Interactions

- #### Creating a total points column and a score difference column
     - Total points will be used to see whether the game went over or under the expected total

In [3]:
# Derive two per-game features from the final scores:
#   score_difference -> home score minus away score (negative when the away side scored more)
#   total_points     -> combined points scored by both sides
home_points = betting_df['score_home']
away_points = betting_df['score_away']

betting_df['score_difference'] = home_points - away_points
betting_df['total_points'] = home_points + away_points

- #### This column will show if the result of the game was over, under, or a push on the expected total points

In [4]:
# Label every game relative to the 'over/under' line:
#   'push'  -> total points exactly equal to the line
#   'over'  -> total points above the line
#   'under' -> everything else (this includes rows where the comparison is
#              False because a value is missing)
actual_total = betting_df['total_points']
posted_total = betting_df['over/under']

betting_df['over_under_result'] = np.where(
    actual_total == posted_total,
    'push',
    np.where(actual_total > posted_total, 'over', 'under'),
)

In [5]:
# Preview the first five rows to check the three newly added columns.
betting_df.head(n=5)

Unnamed: 0,date,season_year,week,home_team,away_team,stadium,team_favorite,spread,over/under,stadium_type,...,total_score_over_min,total_score_over_max,total_score_over_close,total_score_under_open,total_score_under_min,total_score_under_max,total_score_under_close,score_difference,total_points,over_under_result
0,2006-09-07,2006,1,Pittsburgh Steelers,Miami Dolphins,Heinz Field,PIT,-1.5,34.5,na,...,1.907416,1.997706,1.950843,1.948511,1.936685,1.963408,1.95441,11,45,over
1,2006-09-10,2006,1,Arizona Cardinals,San Francisco 49ers,University of Phoenix Stadium,ARI,-9.5,44.0,DOME,...,1.907416,1.997706,1.950843,1.948511,1.936685,1.963408,1.95441,7,61,over
2,2006-09-10,2006,1,Carolina Panthers,Atlanta Falcons,Bank of America Stadium,CAR,-4.5,39.0,na,...,1.907416,1.997706,1.950843,1.948511,1.936685,1.963408,1.95441,-14,26,under
3,2006-09-10,2006,1,Cleveland Browns,New Orleans Saints,FirstEnergy Stadium,CLE,-3.0,36.0,na,...,1.907416,1.997706,1.950843,1.948511,1.936685,1.963408,1.95441,-5,33,under
4,2006-09-10,2006,1,Detroit Lions,Seattle Seahawks,Ford Field,SEA,-6.0,44.0,DOME,...,1.907416,1.997706,1.950843,1.948511,1.936685,1.963408,1.95441,-3,15,under


- #### Creating dummy columns from the stadium type column (e.g. dome vs. non-dome stadiums)

In [6]:
# One-hot encode `stadium_type`: the original column is replaced by one
# indicator column per distinct value, named 'stadium_type_<value>'.
# The dtype is pinned to uint8 so the indicators stay numeric 0/1 on every
# pandas version: pandas >= 2.0 defaults to bool, which no longer matches the
# 0/1 values shown in the previews below and can turn a mixed bool/float
# design matrix into an object array when it is handed to statsmodels.
betting_df = pd.get_dummies(betting_df, columns=['stadium_type'], dtype=np.uint8)

In [7]:
# Normalise every column label: lower-case it and swap spaces for underscores
# (e.g. the dummy columns created above become 'stadium_type_dome', ...).
betting_df.columns = betting_df.columns.map(
    lambda label: label.lower().replace(' ', '_')
)

In [8]:
# Count the rows flagged in the 'stadium_type_na' indicator column, i.e. the
# games whose stadium type was the 'na' placeholder.
betting_df.loc[:, 'stadium_type_na'].sum()

2294

In [9]:
# Discard the indicator for the 'na' placeholder category.
# Re-running this cell on its own raises KeyError, because the column is
# already gone after the first run.
betting_df = betting_df.drop(labels='stadium_type_na', axis=1)

- #### Making sure the team favorite column uses the same abbreviations for teams that have moved cities


In [10]:
# Rewrite the old favourite codes so they match the abbreviations used as
# values in the `nfl_teams` mapping defined below.
relocated_codes = {'SD': 'LAC', 'SLR': 'LAR'}

for old_code, new_code in relocated_codes.items():
    uses_old_code = betting_df['team_favorite'] == old_code
    betting_df.loc[uses_old_code, 'team_favorite'] = new_code

- #### Dictionary of all NFL teams, then mapping each team's full name to its abbreviation

In [11]:
# Full team name -> abbreviation lookup.
# Several historical or relocated names share the abbreviation of another
# entry (e.g. 'San Diego Chargers' and 'Los Angeles Chargers' both map to
# 'LAC'), so every name used for a franchise resolves to one code.
# Insertion order is kept as-is so the displayed dict is unchanged.
nfl_teams = dict([
    ('San Diego Chargers', 'LAC'),
    ('Indianapolis Colts', 'IND'),
    ('Washington Redskins', 'WAS'),
    ('Miami Dolphins', 'MIA'),
    ('Buffalo Bills', 'BUF'),
    ('Tennessee Oilers', 'TEN'),
    ('Minnesota Vikings', 'MIN'),
    ('Atlanta Falcons', 'ATL'),
    ('New Orleans Saints', 'NO'),
    ('Pittsburgh Steelers', 'PIT'),
    ('Kansas City Chiefs', 'KC'),
    ('Denver Broncos', 'DEN'),
    ('New York Giants', 'NYG'),
    ('Houston Oilers', 'TEN'),
    ('Cleveland Browns', 'CLE'),
    ('Philadelphia Eagles', 'PHI'),
    ('Dallas Cowboys', 'DAL'),
    ('Arizona Cardinals', 'ARI'),
    ('Chicago Bears', 'CHI'),
    ('New England Patriots', 'NE'),
    ('San Francisco 49ers', 'SF'),
    ('Tampa Bay Buccaneers', 'TB'),
    ('Seattle Seahawks', 'SEA'),
    ('Baltimore Ravens', 'BAL'),
    ('Jacksonville Jaguars', 'JAX'),
    ('Carolina Panthers', 'CAR'),
    ('Green Bay Packers', 'GB'),
    ('St. Louis Cardinals', 'ARI'),
    ('Los Angeles Raiders', 'OAK'),
    ('Detroit Lions', 'DET'),
    ('Cincinnati Bengals', 'CIN'),
    ('Los Angeles Rams', 'LAR'),
    ('Oakland Raiders', 'OAK'),
    ('Houston Texans', 'HOU'),
    ('Tennessee Titans', 'TEN'),
    ('New York Jets', 'NYJ'),
    ('St. Louis Rams', 'LAR'),
    ('Baltimore Colts', 'IND'),
    ('Los Angeles Chargers', 'LAC'),
    ('Phoenix Cardinals', 'ARI'),
])
nfl_teams

{'San Diego Chargers': 'LAC',
 'Indianapolis Colts': 'IND',
 'Washington Redskins': 'WAS',
 'Miami Dolphins': 'MIA',
 'Buffalo Bills': 'BUF',
 'Tennessee Oilers': 'TEN',
 'Minnesota Vikings': 'MIN',
 'Atlanta Falcons': 'ATL',
 'New Orleans Saints': 'NO',
 'Pittsburgh Steelers': 'PIT',
 'Kansas City Chiefs': 'KC',
 'Denver Broncos': 'DEN',
 'New York Giants': 'NYG',
 'Houston Oilers': 'TEN',
 'Cleveland Browns': 'CLE',
 'Philadelphia Eagles': 'PHI',
 'Dallas Cowboys': 'DAL',
 'Arizona Cardinals': 'ARI',
 'Chicago Bears': 'CHI',
 'New England Patriots': 'NE',
 'San Francisco 49ers': 'SF',
 'Tampa Bay Buccaneers': 'TB',
 'Seattle Seahawks': 'SEA',
 'Baltimore Ravens': 'BAL',
 'Jacksonville Jaguars': 'JAX',
 'Carolina Panthers': 'CAR',
 'Green Bay Packers': 'GB',
 'St. Louis Cardinals': 'ARI',
 'Los Angeles Raiders': 'OAK',
 'Detroit Lions': 'DET',
 'Cincinnati Bengals': 'CIN',
 'Los Angeles Rams': 'LAR',
 'Oakland Raiders': 'OAK',
 'Houston Texans': 'HOU',
 'Tennessee Titans': 'TEN',
 'Ne

In [12]:
# Translate the full team names into abbreviations via `nfl_teams`, giving
# 'home_id' and 'away_id' columns that can be compared with 'team_favorite'.
# Names missing from the dictionary come back as NaN from Series.map.
for side in ('home', 'away'):
    betting_df[side + '_id'] = betting_df[side + '_team'].map(nfl_teams)

In [13]:
betting_df.head()

Unnamed: 0,date,season_year,week,home_team,away_team,stadium,team_favorite,spread,over/under,weather_temperature,...,over_under_result,stadium_type_dome,stadium_type_fog,stadium_type_rain,stadium_type_rain_|_fog,stadium_type_snow,stadium_type_snow_|_fog,stadium_type_snow_|_freezing_rain,home_id,away_id
0,2006-09-07,2006,1,Pittsburgh Steelers,Miami Dolphins,Heinz Field,PIT,-1.5,34.5,62.0,...,over,0,0,0,0,0,0,0,PIT,MIA
1,2006-09-10,2006,1,Arizona Cardinals,San Francisco 49ers,University of Phoenix Stadium,ARI,-9.5,44.0,72.0,...,over,1,0,0,0,0,0,0,ARI,SF
2,2006-09-10,2006,1,Carolina Panthers,Atlanta Falcons,Bank of America Stadium,CAR,-4.5,39.0,73.0,...,under,0,0,0,0,0,0,0,CAR,ATL
3,2006-09-10,2006,1,Cleveland Browns,New Orleans Saints,FirstEnergy Stadium,CLE,-3.0,36.0,65.0,...,under,0,0,0,0,0,0,0,CLE,NO
4,2006-09-10,2006,1,Detroit Lions,Seattle Seahawks,Ford Field,SEA,-6.0,44.0,72.0,...,under,1,0,0,0,0,0,0,DET,SEA


- #### Making a column for whether or not the home team is the favorite

In [14]:
# Flag games where the home side is the listed favourite: 1 when 'home_id'
# equals 'team_favorite', otherwise 0 (rows where either value is missing
# compare as False and therefore get 0).
is_home_favored = betting_df['home_id'] == betting_df['team_favorite']
betting_df['home_favorite'] = is_home_favored.astype('int64')

betting_df.head()

Unnamed: 0,date,season_year,week,home_team,away_team,stadium,team_favorite,spread,over/under,weather_temperature,...,stadium_type_dome,stadium_type_fog,stadium_type_rain,stadium_type_rain_|_fog,stadium_type_snow,stadium_type_snow_|_fog,stadium_type_snow_|_freezing_rain,home_id,away_id,home_favorite
0,2006-09-07,2006,1,Pittsburgh Steelers,Miami Dolphins,Heinz Field,PIT,-1.5,34.5,62.0,...,0,0,0,0,0,0,0,PIT,MIA,1
1,2006-09-10,2006,1,Arizona Cardinals,San Francisco 49ers,University of Phoenix Stadium,ARI,-9.5,44.0,72.0,...,1,0,0,0,0,0,0,ARI,SF,1
2,2006-09-10,2006,1,Carolina Panthers,Atlanta Falcons,Bank of America Stadium,CAR,-4.5,39.0,73.0,...,0,0,0,0,0,0,0,CAR,ATL,1
3,2006-09-10,2006,1,Cleveland Browns,New Orleans Saints,FirstEnergy Stadium,CLE,-3.0,36.0,65.0,...,0,0,0,0,0,0,0,CLE,NO,1
4,2006-09-10,2006,1,Detroit Lions,Seattle Seahawks,Ford Field,SEA,-6.0,44.0,72.0,...,1,0,0,0,0,0,0,DET,SEA,0
