In [1]:
# Import relevant libraries
import pandas as pd
pd.set_option('precision', 4)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

  from numpy.core.umath_tests import inner1d


In [2]:
# Read in data
datam = pd.read_stata('stubhub_crosssection.dta')
datam.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3356081 entries, 0 to 3356080
Columns: 141 entries, newgameid to RELSPRICE
dtypes: float32(38), float64(7), int16(18), int32(1), int8(71), object(6)
memory usage: 1.2+ GB


In [3]:
# Original regression:
# RELSPRICE D3to5-D81plus _I* firstrow secondrow numbrow norow piggy  homerecord* awayrecord* homegameahead homegameback
# homeaheadgtg homebackgtg awaygameahead awaygameback awaybackgtg awayaheadgtg homewildgamebackgtg homewildgameaheadgtg
# awaywildgamebackgtg awaywildgameaheadgtg homewildgameback homewildgameahead awaywildgameback awaywildgameahead EBlist  
# EBalllist EBregpnlist EBregplist SHlist SHalllist SHregpnlist SHregplist EBlistdum EBalllistdum EBregpnlistdum EBregplistdum 
# EBlist2 EBalllist2 EBregpnlist2 EBregplist2 SHlist2 SHalllist2 SHregpnlist2 SHregplist2

In [4]:
# Generate list of data from original regressions
var_list = 'RELSPRICE firstrow secondrow numbrow norow piggy homegameahead homegameback homeaheadgtg homebackgtg awaygameahead awaygameback awaybackgtg awayaheadgtg homewildgamebackgtg homewildgameaheadgtg awaywildgamebackgtg awaywildgameaheadgtg homewildgameback homewildgameahead awaywildgameback awaywildgameahead EBlist EBregpnlist EBregplist SHlist SHalllist SHregpnlist SHregplist EBlistdum EBalllistdum EBregpnlistdum EBregplistdum EBalllist2 EBregpnlist2 EBregplist2 SHlist2 SHalllist2 SHregpnlist2 SHregplist2 home away propmaxatt'
var_list = var_list.split()
var_list.extend(['D3to5', 'D6to8', 'D9to11', 'D12to14', 'D15to17', 'D18to20', 'D21to23', 'D24to26', 'D27to29', 'D30to32', 'D33to35', 'D36to38', 'D39to41', 'D42to44', 'D45to47', 'D48to50', 'D51to55', 'D56to60', 'D61to70', 'D71to80', 'D81plus'])
var_list.extend(['_Inumb_2', '_Inumb_3', '_Inumb_4', '_Inumb_5', '_Inumb_6', '_Iupto_1', '_InumXupt_2_1', '_InumXupt_3_1', '_InumXupt_4_1', '_InumXupt_5_1', '_InumXupt_6_1'])
var_list.extend(['homerecord', 'gamehomerecord', 'awayrecord', 'gameawayrecord'])

In [5]:
print(var_list)

['RELSPRICE', 'firstrow', 'secondrow', 'numbrow', 'norow', 'piggy', 'homegameahead', 'homegameback', 'homeaheadgtg', 'homebackgtg', 'awaygameahead', 'awaygameback', 'awaybackgtg', 'awayaheadgtg', 'homewildgamebackgtg', 'homewildgameaheadgtg', 'awaywildgamebackgtg', 'awaywildgameaheadgtg', 'homewildgameback', 'homewildgameahead', 'awaywildgameback', 'awaywildgameahead', 'EBlist', 'EBregpnlist', 'EBregplist', 'SHlist', 'SHalllist', 'SHregpnlist', 'SHregplist', 'EBlistdum', 'EBalllistdum', 'EBregpnlistdum', 'EBregplistdum', 'EBalllist2', 'EBregpnlist2', 'EBregplist2', 'SHlist2', 'SHalllist2', 'SHregpnlist2', 'SHregplist2', 'home', 'away', 'propmaxatt', 'D3to5', 'D6to8', 'D9to11', 'D12to14', 'D15to17', 'D18to20', 'D21to23', 'D24to26', 'D27to29', 'D30to32', 'D33to35', 'D36to38', 'D39to41', 'D42to44', 'D45to47', 'D48to50', 'D51to55', 'D56to60', 'D61to70', 'D71to80', 'D81plus', '_Inumb_2', '_Inumb_3', '_Inumb_4', '_Inumb_5', '_Inumb_6', '_Iupto_1', '_InumXupt_2_1', '_InumXupt_3_1', '_InumXupt

In [6]:
# Keep relevant data
datam = datam[var_list]
limit = datam['RELSPRICE'].quantile(0.98)
datam = datam[datam.RELSPRICE <= limit]
datam = datam.dropna()

In [7]:
# Convert multi-level dummies to hot-one encoded dummies
var_list = ['away', 'home']
for i in var_list:
    datam = pd.concat([datam,pd.get_dummies(datam[i], prefix=i,dummy_na=False)],axis=1).drop([i],axis=1)

In [8]:
# Train/test split
train_set, test_set = train_test_split(datam, test_size=0.2, random_state=21)

In [9]:
# Split train/test sets into labels and predictor matrices
cols = [col for col in datam.columns if col != 'RELSPRICE']
y_train, X_train = train_set['RELSPRICE'], train_set[cols]
y_test, X_test = test_set['RELSPRICE'], test_set[cols]

In [10]:
# Train a linear regression
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# Assess performance of linear regression
r_2 = reg.score(X_test, y_test)
mse = ((reg.predict(X_test)-y_test)**2).mean()
print('The Linear Regression has an MSE of %0.4f on the test set.' %mse)
print('The Linear Regression has an R-squared value of %0.4f on the test set.' %r_2)

The Linear Regression has an MSE of 0.6725 on the test set.
The Linear Regression has an R-squared value of 0.3238 on the test set.


In [12]:
# # Code to check cross-validated scores (omitted in final runthrough due to runtime)
# forest = RandomForestRegressor(max_features=110, n_estimators=32)
# scores = cross_val_score(forest, X_train, y_train, cv=10)
# print(scores.mean())

In [13]:
# Train a random forest
forest = RandomForestRegressor(n_estimators=128, max_features=110, n_jobs=-1)
forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=110, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
# Assess performance of random forest
r_2 = forest.score(X_test, y_test)
mse = ((forest.predict(X_test)-y_test)**2).mean()
print('The Random Forest has an MSE of %0.4f on the test set.' %mse)
print('The Random Forest has an R-squared value of %0.4f on the test set.' %r_2)

The Random Forest has an MSE of 0.1068 on the test set.
The Random Forest has an R-squared value of 0.8926 on the test set.


In [15]:
# Generate variable importances for random forest
imps = pd.DataFrame(forest.feature_importances_,
                    index = X_train.columns,
                    columns=['importance']).sort_values('importance', ascending=False)
print('Here are some of the variables with high importance:')
imps.head(10)

Here are some of the variables with high importance:


Unnamed: 0,importance
propmaxatt,0.1561
numbrow,0.1335
gameawayrecord,0.0493
gamehomerecord,0.0488
away_NYY,0.0473
home_ChC,0.0356
SHregplist2,0.035
away_Bos,0.0347
SHregplist,0.0342
SHregpnlist,0.0272


In [16]:
# Rescale RELSPRICE to be in the unit interval
datam['relsprice'] = (datam['RELSPRICE']-datam['RELSPRICE'].min()) / (datam['RELSPRICE'].max()-datam['RELSPRICE'].min())

In [17]:
# Train/test split
train_set, test_set = train_test_split(datam, test_size=0.2, random_state=21)

In [18]:
cols = [col for col in datam.columns if col not in ['RELSPRICE', 'relsprice']]
y_train, X_train = train_set['relsprice'], train_set[cols]
y_test, X_test = test_set['relsprice'], test_set[cols]

In [19]:
# Scale covariates to standard normals for the neural network
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# # Code to check cross-validated scores (omitted in final runthrough due to runtime)
# mlp = MLPRegressor(activation='logistic', hidden_layer_sizes=(120, 60), max_iter=1000)
# scores = cross_val_score(mlp, X_train, y_train, cv=10)
# print(scores.mean())

In [21]:
# Train neural network (multilayer perceptron)
mlp = MLPRegressor(activation='logistic', hidden_layer_sizes=(130, 65), max_iter=1000)
mlp.fit(X_train, y_train)

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(130, 65), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [22]:
# Assess performance of neural network
r_2 = mlp.score(X_test, y_test)
mse = (((mlp.predict(X_test)-y_test)*limit)**2).mean()
print('The Multilayer Perceptron has an MSE of %0.4f on the test set.' %mse)
print('The Multilayer Perceptron has an R-squared value of %0.4f on the test set.' %r_2)

The Multilayer Perceptron has an MSE of 0.3746 on the test set.
The Multilayer Perceptron has an R-squared value of 0.6243 on the test set.


# Red Sox Random Forest

In [23]:
# Keep only Red Sox games
datam = datam[(datam.home_Bos==1)|(datam.away_Bos==1)]

In [24]:
# Train/test split
train_set, test_set = train_test_split(datam, test_size=0.2, random_state=21)

In [25]:
# Re-select proper variables
cols = [col for col in datam.columns if col not in ['RELSPRICE', 'relsprice']]
y_train, X_train = train_set['RELSPRICE'], train_set[cols]
y_test, X_test = test_set['RELSPRICE'], test_set[cols]

In [26]:
# Train random forest
forest = RandomForestRegressor(n_estimators=64, max_features=100)
forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=100, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [27]:
# Assess performance of random forest
r_2 = forest.score(X_test, y_test)
mse = ((forest.predict(X_test)-y_test)**2).mean()
print('The Random Forest has an MSE of %0.4f on the test set.' %mse)
print('The Random Forest has an R-squared value of %0.4f on the test set.' %r_2)

The Random Forest has an MSE of 0.1087 on the test set.
The Random Forest has an R-squared value of 0.9013 on the test set.


In [28]:
# Generate variable importances for random forest
imps = pd.DataFrame(forest.feature_importances_,
                    index = X_train.columns,
                    columns=['importance']).sort_values('importance', ascending=False)
print('Here are some of the variables with high importance:')
imps.head(10)

Here are some of the variables with high importance:


Unnamed: 0,importance
numbrow,0.1464
away_NYY,0.1109
propmaxatt,0.0834
gameawayrecord,0.072
SHregplist,0.0605
gamehomerecord,0.0589
SHregplist2,0.0514
SHregpnlist,0.0313
SHregpnlist2,0.0295
home_NYY,0.0285
