In [1]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

# Empirical validation

In [2]:
nsamp = 1000000
prob = 0.524
seed = 0

# Seeded generator so that Restart & Run All reproduces the same sample.
rng = np.random.default_rng(seed)

# Ground truth: fair coin flips encoded as -1/+1 integers.
real = (2 * rng.integers(2, size=nsamp) - 1).astype(int)

# Number of predictions to get wrong. round() rather than int(): float error in
# (1 - prob) * nsamp could otherwise truncate the count down by one.
n_wrong = round((1. - prob) * nsamp)
# Distinct positions (sampling without replacement) whose prediction is flipped.
pred_inds = rng.choice(nsamp, size=n_wrong, replace=False)

# Predictions: a float copy of the truth with the chosen entries sign-flipped,
# so exactly nsamp - n_wrong predictions agree with `real`.
pred = real.astype(float)
pred[pred_inds] *= -1

In [3]:
# Fraction of samples where the predicted sign equals the true sign.
tmp = accuracy_score(y_true=real, y_pred=pred)
print('predictive accuracy: ', tmp)

predictive accuracy:  0.524


In [4]:
# Rescale the +/-1 predictions by (real.pred)/(pred.pred), the least-squares
# coefficient of a no-intercept fit of `real` on `pred`, then score them.
scaled_pred = pred * (real @ pred) / (pred @ pred)
tmp = r2_score(y_true=real, y_pred=scaled_pred)
print('predictive R^2: ', tmp)

predictive R^2:  0.002303358277529366


In [5]:
# R^2 of the least-squares-rescaled predictions, then its square root so the
# value can be compared with a correlation coefficient.
scaled_pred = pred * (real @ pred) / (pred @ pred)
r2 = r2_score(y_true=real, y_pred=scaled_pred)
tmp = np.sqrt(r2)
print('predictive sqrt(R^2): ', tmp)

predictive sqrt(R^2):  0.047993314925407746


In [6]:
# Uncentered correlation (cosine similarity): no means are subtracted, so the
# result is the inner product divided by the product of the vector norms.
inner = pred @ real
norm_product = np.sqrt((pred @ pred) * (real @ real))
tmp = inner / norm_product
print('predictive correlation: ', tmp)

predictive correlation:  0.048


# Predictive $R^2$ required to consistently beat the NFL spread

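Since the labels and predictions are $\pm 1$ with (approximately) zero mean, $std(pred) = std(real) = 1$, so $corr(pred,real) = cov(pred,real) = (n \cdot p - n(1-p))/n = 2p - 1$, where $p$ is the predictive accuracy and $n$ is the number of samples.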

Therefore, the predictive accuracy is given by $p = 0.5 + corr(pred,real)/2$.

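A predictive accuracy threshold of $p = 52.4\%$ is needed to profit in expectation in the NFL spread market (the break-even win rate at standard $-110$ odds, $110/210 \approx 52.4\%$), corresponding to a predictive correlation of $2p - 1 = 4.8\%$ and a predictive $R^2$ of $(2p - 1)^2 \approx 0.23\%$.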