In [1]:
import pandas as pd
import numpy as np
import theano.tensor as tt
import pymc3 as pm
import seaborn as sns
import matplotlib.pyplot as plt
import arviz as az
from Clean import Clean

%matplotlib inline

# Data Prep

In [6]:
# Number of rows taken for each of the two slices below.
n = 100

# Read the full table and report its dimensions before any filtering.
df = pd.read_csv('FM_2000-2019.csv')
print(df.shape)

# Retain only the rows whose 'gp_all_0_a' value is at least 30.
df_all = df.loc[df['gp_all_0_a'].ge(30)]

# The first n filtered rows become `df`; the last n become `df_star`.
df = df_all.iloc[:n]
df_star = df_all.iloc[-n:]
print(df.shape)
print(df_star.shape)

(24171, 459)
(100, 459)
(100, 459)


In [7]:
games = 30
q = 1

# Names handed to Clean.get_features for both frames (kept in one place so the
# two calls cannot drift apart).
FEATURE_KEYS = ('e-def-rating', 'e-off-rating', 'e-pace')


def _extract(frame):
    """Run ``frame`` through Clean and return (cleaner, feature table, target values).

    Uses the module-level ``games`` and ``q`` settings. A fresh list of feature
    names is passed on every call.
    """
    cleaner = Clean(frame, games)
    feature_table = cleaner.get_features(list(FEATURE_KEYS), q)
    target_values = cleaner.get_target(q).values
    return cleaner, feature_table, target_values


# Features / target built from `df`.
clean, features, y = _extract(df)
cols = features.columns
x = features.values
print(x.shape, y.shape)

# Features / target built from `df_star`.
clean_test, features_test, y_star = _extract(df_star)
cols_test = features_test.columns
x_star = features_test.values
print(x_star.shape, y_star.shape)

(90, 24) (90,)
(100, 24) (100,)


# Fit a Gaussian process

In [8]:
# Number of columns in x; one length-scale is created per column.
n_inputs = x.shape[1]

with pm.Model() as model:
    # HalfNormal prior restricts the length-scales to positive values.
    ls = pm.HalfNormal('ls', sigma=1, shape=n_inputs)

    # Exponentiated-quadratic covariance over every input column, wrapped in a
    # marginal GP.
    cov = pm.gp.cov.ExpQuad(n_inputs, ls=ls)
    gp = pm.gp.Marginal(cov_func=cov)

    # Noise prior, then the marginal likelihood of y at inputs x.
    noise = pm.HalfNormal('noise', sigma=1)
    y_ = gp.marginal_likelihood("y", X=x, y=y, noise=noise)

  result[diagonal_slice] = x


In [9]:
# Get trace
# Draw posterior samples with PyMC3's default draw/tune settings.
# cores=1 runs the chains one after another in a single job.
# A fixed seed makes the trace reproducible on Restart Kernel -> Run All.
SAMPLE_SEED = 42

with model:
    trace = pm.sample(cores=1, random_seed=SAMPLE_SEED)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [noise, ls]
  0%|          | 0/1000 [00:00<?, ?it/s]  1%|          | 9/1000 [00:00<00:11, 84.93it/s]  1%|          | 12/1000 [00:00<00:21, 45.17it/s]  2%|▏         | 15/1000 [00:00<00:30, 32.06it/s]  2%|▏         | 18/1000 [00:00<00:32, 30.37it/s]  2%|▎         | 25/1000 [00:00<00:29, 32.73it/s]  3%|▎         | 28/1000 [00:00<00:35, 27.49it/s]  3%|▎         | 33/1000 [00:00<00:30, 31.42it/s]  4%|▎         | 37/1000 [00:01<01:06, 14.39it/s]  4%|▍         | 40/1000 [00:02<02:01,  7.92it/s]  4%|▍         | 42/1000 [00:02<01:57,  8.16it/s]  4%|▍         | 44/1000 [00:02<02:03,  7.77it/s]  5%|▍         | 46/1000 [00:03<02:33,  6.22it/s]  5%|▍         | 48/1000 [00:03<02:53,  5.49it/s]  5%|▍         | 49/1000 [00:04<03:08,  5.04it/s]  5%|▌         | 50/1000 [00:04<03:36,  4.40it/s]  5%|▌         | 51/1000 [00:04<03:40,  4.30it/s]  5%|▌         | 52/1000 

In [10]:
# Number of posterior-predictive draws, and the seed that makes them repeatable.
N_PRED_DRAWS = 10
PRED_SEED = 42

with model:
    # Conditional GP distribution evaluated at the x_star inputs.
    # NOTE(review): this registers a variable named 'pred' on `model`; re-running
    # the cell without rebuilding the model presumably fails on the duplicate
    # name -- confirm.
    f_pred = gp.conditional('pred', x_star)

    # Draw from the conditional using parameter values taken from `trace`.
    samples = pm.sample_posterior_predictive(
        trace,
        vars=[f_pred],
        samples=N_PRED_DRAWS,
        random_seed=PRED_SEED,
    )


  result[diagonal_slice] = x
  result[diagonal_slice] = x
 10%|█         | 1/10 [00:03<00:33,  3.74s/it]100%|██████████| 10/10 [00:03<00:00,  2.63it/s]


In [15]:
# Display the array stored under the 'pred' key of the posterior-predictive result.
samples['pred']

array([[-1.77389489e-01, -2.06628884e-01,  5.83708965e-01,
         1.10258222e-01,  7.63643347e-01, -1.25819540e+00,
         3.89509412e-01,  2.08765319e-01,  5.50838302e-01,
        -1.60236550e-01, -1.12190920e+00,  1.20231336e-01,
        -1.17006831e+00,  5.89768336e-01,  2.05081559e+00,
         8.01381034e-01, -5.07804645e-02, -1.71191588e+00,
        -7.32029847e-01,  1.56055017e+00,  9.71316448e-02,
         1.83584351e+00, -1.18987268e+00, -1.97883212e+00,
         6.98051034e-01, -5.76768998e-01, -2.52042307e+00,
         1.03421626e+00, -9.35481222e-01, -2.56873436e-01,
         1.33431918e+00,  1.85240142e+00, -2.86483822e-01,
         8.53696548e-01, -1.36176662e+00, -1.95725309e-01,
         5.41295211e-02,  9.19443415e-01, -7.90846309e-01,
        -3.72545064e-01, -9.62200963e-01, -5.04210889e-01,
        -5.69668143e-01, -1.88561231e+00, -3.71818006e-01,
         5.87627252e-01, -8.91095642e-01,  1.52989502e+00,
         1.05613530e+00,  1.07272619e+00, -3.03878102e-0