In [1]:
import sys
# NOTE(review): this hardcoded absolute path is put first on sys.path, so the
# `import pymc3` below resolves to whatever package lives under it —
# presumably a local development checkout; confirm. The GP calls used later
# in this notebook depend on that version, and the path will not exist on
# other machines.
sys.path.insert(0, "/home/bill/pymc3/")
import pymc3 as pm

import pandas as pd
import patsy as pt
import numpy as np

import theano.tensor as tt

## actual analysis

The raw data are available from the [Scripps CO2 program](http://scrippsco2.ucsd.edu/data/atmospheric_co2/mlo).  The file is updated fairly often, and other related data sets are hosted on the same site.

In [4]:
# Load the monthly Mauna Loa CO2 record from PyMC3's example data.
# NOTE(review): header=56 is taken from the previously commented-out call;
# confirm it still points at the column-header row if the CSV is refreshed.
data_raw = pd.read_csv(pm.get_data("monthly_in_situ_co2_mlo.csv"), header=56)

# Column names for the raw file; the two "--" columns are unused and dropped.
# ("seasonaly_adjusted" keeps its original spelling on purpose.)
raw_names = ["year", "month", "--", "--", "CO2", "seasonaly_adjusted", "fit",
             "seasonally_adjusted_fit", "CO2_filled", "seasonally_adjusted_filled"]
data_raw.columns = raw_names

# Measurement columns kept in the final frame.
cols = [name for name in raw_names if name not in ("--", "year", "month")]

# -99.99 marks a missing reading: turn it into NaN and drop incomplete rows.
# Building a new frame (instead of inplace edits on a slice) keeps this cell
# idempotent and avoids SettingWithCopyWarning.
data_clean = (
    data_raw[["year", "month"] + cols]
    .replace(to_replace=-99.99, value=np.nan)
    .dropna()
)

# Time index: each monthly reading is stamped on the 15th of its month.
timestamps = pd.to_datetime(data_clean[["year", "month"]].assign(day=15))
data_monthly = data_clean[cols].copy()
data_monthly.index = timestamps

data_monthly.head(5)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 33, saw 4


In [3]:
# Time axis: years elapsed since the first reading.
# pd.Timedelta(1, "Y") is deprecated/rejected by current pandas because a
# "year" is ambiguous; 365.2425 days is the value the "Y" unit stood for, so
# the resulting `t` matches the recorded output (e.g. 31 days -> 0.084875).
reference_time = pd.to_datetime('1958-03-15')
d = data_monthly.index - reference_time
t = d / pd.Timedelta(days=365.2425)

## normalize co2 reading levels (zero mean, unit standard deviation)
y = data_monthly["CO2"].values
mean_co2 = np.mean(y)
std_co2 = np.std(y)
y_n = (y - mean_co2) / std_co2

# store the model inputs next to the raw readings
data_monthly = data_monthly.assign(t=t, y_n=y_n)
data_monthly.head(5)

Unnamed: 0,CO2,seasonaly_adjusted,fit,seasonally_adjusted_fit,CO2_filled,seasonally_adjusted_filled,t,y_n
1958-03-15,315.69,314.43,316.18,314.9,315.69,314.43,0.0,-1.396994
1958-04-15,317.46,315.15,317.3,314.98,317.46,315.15,0.084875,-1.330174
1958-05-15,317.5,314.73,317.84,315.06,317.5,314.73,0.167012,-1.328663
1958-07-15,315.86,315.17,315.87,315.22,315.86,315.17,0.334025,-1.390576
1958-08-15,314.93,316.17,314.01,315.29,314.93,316.17,0.4189,-1.425686


In [4]:
# Split into training and test set at the cutoff date.
# side="right" makes sep_idx the position of the first row strictly after the
# cutoff, so the training set ends at the cutoff (inclusive) and the two sets
# do not share a row. The previous `[:sep_idx+1]` / `[sep_idx:]` slices
# overlapped by one row, and pulled a post-cutoff row into the training set
# whenever the cutoff date itself was missing from the index.
split_date = pd.to_datetime("2003-12-15")
sep_idx = data_monthly.index.searchsorted(split_date, side="right")
data_prior = data_monthly.iloc[:sep_idx, :]
data_after = data_monthly.iloc[sep_idx:, :]

In [5]:
# Overview figure: the whole CO2 record, with the prediction period shaded
# and a dashed reference line at 400 ppm.
from bokeh.io import output_notebook
from bokeh.models import BoxAnnotation, Span
from bokeh.palettes import Magma8
from bokeh.plotting import figure, show

output_notebook()

dates = data_monthly.index
co2_ppm = data_monthly['CO2']
dark_color = Magma8[0]
light_color = Magma8[7]

p = figure(title='Monthly CO2 Readings from Mauna Loa', x_axis_type='datetime',
           plot_width=700, plot_height=300)
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'CO2 [ppm]'

# shaded box from the cutoff date to the right edge of the plot
predict_region = BoxAnnotation(left=pd.to_datetime("2003-12-15"),
                               fill_color=light_color, fill_alpha=0.5)
p.add_layout(predict_region)

# horizontal dashed line at 400 ppm
ppm400 = Span(location=400, dimension='width',
              line_color='red', line_dash='dashed', line_width=2)
p.add_layout(ppm400)

# readings drawn as a line plus faint point markers
p.line(dates, co2_ppm, line_width=2, line_color=dark_color, alpha=0.5)
p.circle(dates, co2_ppm, line_color=dark_color, alpha=0.1, size=2)

show(p)

## Build model

In [6]:
# Training inputs for the model
n1 = len(data_prior)
n2 = len(data_after)
# From here on y_n holds the normalized readings of the training period only.
y_n = data_prior["y_n"].values

# Design matrix: a column of ones (intercept) and the time in years.
X = np.column_stack((np.ones(n1), data_prior["t"].values))

# Ordinary least-squares line through the training data.
# rcond=None selects NumPy's current default cutoff explicitly and avoids the
# FutureWarning that lstsq emits when rcond is omitted (NumPy 1.14+).
β_init = np.linalg.lstsq(X, y_n, rcond=None)[0]
print(β_init)

[-1.61115985  0.05137085]


In [26]:
# Additive Gaussian-process model of the normalized CO2 readings:
# seasonal + medium-term + long-term trend components, plus a noise covariance.
# Every covariance is built with input_dim=2 and active_dims=[1], i.e. it acts
# only on column 1 of X (time in years); column 0 of X is the column of ones.
with pm.Model() as model:
    # Normal distribution bounded below at 0; used for the period `p`.
    PosNormal = pm.Bound(pm.Normal, lower=0.0)
    
with model:
    # yearly periodic component x long term trend:
    # a Periodic kernel multiplied by a Matern52 kernel.
    # if ℓ1 is long, then periodic component decays slowly
    η1 = pm.HalfCauchy("η1", beta=5, testval=1.0)
    ℓ1 = pm.Gamma("ℓ1", alpha=3, beta=0.025, testval=75)
    p  = PosNormal("p", mu=1, sd=0.1, testval=1.0)
    # ℓp prior has mean alpha/beta = 1, the same as the prior mean of p
    ℓp = pm.Gamma("ℓp", alpha=2, beta=2, testval=1.0)
    # NOTE(review): Periodic receives (ℓp, p) positionally. Released PyMC3
    # documents the order as (input_dim, period, ls) — confirm the order
    # expected by the pymc3 version found on sys.path.
    cov_seasonal = tt.square(η1) * pm.gp.cov.Periodic(2, ℓp, p, active_dims=[1]) \
                                 * pm.gp.cov.Matern52(2, ℓ1, active_dims=[1])
    gp_seasonal = pm.gp.Marginal(cov_func=cov_seasonal)
    
    # small/medium term irregularities: ExpQuad kernel
    η2 = pm.HalfCauchy("η2", beta=5, testval=1.0)
    ℓ2 = pm.Gamma("ℓ2", alpha=1.1, beta=0.2, testval=5.0)
    # disabled priors for an extra parameter α (not used by cov_medium below)
    ##α = pm.HalfStudentT("α", mu=1.0, lam=1.0, nu=4)
    #α = pm.Gamma("α", alpha=1.1, beta=0.1, testval=2) # start with some diffusivity
    cov_medium = tt.square(η2) * pm.gp.cov.ExpQuad(2, ℓ2, active_dims=[1])
    gp_medium = pm.gp.Marginal(cov_func=cov_medium)
    
    # long term trend: ExpQuad kernel
    η3 = pm.HalfCauchy("η3", beta=5, testval=1.0)
    ℓ3 = pm.Gamma("ℓ3", alpha=3, beta=0.1, testval=30.0)
    cov_trend = tt.square(η3) * pm.gp.cov.ExpQuad(2, ℓ3, active_dims=[1])
   
    # linear mean function with coefficients β (one per column of X).
    # The Normal prior on β does not constrain the sign of the slope.
    β = pm.Normal("β", mu=0.0, sd=10, shape=2)
    mean = pm.gp.mean.Linear(β)
    gp_trend = pm.gp.Marginal(mean, cov_trend)   

    # noise model: a Matern52 term plus white noise
    ηn = pm.HalfNormal("ηn", sd=1, testval=0.1)
    ℓn = pm.Gamma("ℓn", alpha=1.05, beta=3, testval=0.05)
    σ  = pm.HalfNormal("σ",  sd=1, testval=0.05)
    cov_noise = tt.square(ηn) * pm.gp.cov.Matern52(2, ℓn, active_dims=[1]) +\
                pm.gp.cov.WhiteNoise(σ)
    
    # the full model is the sum of the three component GPs
    gp = gp_seasonal + gp_medium + gp_trend
    
    # earlier explicit-MvNormal formulation, kept disabled for reference
    #mu = mean(X)
    #K = cov(X) + covn(X)
    #L = tt.slinalg.cholesky(K)
    #gp = pm.MvNormal("gp", mu=mu, chol=L, observed=yn)
    # attach the training data (X, y_n) and the noise covariance to the summed GP
    # NOTE(review): `gp.prior(..., n_points=...)` is not the released PyMC3
    # Marginal API — presumably specific to the pymc3 on sys.path; confirm.
    y_ = gp.prior("y_", n_points=X.shape[0], X=X, y=y_n, noise=cov_noise)
    #gp = pm.gp.GP("gp", X, cov_func=cov, cov_func_noise=covn, 
    #              mean_func=mean, observed=yn)
    # MAP point estimate (conjugate-gradient optimizer). It is not passed to
    # pm.sample below; later cells hand it to pm.sample_ppc as the single point.
    # NOTE(review): the recorded output shows "lp = nan" during this
    # optimization — verify that `start` is a usable point.
    start = pm.find_MAP(method="CG")
    # 1000 posterior draws; the trace is used by the traceplot cell
    tr = pm.sample(1000)
    # disabled alternative: full-rank ADVI fit instead of sampling
    #tr = pm.fit(50000, method='fullrank_advi').sample(10000)

lp = nan, ||grad|| = 13.645:   1%|          | 259/50000 [01:04<2:54:29,  4.75it/s]        
Auto-assigning NUTS sampler...
Initializing NUTS using advi+adapt_diag...
Average Loss = -1,161.7:  11%|█▏        | 22900/200000 [1:07:02<7:27:33,  6.60it/s] 
Convergence archived at 22900
Interrupted at 22,900 [11%]: Average Loss = -951.91
  % (self._chain_id, n_diverging))



In [2]:
# Trace plot for the trace `tr`; needs `pm` and `tr` from the cells above.
# NOTE(review): the recorded NameError and the execution count In [2] show
# this cell was last run on a fresh kernel before the imports — re-run the
# notebook top to bottom.
%matplotlib inline
pm.traceplot(tr);

NameError: name 'pm' is not defined

In [28]:
# Predict with the full (summed) GP over the whole record.
n = data_monthly.shape[0]
# inputs laid out like the training matrix X: a ones column, then time in years
Xnew = np.column_stack((np.ones(n), data_monthly["t"].values))

n_draws = 100
with model:
    f_pred = gp.conditional("f_pred", n_points=n, Xnew=Xnew)
    # draw n_draws samples of f_pred with the parameters fixed at `start`
    samples = pm.sample_ppc([start], vars=[f_pred], samples=n_draws)

 34%|███▍      | 34/100 [00:17<00:28,  2.29it/s]


In [12]:
# Undo the normalization so the draws are in ppm again.
preds = mean_co2 + std_co2 * samples[f_pred.name]

## make plot: observed readings with the predictive draws on top
dates = data_monthly.index
p = figure(x_axis_type='datetime', plot_width=700, plot_height=300)
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'CO2 [ppm]'

# observed data
p.line(dates, data_monthly['CO2'], line_width=1, line_color="black")

# shaded prediction period and dashed 400 ppm reference line
predict_region = BoxAnnotation(left=pd.to_datetime("2003-12-15"),
                               fill_color='firebrick', fill_alpha=0.1)
ppm400 = Span(location=400, dimension='width',
              line_color='black', line_dash='dashed', line_width=1)
for annotation in (predict_region, ppm400):
    p.add_layout(annotation)

# predictions: one faint line per draw
p.multi_line([dates for _ in range(n_draws)],
             [preds[draw] for draw in range(n_draws)],
             color="green", alpha=0.05)
show(p)



In [16]:
# Seasonal component alone, conditioned on the training data and noise model.
given = dict(X=X, y=y_n, noise=cov_noise)
with model:
    f_seasonal = gp_seasonal.conditional("f_seasonal", n_points=n, Xnew=Xnew, **given)
    # n_draws samples of the component with the parameters fixed at `start`
    samples = pm.sample_ppc([start], vars=[f_seasonal], samples=n_draws)

100%|██████████| 100/100 [00:38<00:00,  2.66it/s]


In [17]:
# Seasonal component in ppm. gp_seasonal is built without a mean function, so
# it is an additive offset: it is rescaled by std_co2 only. Adding mean_co2 as
# well (as done before) shifted the deseasonalized series below by -mean_co2,
# which did not match the 'CO2 [ppm]' axis.
preds = samples[f_seasonal.name] * std_co2

# observed readings with each seasonal draw removed
co2_observed = data_monthly["CO2"].values
preds_deseason = [co2_observed - preds[i, :] for i in range(n_draws)]

## make plot
p = figure(x_axis_type='datetime', plot_width=700, plot_height=300)
p.yaxis.axis_label = 'CO2 [ppm]'
p.xaxis.axis_label = 'Date'

# shaded prediction period
predict_region = BoxAnnotation(left=pd.to_datetime("2003-12-15"),
                               fill_alpha=0.1, fill_color='firebrick')
p.add_layout(predict_region)

# deseasonalized readings: one faint line per draw
p.multi_line([data_monthly.index] * n_draws, preds_deseason,
             color="green", alpha=0.05)
show(p)

In [20]:
# Medium-term component alone, conditioned on the training data and noise model.
given = dict(X=X, y=y_n, noise=cov_noise)
with model:
    f_medium = gp_medium.conditional("f_medium1", n_points=n, Xnew=Xnew, **given)
    # n_draws samples of the component with the parameters fixed at `start`
    samples = pm.sample_ppc([start], vars=[f_medium], samples=n_draws)

100%|██████████| 100/100 [00:33<00:00,  2.93it/s]


In [21]:
# Rescale the medium-term draws: multiply by std_co2 and add mean_co2.
preds = mean_co2 + std_co2 * samples[f_medium.name]

## make plot
dates = data_monthly.index
p = figure(x_axis_type='datetime', plot_width=700, plot_height=300)
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'CO2 [ppm]'

# shaded prediction period
predict_region = BoxAnnotation(left=pd.to_datetime("2003-12-15"),
                               fill_color='firebrick', fill_alpha=0.1)
p.add_layout(predict_region)

# predictions: one faint line per draw
p.multi_line([dates for _ in range(n_draws)],
             [preds[draw] for draw in range(n_draws)],
             color="green", alpha=0.05)
show(p)

In [22]:
# Long-term trend component alone, conditioned on the training data and noise model.
given = dict(X=X, y=y_n, noise=cov_noise)
with model:
    f_trend = gp_trend.conditional("f_trend", n_points=n, Xnew=Xnew, **given)
    # n_draws samples of the component with the parameters fixed at `start`
    samples = pm.sample_ppc([start], vars=[f_trend], samples=n_draws)

100%|██████████| 100/100 [00:30<00:00,  3.74it/s]


In [23]:
# Undo the normalization so the trend draws are in ppm again.
preds = mean_co2 + std_co2 * samples[f_trend.name]

## make plot: observed readings with the trend draws on top
dates = data_monthly.index
p = figure(x_axis_type='datetime', plot_width=700, plot_height=300)
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'CO2 [ppm]'

# observed data
p.line(dates, data_monthly['CO2'], line_width=1, line_color="black")

# shaded prediction period
predict_region = BoxAnnotation(left=pd.to_datetime("2003-12-15"),
                               fill_color='firebrick', fill_alpha=0.1)
p.add_layout(predict_region)

# predictions: one faint line per draw
p.multi_line([dates for _ in range(n_draws)],
             [preds[draw] for draw in range(n_draws)],
             color="green", alpha=0.05)
show(p)