In [28]:
import os
from sklearn.linear_model import PoissonRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error 
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.stats import poisson
from scipy.stats import nbinom
from patsy import dmatrices
os.chdir(os.path.expanduser("~/Desktop/Classes/Research/deep_uncertainty/deep_uncertainty"))
from deep_uncertainty.evaluation.calibration import compute_mean_calibration

# Bikes Data

# Poisson

In [13]:
data = pd.read_csv('deep_uncertainty/data/bikes/day.csv')

In [14]:
X = data.iloc[:,:-1]
X = X.drop(['dteday', 'instant'], axis=1)
mask = np.random.rand(len(data)) < 0.8
df_train = data[mask]
df_test = data[~mask]

expr = "cnt ~ " + " + ".join(data.columns[:-1])
expr

y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
# print(poisson_training_results.summary())

In [16]:
y_preds = poisson_training_results.mu
posterior_predictive = poisson(mu=y_preds)
posterior_predictive
len(y_preds)

581

In [17]:
# Poisson calibration
compute_mean_calibration(np.array(y_test), posterior_predictive)

0.3529786891772373

# Negative Binomial

# News Data

# Poisson

In [18]:
news = pd.read_csv('deep_uncertainty/data/news/OnlineNewsPopularity.csv')
news = news.drop(labels = "url", axis = 1)
news.rename(columns = lambda x: x.strip(), inplace = True)

mask = np.random.rand(len(news)) < 0.8
df_train = news[mask]
df_test = news[~mask]

expr = "shares ~ " + " + ".join(news.columns[:-1])
expr

y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
# print(poisson_training_results.summary())

In [19]:
y_preds = poisson_training_results.mu
posterior_predictive = poisson(mu=y_preds)
posterior_predictive
len(y_preds)

31726

In [20]:
# Poisson calibration
compute_mean_calibration(np.array(y_test), posterior_predictive)

0.34470697613399937

# Negative Binomial

# Traffic Data

# Poisson

In [35]:
data = pd.read_csv('deep_uncertainty/data/traffic/Metro_Interstate_Traffic_Volume.csv')
# how to break up the date time into year, month, day, hour
data['date_time'] = pd.to_datetime(data['date_time'])
data['year'] = data['date_time'].dt.year
data['month'] = data['date_time'].dt.month
data['day'] = data['date_time'].dt.day
data['hour'] = data['date_time'].dt.hour
data.drop('date_time', axis=1, inplace=True)
# enconde all the categorical variables
le = preprocessing.LabelEncoder()
for column in data.columns:
    if data[column].dtype == type(object):
        data[column] = le.fit_transform(data[column].astype(str))
data.rename(columns = lambda x: x.strip(), inplace = True)

mask = np.random.rand(len(data)) < 0.8
df_train = data[mask]
df_test = data[~mask]

expr = "traffic_volume ~ " + " + ".join(data.columns[:-1])
expr

y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
# print(poisson_training_results.summary())

y_preds = poisson_training_results.mu
posterior_predictive = poisson(mu=y_preds)
posterior_predictive
len(y_preds)

38521

In [26]:
# Poisson calibration
compute_mean_calibration(np.array(y_test), posterior_predictive)

0.347480321449666

# Negative Binomial

In [36]:
df_train["BB_LAMBDA"] = poisson_training_results.mu
df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['traffic_volume'] - x['BB_LAMBDA'])**2 - x['BB_LAMBDA']) / x['BB_LAMBDA'], axis=1)
ols_expr = """AUX_OLS_DEP ~ BB_LAMBDA - 1"""
aux_olsr_results = smf.ols(ols_expr, df_train).fit()
nb2_training_results = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()
nb2_predictions = nb2_training_results.get_prediction(X_test)
predictions_summary_frame = nb2_predictions.summary_frame()
predicted_counts=predictions_summary_frame['mean']

y_preds = poisson_training_results.mu
y_test = y_test['traffic_volume']
posterior_predictive = nbinom(p=y_preds)
len(y_preds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["BB_LAMBDA"] = poisson_training_results.mu
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['traffic_volume'] - x['BB_LAMBDA'])**2 - x['BB_LAMBDA']) / x['BB_LAMBDA'], axis=1)
  nb2_training_results = sm.GLM(y_train, X_train,family=sm.families.NegativeBinomial(alpha=aux_olsr_results.params[0])).fit()


TypeError: _parse_args() missing 1 required positional argument: 'n'

In [None]:
# Negative Binomial calibration
compute_mean_calibration(np.array(y_test), posterior_predictive)