In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Forecast error (residual forecast error): expected value minus predicted value,
# computed element by element
expected = [0.0, 0.5, 0.0, 0.5, 0.0]
predictions = [0.2, 0.4, 0.1, 0.6, 0.2]
forecast_errors = [actual - forecast for actual, forecast in zip(expected, predictions)]
print('Forecast Errors: %s' % forecast_errors)

## Mean forecast error (forecast bias): the arithmetic mean of the forecast errors
## computed above from the same `expected` / `predictions` lists
bias = sum(forecast_errors) * 1.0 / len(expected)
print('Bias: %f' % bias)

## MAE, MSE and RMSE 

In [3]:
# Persistence Model for Forecasting 
## It is evaluated with a TEST HARNESS, which comprises three parts:
## 1. The dataset to be used to train and evaluate models
## 2. The resampling technique to be used to estimate performance, e.g. a train-test split
## 3. The performance measure, e.g. MAE

# Persistence models in TS = The simplest BASELINE model 
# A model that is SIMPLE, FAST and REPEATABLE, i.e. deterministic: it gives the same output for a given input

In [4]:
## Transform the univariate dataset into a supervised learning problem 
## Establish the train and test datasets for the test harness
## Define the persistence model 
## Make a forecast and establish a performance baseline 
## Review example and plot the output 

In [5]:
## Define the dataset
# Read the shampoo-sales CSV (first column as the index) and reduce the single
# remaining data column to a Series.
# NOTE: the `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and
# removed in 2.0, while `on_bad_lines` only exists from pandas 1.3 onwards, so
# passing both raises TypeError on current pandas. Calling `.squeeze('columns')`
# on the returned frame is the documented replacement and gives the same result.
series = pd.read_csv(
    '../input/shampoo-saled-dataset/shampoo_sales.csv',
    header=0,
    index_col=0,
    parse_dates=True,
    on_bad_lines='skip',
).squeeze('columns')

# Supervised-learning framing: column 't' is the series shifted by one step,
# column 't+1' is the series itself (so the first 't' value is NaN).
dataframe = pd.concat([series.shift(1), series], axis=1)
dataframe.columns = ['t', 't+1']
dataframe.head()

In [6]:
## Train and test sets
## The first 66% of the rows form the training set, the remainder the test set.
X = dataframe.values
train_size = int(len(X) * 0.66)

# Training rows start at index 1, so row 0 is left out of both sets.
train = X[1:train_size]
test = X[train_size:]

# Column 0 is the input ('t'), column 1 is the target ('t+1').
train_X = train[:, 0]
train_y = train[:, 1]
test_X = test[:, 0]
test_y = test[:, 1]

In [7]:
# Persistence Algorithm
def model_persistence(x):
    """Return the input unchanged.

    This is the persistence (naive) forecast: the value handed in is used
    as-is as the prediction.

    Parameters
    ----------
    x : the input observation.

    Returns
    -------
    The same object that was passed in as ``x``.
    """
    return x

In [8]:
from sklearn.metrics import mean_squared_error

# Walk-forward validation: every test input is fed through the persistence
# model in order, and the collected forecasts are scored against the test
# targets with the root mean squared error.
preds = [model_persistence(x) for x in test_X]
mse = mean_squared_error(test_y, preds)
rmse = np.sqrt(mse)
print('Test RMSE: %.3f' % rmse)

In [9]:
# Display the test inputs (the values the persistence model returns as forecasts)
test_X

In [10]:
# Plotting the results of this model 
import matplotlib.pyplot as plt 
# Draw the training targets first, then the test targets and the forecasts.
# The two test-period lines are left-padded with None, one per training point,
# so they start where the training line ends.
padding = [None] * len(train_y)
plt.plot(train_y)
plt.plot(padding + list(test_y))
plt.plot(padding + list(preds))
plt.show()

# Visualizing Persistence Model Forecast 

In [30]:
# Build a lagged dataset from the daily female births CSV:
# column 't' is the data shifted by one step, column 't+1' is the data itself.
values = pd.read_csv(
    "../input/daily-total-female-births-in-california-1959/daily-total-female-births-CA.csv",
    header=0,
    index_col=0,
    parse_dates=True,
    on_bad_lines='skip',
)
data = pd.concat([values.shift(1), values], axis=1)
data.columns = ['t', 't+1']

# Drop the rows that contain NaN (the shift leaves the first 't' value empty)
data = data.dropna()

# Split: the first 66% of the rows for training, the rest for testing
X = data.values
train_size = int(len(X) * 0.66)
train_data = X[:train_size]
test_data = X[train_size:]

# Column 0 is the input, column 1 is the target
X_train = train_data[:, 0]
y_train = train_data[:, 1]
X_test = test_data[:, 0]
y_test = test_data[:, 1]

# Persistence model: the forecast for each test step is simply its input value
preds = list(X_test)

# Residual = observed target minus forecast, one per test step
res = [actual - forecast for actual, forecast in zip(y_test, preds)]

# Plot the residuals as a line (top panel) and as a histogram (bottom panel).
# Both panels are created on ONE figure and plt.show() is called once at the
# end: calling plt.show() between plt.subplot(211) and plt.subplot(212) closes
# the first figure, so each panel used to be rendered alone in its own
# half-empty figure.
fig, (ax_line, ax_hist) = plt.subplots(2, 1)

# Line plot of the residuals
ax_line.plot(res)
ax_line.set_title('Residuals')

# Plot histogram
ax_hist.hist(res)
ax_hist.set_title('Residual histogram')

fig.tight_layout()
plt.show()

In [38]:
# Summary statistics of the residuals.
# `res` is rebound to a DataFrame here, so later code that reads `res` receives
# the DataFrame rather than the original list.
res = pd.DataFrame(res)
residual_summary = res.describe()
print(residual_summary)

# A naive form of bias correction would be to add residual errors to forecasts
# But this method is naive, and it may not improve the model significantly 

In [41]:
# Better ways to check for the randomness in plots 
# Plot line, KDE, histogram, Q-Q and autocorrelation plots
from pandas.plotting import autocorrelation_plot as ap
# Draw the autocorrelation plot of the residuals held in `res`
ap(res)

# The autocorrelation values should be approaching zero, which is not the case for this set of residual errors
# Hence, bias correction is required