In [None]:
# Import libraries
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
from matplotlib.dates import ConciseDateFormatter, AutoDateLocator
import seaborn as sns
from scipy import signal

from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [None]:
# Load the raw weather export, keep only the Fremont Canyon station, and
# re-index it on a daily DatetimeIndex (asfreq inserts all-NaN rows for any
# calendar day that has no record).
weather = pd.read_csv('./data/weather_data.csv')
fremont = (
    weather.loc[weather.NAME == 'FREMONT CANYON CALIFORNIA, CA US']
    .copy()
    .assign(DATE=lambda station: pd.to_datetime(station['DATE']))
    .set_index('DATE')
    .asfreq('d')
)

# Backfill empty dates: each listed day receives a copy of the row of the day
# before it. Order matters — 2016-02-21 is copied from 2016-02-20, which is
# itself filled in by the preceding iteration.
for empty_day, donor_day in (
    ('2016-02-20', '2016-02-19'),
    ('2016-02-21', '2016-02-20'),
    ('2021-07-22', '2021-07-21'),
    ('2023-12-31', '2023-12-30'),
):
    fremont.loc[empty_day] = fremont.loc[donor_day]

# Training data is everything before 2024; validation is calendar year 2024.
in_training_period = fremont.index < '2024-01-01'
in_validation_year = (fremont.index >= '2024-01-01') & (fremont.index < '2025-01-01')
fremont_train = fremont[in_training_period]
fremont_valid = fremont[in_validation_year]
fremont_valid_y = fremont_valid['TAVG']

# Shared (width, height) in inches for every full-width figure in the notebook.
FIGSIZE = (28, 10)

In [None]:
# Display the station frame (daily DatetimeIndex, after the manual backfills) as a sanity check.
fremont

In [None]:
# Plot the full daily average-temperature record for the station.
fig = plt.figure(figsize=FIGSIZE)
ax = fig.gca()
plt.xticks(rotation=90)
ax.set_xlabel("Year")
ax.set_ylabel("Average Temp. (F)")
ax.set_title("Average Daily Temperature (2015 - 2025)")
fig.suptitle("FREMONT CANYON CALIFORNIA, CA US")

# Let matplotlib choose date tick positions and render them compactly.
locator = AutoDateLocator()
formatter = ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

G = sns.lineplot(
    ax=ax,
    data=fremont,
    x=fremont.index,
    y=fremont.TAVG,
    linewidth=1,
)

In [None]:
# Centered rolling means of the daily average temperature at three window
# sizes. The year and month windows accept partially filled windows
# (min_periods); the week window requires all 7 observations.
year_ma = fremont.TAVG.rolling(365, min_periods=180, center=True).mean()
month_ma = fremont.TAVG.rolling(30, min_periods=15, center=True).mean()
week_ma = fremont.TAVG.rolling(7, center=True).mean()

fig = plt.figure(figsize=FIGSIZE)
ax = fig.gca()
plt.xticks(rotation=90)
ax.set_xlabel("Year")
ax.set_ylabel("Average Temp. (F)")

# Let matplotlib choose date tick positions and render them compactly.
locator = AutoDateLocator()
formatter = ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

# Faded raw series underneath, smoothed series on top.
sns.lineplot(
    ax=ax,
    data=fremont,
    x=fremont.index,
    y=fremont.TAVG,
    alpha=.2,
    linewidth=1.5,
    label="Daily Temperature",
)
sns.lineplot(ax=ax, x=year_ma.index, y=year_ma, linewidth=3, label="Year MA")
sns.lineplot(ax=ax, x=month_ma.index, y=month_ma, linewidth=3, label="Month MA")

In [None]:
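# TEMPLATE: copy this cell, uncomment it, and replace [NAME] and the 0 placeholders with a model's name and metrics.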

# plt.figure(figsize=FIGSIZE)
# plt.xticks(rotation=90)
# plt.xlabel("Year")
# plt.ylabel("Average Temp. (F)")
# plt.title(f"[NAME] Model Predictions (MAE: {0:.2f}) - (MAPE: {(0 * 100):.2f}%) - (Accuracy: {(0 * 100):.2f}%) - (MSE: {0:.2F})")

# ax = plt.gca()
# locator = AutoDateLocator()
# formatter = ConciseDateFormatter(locator)
# ax.xaxis.set_major_locator(locator)
# ax.xaxis.set_major_formatter(formatter)

# sns.lineplot(ax=ax, data=fremont, x=fremont.index, y=fremont.TAVG, alpha=.2, linewidth=1.5, label="Daily Temperature")


In [None]:
# Linear trend baseline: regress TAVG on a constant plus a linear time trend.
# The intercept is supplied by the DeterministicProcess constant column, so the
# regression itself is fitted without one.
linear_model = LinearRegression(fit_intercept=False)
linear_pipeline = Pipeline(steps=[('model', linear_model)])

linear_model_dp = DeterministicProcess(index=fremont_train.index, constant=True, order=1, drop=True)
linear_model_X = linear_model_dp.in_sample()
# Forecast exactly one step per validation row. A hard-coded horizon (366)
# only matches when the validation frame covers a full leap year; any other
# length makes the metric calls below fail on mismatched sample counts.
linear_model_X_valid = linear_model_dp.out_of_sample(steps=len(fremont_valid))
linear_model_y = fremont_train.TAVG
linear_pipeline.fit(linear_model_X, linear_model_y)
linear_model_train_preds = pd.Series(linear_pipeline.predict(linear_model_X), index=linear_model_X.index)
linear_model_valid_preds = pd.Series(linear_pipeline.predict(linear_model_X_valid), index=linear_model_X_valid.index)

# Test model accuracy on the validation period.
linear_model_mae = mean_absolute_error(fremont_valid_y, linear_model_valid_preds)
linear_model_mape = mean_absolute_percentage_error(fremont_valid_y, linear_model_valid_preds)
# "Accuracy" here is simply the complement of MAPE.
linear_model_accuracy = 1 - linear_model_mape
linear_model_mse = mean_squared_error(fremont_valid_y, linear_model_valid_preds)
# Largest absolute miss; the subtraction aligns the two series on their date index.
linear_model_max_err = (fremont_valid_y - linear_model_valid_preds).abs().max()

plt.figure(figsize=FIGSIZE)
plt.xticks(rotation=90)
plt.xlabel("Year")
plt.ylabel("Average Temp. (F)")
plt.title(f"Linear Model Predictions (MAE: {linear_model_mae:.2f}) - (MAPE: {(linear_model_mape * 100):.2f}%) - (Accuracy: {(linear_model_accuracy * 100):.2f}%) (MSE: {linear_model_mse:.2f}) - (Max Error {linear_model_max_err:.2f})")

# Let matplotlib choose date tick positions and render them compactly.
ax = plt.gca()
locator = AutoDateLocator()
formatter = ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

# Faded observations, then the in-sample fit and the out-of-sample forecast.
sns.lineplot(ax=ax, data=fremont, x=fremont.index, y=fremont.TAVG, alpha=.2, linewidth=1.5, label="Daily Temperature")
sns.lineplot(ax=ax, x=linear_model_train_preds.index, y=linear_model_train_preds, linewidth=3, label="Training Data")
sns.lineplot(ax=ax, x=linear_model_valid_preds.index, y=linear_model_valid_preds, linewidth=3, label="Model Predictions")

In [None]:
# Fourier Linear Regression Model
# Features: constant + linear trend + order-8 calendar Fourier terms with
# yearly frequency, all generated by a DeterministicProcess on the training index.
fourier_lr = CalendarFourier(freq="YE", order=8)
fourier_lr_dp = DeterministicProcess(index=fremont_train.index, constant=True, order=1, seasonal=False, additional_terms=[fourier_lr], drop=True)
fourier_lr_X_train = fourier_lr_dp.in_sample()
# Forecast exactly one step per validation row. A hard-coded horizon (366)
# only matches when the validation frame covers a full leap year; any other
# length makes the metric calls below fail on mismatched sample counts.
fourier_lr_model_valid_X = fourier_lr_dp.out_of_sample(steps=len(fremont_valid))

# The constant column already acts as the intercept, so none is fitted here.
fourier_lr_model = LinearRegression(fit_intercept=False)
fourier_lr_model.fit(fourier_lr_X_train, fremont_train.TAVG)
fourier_lr_model_train_preds = pd.Series(fourier_lr_model.predict(fourier_lr_X_train), index=fourier_lr_X_train.index)
fourier_lr_model_valid_preds = pd.Series(fourier_lr_model.predict(fourier_lr_model_valid_X), index=fourier_lr_model_valid_X.index)

# Test model accuracy on the validation period.
fourier_lr_model_mae = mean_absolute_error(fremont_valid_y, fourier_lr_model_valid_preds)
fourier_lr_model_mape = mean_absolute_percentage_error(fremont_valid_y, fourier_lr_model_valid_preds)
# "Accuracy" here is simply the complement of MAPE.
fourier_lr_model_accuracy = 1 - fourier_lr_model_mape
fourier_lr_model_mse = mean_squared_error(fremont_valid_y, fourier_lr_model_valid_preds)
# Largest absolute miss; the subtraction aligns the two series on their date index.
fourier_lr_max_err = (fremont_valid_y - fourier_lr_model_valid_preds).abs().max()

plt.figure(figsize=FIGSIZE)
plt.xticks(rotation=90)
plt.xlabel("Year")
plt.ylabel("Average Temp. (F)")
plt.title(f"Fourier Linear Regression Model Predictions [Order = 8] (MAE: {fourier_lr_model_mae:.2f}) - (MAPE: {(fourier_lr_model_mape * 100):.2f}%) - (Accuracy: {(fourier_lr_model_accuracy * 100):.2f}%) - (MSE: {fourier_lr_model_mse:.2F}) - (Max Error: {fourier_lr_max_err:.2f})")

# Let matplotlib choose date tick positions and render them compactly.
ax = plt.gca()
locator = AutoDateLocator()
formatter = ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

# Faded observations, then the in-sample fit and the out-of-sample forecast.
sns.lineplot(ax=ax, data=fremont, x=fremont.index, y=fremont.TAVG, alpha=.2, linewidth=1.5, label="Daily Temperature")
sns.lineplot(ax=ax, x=fourier_lr_model_train_preds.index, y=fourier_lr_model_train_preds, linewidth=3, label="Training Data")
G = sns.lineplot(ax=ax, x=fourier_lr_model_valid_preds.index, y=fourier_lr_model_valid_preds, linewidth=3, label="Model Predictions")

In [None]:
# Check for serial dependence: pair each day's TAVG with the previous day's
# value and measure how strongly the two are correlated.
fremont_lag_1 = pd.DataFrame(
    {
        'y': fremont.TAVG,
        'y_lag_1': fremont.TAVG.shift(1),
    }
).bfill()  # the first row has no previous day, so it is filled from the row after it
fremont_lag_corr = fremont_lag_1.corr().loc['y_lag_1', 'y']

# Scatter of today vs. yesterday with a fitted regression line.
plt.gca().set_title(f"1 Day Lag Plot (corr: {fremont_lag_corr:.2f})")
sns.regplot(data=fremont_lag_1, x='y_lag_1', y='y', line_kws={"color": "orange"})

In [None]:
# 1 Day Lag Model
# Predict each day's TAVG from the previous day's TAVG with a linear regression.
lag_1_model = LinearRegression(fit_intercept=True)
fremont_train_lag_1 = pd.DataFrame({'y_lag_1': fremont_train.TAVG.shift(1)})
# The first training day has no predecessor; back-filling gives it its own
# temperature as the lag.
# NOTE(review): that single row uses its target as its feature — consider
# dropping it instead; kept here so lag_1_X / lag_1_y retain their shape.
fremont_train_lag_1 = fremont_train_lag_1.bfill()
lag_1_X = fremont_train_lag_1.copy()
lag_1_y = fremont_train.TAVG
lag_1_model.fit(lag_1_X, lag_1_y)

lag_1_model_train_preds = pd.Series(lag_1_model.predict(lag_1_X), index=lag_1_X.index)

# Take the lag from the FULL series and then select the validation dates, so
# the first validation day is predicted from the last training day's
# temperature. Shifting inside the validation frame and back-filling would put
# that day's own target value into its feature (leakage).
fremont_valid_X = pd.DataFrame({'y_lag_1': fremont.TAVG.shift(1).reindex(fremont_valid.index)})
# Retained so any remaining gaps in the lag are still filled as before.
fremont_valid_X = fremont_valid_X.bfill()
lag_1_model_valid_preds = pd.Series(lag_1_model.predict(fremont_valid_X), index=fremont_valid_X.index)

# Test model accuracy on the validation period.
lag_1_model_mae = mean_absolute_error(fremont_valid_y, lag_1_model_valid_preds)
lag_1_model_mape = mean_absolute_percentage_error(fremont_valid_y, lag_1_model_valid_preds)
# "Accuracy" here is simply the complement of MAPE.
lag_1_model_accuracy = 1 - lag_1_model_mape
lag_1_model_mse = mean_squared_error(fremont_valid_y, lag_1_model_valid_preds)
# Largest absolute miss; the subtraction aligns the two series on their date index.
lag_1_model_max_err = (fremont_valid_y - lag_1_model_valid_preds).abs().max()

plt.figure(figsize=FIGSIZE)
plt.xticks(rotation=90)
plt.xlabel("Year")
plt.ylabel("Average Temp. (F)")
plt.title(f"1 Day Lag Model Predictions (MAE: {lag_1_model_mae:.2f}) - (MAPE: {(lag_1_model_mape * 100):.2f}%) - (Accuracy: {(lag_1_model_accuracy * 100):.2f}%) - (MSE: {lag_1_model_mse:.2F}) - (Max Error {lag_1_model_max_err:.2f})")

# Let matplotlib choose date tick positions and render them compactly.
ax = plt.gca()
locator = AutoDateLocator()
formatter = ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

# Faded observations, then the in-sample fit and the validation predictions.
G = sns.lineplot(ax=ax, data=fremont, x=fremont.index, y=fremont.TAVG, alpha=.2, linewidth=1.5, label="Daily Temperature")
G = sns.lineplot(ax=ax, x=lag_1_model_train_preds.index, y=lag_1_model_train_preds, linewidth=3, label="Training Data")
G = sns.lineplot(ax=ax, x=lag_1_model_valid_preds.index, y=lag_1_model_valid_preds, linewidth=3, label="Model Predictions")

In [None]:
# Spot-check one validation day: 1-day-lag model prediction vs. the recorded TAVG.
april_11_predicted = lag_1_model_valid_preds.loc['2024-04-11']
print(f"{'Predicted on April 11, 2024:':30} {april_11_predicted:.1f} F")
april_11_actual = fremont.loc['2024-04-11'].TAVG
print(f"{'Actual on April 11, 2024:':30} {april_11_actual} F")