In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Load the prepared forecasting table and index it by calendar year.
df = (
    pd.read_csv('freight_data/forecast_prep.csv')
    .assign(Year=lambda frame: pd.to_datetime(frame['Year'], format='%Y'))
    .set_index('Year')
)
# Tag the DatetimeIndex with a year-start ('YS') frequency.
df.index.freq = 'YS'

# Commodity code -> readable label.
commodity_mapping = {
    '5': 'Meat/seafood',
    '8': 'Alcoholic beverages',
    '9': 'Tobacco prods.',
    '21': 'Pharmaceuticals',
}

In [None]:
# Display the loaded frame (indexed by Year) to check its columns before modelling.
df

### Time-Series Analysis

Rolling validation: train on 2013-2018, then predict 2019.

In [4]:
# Training window: every row labelled 2013 through 2018 (label slices include both ends).
x_train = df.loc[slice('2013', '2018')].copy()
# Hold-out: the 2019 label, selected with a list so the result stays a DataFrame,
# then re-wrapped with the full column set of `df`.
x_test = pd.DataFrame(df.loc[['2019']].copy(), columns=df.columns)

In [5]:
# Display the held-out 2019 frame.
x_test

In [11]:
# Shared StandardScaler instance; apply_scaler (defined in this cell) reads it as a module-level name.
scaler = StandardScaler()
# Un-lagged tonnage and value columns, one per commodity code (5, 8, 9, 21).
tons_columns = ['tons_5', 'tons_8', 'tons_9', 'tons_21']
value_columns = ['value_5', 'value_8', 'value_9', 'value_21']

# Columns that get standardised before modelling.
features = [
    # Three indicator columns (the first two look like external series ids — TODO confirm source).
    'MEHOINUSGAA672N',
    'GARETAILNQGSP',
    'Population',
    # Lagged tonnage per commodity code.
    'tons_5_lagged',
    'tons_8_lagged',
    'tons_9_lagged',
    'tons_21_lagged',
    # Lagged value per commodity code.
    'value_5_lagged',
    'value_8_lagged',
    'value_9_lagged',
    'value_21_lagged',
]

def apply_scaler(data_frame, features, fit=True):
    """Return a copy of `data_frame` with the `features` columns standardised.

    Uses the notebook-level `scaler` object.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the columns to scale. It is left untouched; the
        scaled values are written into a copy.
    features : list of str
        Names of the columns to standardise.
    fit : bool, default True
        When True, the scaler learns its statistics from `data_frame`
        (`fit_transform`). When False, the statistics already held by the
        scaler are reused (`transform`); use this for validation/test frames
        so they are scaled with the training statistics.

    Returns
    -------
    pandas.DataFrame
        Copy of `data_frame` whose `features` columns hold the scaled values.
    """
    shipment_features = data_frame[features]
    # Re-fitting on a hold-out frame would leak its statistics; on a one-row
    # frame it would also collapse every feature to 0 (value minus its own mean).
    transform = scaler.fit_transform if fit else scaler.transform
    scaled_shipments = pd.DataFrame(
        transform(shipment_features),
        index=shipment_features.index,
        columns=shipment_features.columns,
    )
    # Work on a copy so re-running the cell never scales already-scaled data.
    scaled_frame = data_frame.copy()
    scaled_frame[features] = scaled_shipments[features]
    return scaled_frame

# Learn the scaling statistics from the training window only ...
x_train_scaled = apply_scaler(x_train, features)
# ... and reuse them, without re-fitting, for the hold-out year.
x_test_scaled = apply_scaler(x_test, features, fit=False)

In [12]:
# Display the index of the scaled training frame.
x_train_scaled.index

In [7]:
# Line chart of the value_5 column against the training index.
fig = px.line(
    x_train_scaled,
    x=x_train_scaled.index,
    y='value_5',
    title='Value of Meat/Seafood freight shipments annually',
)
fig.show()

Model 1: expected value for commodity group 5 (Meat/Seafood)

In [8]:
# Target series: the value_5 column of each split.
y_train, y_test = (frame['value_5'] for frame in (x_train_scaled, x_test_scaled))

# Lagged columns of the other commodity codes (8, 9, 21); removed from the regressors below.
specific_cols = [
    'tons_8_lagged', 'tons_9_lagged', 'tons_21_lagged',
    'value_8_lagged', 'value_9_lagged', 'value_21_lagged',
]

# Drop the un-lagged tons_*/value_* columns and the lags listed above from both splits;
# what remains is passed to the model as exogenous regressors.
x_train_scaled, x_test_scaled = (
    frame.drop(columns=tons_columns + value_columns + specific_cols)
    for frame in (x_train_scaled, x_test_scaled)
)

### Visualizing value_5 trend

In [9]:
# Display the training frame that is passed to the model as `exog`.
x_train_scaled

In [10]:
# ARIMA with order (1, 1, 1) on the target, using the remaining scaled columns as exogenous regressors.
arima_order = (1, 1, 1)
model = ARIMA(endog=y_train, exog=x_train_scaled, order=arima_order)
model_fit = model.fit()

# Predict over the span of the hold-out index, supplying the hold-out regressors,
# and label the result with the hold-out index.
start_index, end_index = y_test.index[0], y_test.index[-1]
predictions = model_fit.predict(
    start=start_index,
    end=end_index,
    exog=x_test_scaled,
).set_axis(y_test.index)

# Mean squared error between the hold-out target and the prediction.
print(mean_squared_error(y_test, predictions))

## Day 12

Using 2017-2023 data for forecasting instead: train on 2017-2021, validate on 2022, and hold out 2023.

In [73]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, root_mean_squared_error
import matplotlib.pyplot as plt
import plotly.express as px
from utils.feature_engineering import apply_scaler

In [74]:
# Read the processed shipment table and rename the 'Unnamed: 0' column to 'Year'.
shipment_df = (
    pd.read_csv('../freight_data/processed/Georga_AIS_2012-2023_minus_inflation.csv')
    .rename(columns={'Unnamed: 0': 'Year'})
)

In [75]:
# Keep 2017 onwards, convert Year to datetimes and make it the index.
# Done as one chain so no column is assigned on a boolean-filtered frame,
# which is what triggers pandas' SettingWithCopyWarning.
shipment_df = (
    shipment_df
    .loc[shipment_df['Year'] >= 2017]
    .assign(Year=lambda frame: pd.to_datetime(frame['Year'], format='%Y'))
    .set_index('Year')
)
# Tag the DatetimeIndex with a year-start ('YS') frequency.
shipment_df.index.freq = 'YS'

In [76]:
# Display the filtered shipment frame (2017 onwards, indexed by Year).
shipment_df

In [77]:
# commodity_mapping = {'5': 'Meat/seafood', '8': 'Alcoholic beverages',
#                      '9': 'Tobacco prods.', '21': 'Pharmaceuticals'}

In [78]:
# Chronological split: 2017-2021 for fitting, 2022 for validation, 2023 held out.
x_train = shipment_df.loc[slice('2017', '2021')].copy()
# List-based label selection keeps each single-year split a DataFrame.
x_valid, x_test = (shipment_df.loc[[year]].copy() for year in ('2022', '2023'))

In [79]:
# Display the 2017-2021 training frame.
x_train

In [80]:
# Only these three columns are handed to the scaling helper in this section.
features = ['MEHOINUSGAA672N', 'GARETAILNQGSP', 'Population']
# Training frame: pass a fresh StandardScaler with 'train'; the helper returns the scaled frame
# plus a scaler (presumably the fitted one — confirm in utils.feature_engineering).
x_train_scaled, fitted_scaler = apply_scaler(x_train, features, 'train', scaler=StandardScaler())
# Validation frame: pass the scaler returned above with mode='valid'; the second return value is discarded.
x_valid_scaled, _ = apply_scaler(x_valid, features, mode='valid', scaler=fitted_scaler)

In [81]:
# Target series: the value_5 column of the training and validation frames.
y_train, y_valid = (frame['value_5'] for frame in (x_train_scaled, x_valid_scaled))

In [82]:
# Un-lagged tonnage and value columns, one per commodity code (5, 8, 9, 21).
tons_columns = ['tons_5', 'tons_8', 'tons_9', 'tons_21']
value_columns = ['value_5', 'value_8', 'value_9', 'value_21']

# Remove those columns from both frames; what remains is passed to the model as exogenous regressors.
x_train_scaled, x_valid_scaled = (
    frame.drop(columns=tons_columns + value_columns)
    for frame in (x_train_scaled, x_valid_scaled)
)

In [84]:
# SARIMAX with non-seasonal order (1, 2, 3) on the target, using the remaining columns as exogenous regressors.
# NOTE(review): the training slice is 2017-2021 (at most five annual rows), which is very little
# data for this order plus regressors — confirm the fit is meaningful.
arima_order = (1, 2, 3)
model = SARIMAX(endog=y_train, exog=x_train_scaled, order=arima_order)
model_fit = model.fit()

# Predict over the span of the validation index, supplying the validation regressors,
# and label the result with the validation index.
start_index, end_index = y_valid.index[0], y_valid.index[-1]
predictions = model_fit.predict(
    start=start_index,
    end=end_index,
    exog=x_valid_scaled,
).set_axis(y_valid.index)

# Report the first predicted/actual pair and the error metrics.
print("Prediction on expected value", predictions.iloc[0])
print("Actual value", y_valid.iloc[0])
print("Mean square error:", mean_squared_error(y_valid, predictions))
print("Root mean square error:", root_mean_squared_error(y_valid, predictions))