In [1]:
import os
from pathlib import Path
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt

from statsmodels.tsa.statespace.sarimax import SARIMAX

In [2]:
csv_file_path = '/content/bit_sen.csv'

# Extract base name without extension for naming
csv_base = Path(csv_file_path).stem

# Define paths for saving the best model and its metrics
model_history_dir = Path('model_history')
result_history_dir = Path('result_history')

os.makedirs(model_history_dir, exist_ok=True)
os.makedirs(result_history_dir, exist_ok=True)

best_model_filename = f"{csv_base}_best_model.pt"
best_model_path = model_history_dir / best_model_filename
metrics_path = result_history_dir / f"{csv_base}_metrics.txt"

forecast_history = 24  # Number of past time steps to use for input sequences.
forecast_horizon = 1  # Number of future time steps to predict (e.g., 0 means predicting the next step immediately).

batch_size = 64  # Number of samples in each mini-batch
max_iterations = 100
model_name = 'Arima'
order=(5,1,0)
best_mse = float('inf')
include_baseline = True

In [3]:
def log_print(*args, **kwargs):
    with open(metrics_path, 'a') as metrics_file:
        msg = ' '.join(map(str, args))
        print(msg)
        metrics_file.write(msg + '\n')

Pereprocess

In [4]:
# Load Data
try:
  df_raw = pd.read_csv(csv_file_path)
  print("CSV file loaded successfully.")
except Exception as e:
  print(f"Error reading CSV file: {e}")

df_raw.head(10)

CSV file loaded successfully.


Unnamed: 0,positive,negetive,neutral,Date,Price,Open,High,Low,Vol.,Change %
0,0.116094,0.789252,0.094654,2015-01-01,313.9,317.5,317.5,312.9,0.93K,-0.99%
1,0.113846,0.795338,0.090816,2015-01-02,315.1,313.6,316.0,313.0,1.32K,0.40%
2,0.114822,0.79515,0.090027,2015-01-03,282.0,314.8,314.8,282.0,1.30K,-10.51%
3,0.114798,0.790363,0.094839,2015-01-04,258.8,282.0,288.9,258.8,2.38K,-8.23%
4,0.117028,0.790906,0.092066,2015-01-05,273.2,261.0,276.5,260.5,2.68K,5.57%
5,0.113413,0.791942,0.094645,2015-01-06,285.4,273.2,287.8,268.5,6.86K,4.46%
6,0.117202,0.788809,0.093989,2015-01-07,297.0,286.1,303.8,285.0,1.53K,4.07%
7,0.11449,0.784668,0.100843,2015-01-08,285.6,296.2,296.2,285.6,1.31K,-3.82%
8,0.111742,0.798039,0.090218,2015-01-09,292.8,285.4,295.6,283.0,1.26K,2.52%
9,0.113061,0.791843,0.095096,2015-01-10,275.0,292.4,292.4,271.0,1.11K,-6.08%


In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1786 entries, 0 to 1785
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   positive  1786 non-null   float64
 1   negetive  1786 non-null   float64
 2   neutral   1786 non-null   float64
 3   Date      1786 non-null   object 
 4   Price     1786 non-null   float64
 5   Open      1786 non-null   float64
 6   High      1786 non-null   float64
 7   Low       1786 non-null   float64
 8   Vol.      1776 non-null   object 
 9   Change %  1786 non-null   object 
dtypes: float64(7), object(3)
memory usage: 139.7+ KB


In [6]:
# Preprocess data
datetime_col = "Date"
freq = '1D'
columns = ['Price']
method = 'ffill'
value = 0


# Define a data preprocessing pipeline
preproc_pipe = sklearn.pipeline.Pipeline([
    # Step 1: Drop duplicate rows based on the datetime column
    ('drop_duplicates', sklearn.preprocessing.FunctionTransformer(lambda df: df.drop_duplicates(subset=datetime_col))),

    # Step 2: Fill missing values in the dataset
    ('fill_missing', sklearn.preprocessing.FunctionTransformer(
        lambda df: df.fillna(method=method).fillna(value=value)  # Fill missing values using a specified method and value
    ))
])

# Apply the pipeline to the raw data
df = preproc_pipe.fit_transform(df_raw)

# Convert the datetime column to a standard datetime format
df[datetime_col] = pd.to_datetime(df[datetime_col])

# Sort the data by the datetime column in ascending order
df = df.sort_values(by=datetime_col)

  lambda df: df.fillna(method=method).fillna(value=value)  # Fill missing values using a specified method and value


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1786 entries, 0 to 1785
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   positive  1786 non-null   float64       
 1   negetive  1786 non-null   float64       
 2   neutral   1786 non-null   float64       
 3   Date      1786 non-null   datetime64[ns]
 4   Price     1786 non-null   float64       
 5   Open      1786 non-null   float64       
 6   High      1786 non-null   float64       
 7   Low       1786 non-null   float64       
 8   Vol.      1786 non-null   object        
 9   Change %  1786 non-null   object        
dtypes: datetime64[ns](1), float64(7), object(2)
memory usage: 139.7+ KB


In [8]:
# Define the proportions for test and validation sets
test_size = 0.05  # Fraction of the data to be used as the test set
valid_size = 0.05  # Fraction of the data to be used as the validation set

# Calculate the total number of samples in the dataset
total_size = len(df)  # The total number of rows in the DataFrame

# Calculate the number of samples for the test set
test_split = int(total_size * test_size)  # Convert the test size fraction to the actual count

# Calculate the number of samples for the validation set
valid_split = int(total_size * valid_size)  # Convert the validation size fraction to the actual count

# Calculate the end index of the training set (exclusive)
train_end = total_size - test_split - valid_split  # Determine where the training set ends

# Calculate the end index of the validation set (exclusive)
valid_end = total_size - test_split  # Determine where the validation set ends

# Split the data into training, validation, and test sets using indices
train_df = df.iloc[:train_end]  # Select rows for the training set
valid_df = df.iloc[train_end:valid_end]  # Select rows for the validation set
test_df = df.iloc[valid_end:]  # Select rows for the test set

# Print the sizes of each split for verification
print(f"Data split into train ({len(train_df)}), validation ({len(valid_df)}), and test ({len(test_df)}) sets.")


Data split into train (1608), validation (89), and test (89) sets.


In [9]:
def concat(df):
  '''
    Returns numpy.ndarray: A concatenated array where:
    First columns contain sentiment values (in order: positive, negative, neutral)
    Remaining columns contain scaled price values (order determined by scale_columns)
  '''
  sent = df[['positive', 'negetive', 'neutral']].to_numpy()
  price = scaler.transform(df[scale_columns])

  return np.concat([sent,price], axis=1)

In [10]:
# Define the columns to be scaled
scale_columns = ['Price']  # Specify the column(s) to scale, in this case 'Price'

# Initialize the MinMaxScaler with a feature range of 0 to 1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler using the training data
scaler.fit(train_df[scale_columns])  # Calculate the min and max values from the training set for scaling

# Transform the training data using the fitted scaler
train_scaled = concat(train_df)  # Scale the 'Price' column in the training set

# Transform the validation data using the same scaler
valid_scaled = concat(valid_df)  # Apply the same scaling to the validation set

# Transform the test data using the same scaler
test_scaled = concat(test_df)  # Apply the same scaling to the test set

# Print a message indicating that the scaling is complete
print("Data scaling completed.")


Data scaling completed.


###Train

In [None]:
log_print(f"Model Name: {model_name}")
log_print(f"Sequence Length (Forecast History): {forecast_history}")
log_print(f"Forecast Horizon: {forecast_horizon}")
log_print(f"Training Started at: {pd.Timestamp.now()}")
for i in range(max_iterations):
            log_print(f"\nStarting training iteration {i + 1}/{max_iterations}")

            try:
                model = SARIMAX(train_scaled[:,-1],
                                exog = train_scaled[:,:-1],
                                order=order,
                                enforce_stationarity=False,
                                enforce_invertibility=False)
                model_fit = model.fit(disp=False)
                log_print(f"ARIMA model fitted: order={order}")
            except Exception as e:
                log_print(f"Error fitting ARIMA model: {e}")
                continue


            # Validate the model
            try:
                # Forecast validation period
                preds_valid_scaled = model_fit.predict(start=train_end, end=valid_end -1)
                preds_valid_scaled = preds_valid_scaled.reshape(-1, 1)
                preds_valid = scaler.inverse_transform(preds_valid_scaled)
                actual_valid = scaler.inverse_transform(valid_scaled)

                # Calculate validation MSE
                val_mse = mean_squared_error(actual_valid, preds_valid)
                log_print(f"Iteration {i + 1}: Validation MSE: {val_mse:.6f}")

                # Check if this is the best model so far
                if val_mse < best_mse:
                    best_mse = val_mse
                    # Save the model
                    with open(best_model_path, 'wb') as f:
                        pickle.dump(model_fit, f)
                    log_print(f"New best model saved at '{best_model_path}' with Validation MSE {best_mse:.6f}")
            except Exception as e:
                log_print(f"Error during validation prediction: {e}")

 # After training iterations, log the best model info
log_print(f"\nTraining completed. Best Validation MSE: {best_mse:.6f}")
log_print(f"Best model saved at: '{best_model_path}'")



log_print(f"Metrics saved at '{metrics_path}'")

###Evalute

In [12]:
with open(best_model_path, 'rb') as f:
    model_fit = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'model_history/bit_sen_best_model.pt'

In [14]:
# Forecast test period
preds_test_scaled = model_fit.predict(
    start=len(train_scaled),  # Start after training data
    end=len(train_scaled)+len(test_scaled[:,:-1])-1,
    exog=test_scaled[:,:-1]
)
preds_test_scaled = preds_test_scaled.reshape(-1, 1)
preds_test = scaler.inverse_transform(preds_test_scaled)
actual_test = scaler.inverse_transform(test_scaled[:,-1].reshape(-1,1))

# Calculate metrics
test_mse_model = mean_squared_error(actual_test, preds_test)
test_mae_model = mean_absolute_error(actual_test, preds_test)
test_rmse_model = np.sqrt(test_mse_model)

# Baseline Predictions: Last value from training set
last_known = scaler.inverse_transform(train_scaled[-1].reshape(1, -1))
baseline_predictions = np.array([last_known.flatten()[0]] * len(test_df)).reshape(-1, 1)

test_mse_baseline = mean_squared_error(actual_test, baseline_predictions)
test_mae_baseline = mean_absolute_error(actual_test, baseline_predictions)
test_rmse_baseline = np.sqrt(test_mse_baseline)

# Log Test Metrics
log_print("\nTest Metrics:")
log_print("-------------")
if include_baseline:
    log_print("Baseline Model:")
    log_print(f"  MAE  : {test_mae_baseline:.6f}")
    log_print(f"  MSE  : {test_mse_baseline:.6f}")
    log_print(f"  RMSE : {test_rmse_baseline:.6f}\n")
log_print(f"{model_name} Model:")
log_print(f"  MAE  : {test_mae_model:.6f}")
log_print(f"  MSE  : {test_mse_model:.6f}")
log_print(f"  RMSE : {test_rmse_model:.6f}\n")

if include_baseline:
    # Compare model and baseline in terms of MSE
    mse_diff = test_mse_model - test_mse_baseline
    mse_percentage = (mse_diff / test_mse_baseline) * 100

    if mse_percentage > 0:
        log_print(f"Model MSE is worse by {mse_percentage:.2f}% compared to the baseline.")
    else:
        log_print(f"Model MSE is better by {abs(mse_percentage):.2f}% compared to the baseline.")




Test Metrics:
-------------
Baseline Model:
  MAE  : 4935.654000
  MSE  : 25210726.579667
  RMSE : 5021.028438

Arima Model:
  MAE  : 817.259873
  MSE  : 986773.789911
  RMSE : 993.364883

Model MSE is better by 96.09% compared to the baseline.


###Figure

In [None]:
indices = np.arange(len(actual_test))

# Create the plot
plt.figure(figsize=(14, 7))

# Plot Actual Prices
plt.plot(indices, actual_test, label='Actual Price', marker='o', linestyle='-', color='blue')

# Plot Predicted Prices from ARIMA Model
plt.plot(indices, preds_test, label=f'{model_name} Predictions', marker='x', linestyle='--', color='red')

if include_baseline:
    # Plot Baseline Predictions
    plt.plot(indices, baseline_predictions, label='Baseline Prediction', marker='s', linestyle=':', color='green')

# Enhancements
plt.xlabel('Sample Index', fontsize=14)
plt.ylabel('Price', fontsize=14)
plt.title(f'Actual vs. Predicted{" vs. Baseline" if include_baseline else ""} Prices (Test Set)',
          fontsize=16)
plt.legend(fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()
# Save the plot with forecast_history in the filename
plot_filename = f"{csv_base}_{forecast_history}_{forecast_horizon}_test_plot.png"
plot_path = result_history_dir / plot_filename
plt.savefig(plot_path)
plt.close()
log_print(f"Plot saved to '{plot_path}'")




log_print(f"\nEvaluation completed for '{csv_base}' with model '{model_name}' and forecast_history={forecast_history}.")
log_print(f"Metrics and plot appended to '{metrics_path}' and saved plot at '{plot_path}'")