### Intial code
for reference purposes

In [3]:
# import pandas as pd
# import numpy as np
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import StandardScaler
# from statsmodels.tsa.statespace.sarimax import SARIMAX
# from statsmodels.tools.sm_exceptions import ConvergenceWarning
# import warnings

# # Ignore convergence warnings
# warnings.simplefilter("ignore")

# # Load dataset with parsed dates
# data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)

# # Set the index to the Date column
# data.set_index('Date', inplace=True)
# # data = data.asfreq('D')
# # Feature Engineering: Add day of week and month
# data['day_of_week'] = data.index.dayofweek
# data['month'] = data.index.month

# # Add lagged value of the Close price and moving averages
# data['lagged_close'] = data['Close'].shift(1)  
# data['moving_avg_3'] = data['Close'].rolling(window=3).mean()
# data['moving_avg_7'] = data['Close'].rolling(window=7).mean()  # New: 7-day moving average for long-term trend

# # Add Volume as a feature (scaling might help)
# data['volume'] = data['Volume']

# # Drop rows with NaN values
# data.dropna(inplace=True)

# # Standardize the features (important for scaling)
# scaler = StandardScaler()
# exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']
# data[exog_features] = scaler.fit_transform(data[exog_features])

# # Split the data into training and testing sets
# train_size = int(len(data) * 0.8)
# train, test = data.iloc[:train_size], data.iloc[train_size:]

# # Tune SARIMAX hyperparameters (ARIMA order (p, d, q))
# order = (2, 1, 2)  # Consider using AIC/BIC for finding optimal order
# seasonal_order = (1, 1, 1, 12)  # Adding seasonality with monthly frequency

# # Fit the SARIMAX model
# try:
#     model = SARIMAX(train['Close'], 
#                     exog=train[exog_features],
#                     order=order,
#                     seasonal_order=seasonal_order)
#     model_fit = model.fit(disp=False)
# except ConvergenceWarning as e:
#     print(f"Convergence warning: {e}")
# except Exception as e:
#     print(f"Error: {e}")

# # Forecasting
# forecast = model_fit.forecast(steps=len(test), exog=test[exog_features])

# # Calculate RMSE for forecast
# rmse_arimax = np.sqrt(mean_squared_error(test['Close'], forecast))
# print(f"Improved ARIMAX Model RMSE: {rmse_arimax}")

# test_prices = [i for i in test['Close']]
# # Check residuals diagnostics (optional)
# residuals = test_prices - forecast
# print("Mean of residuals:", residuals.mean())
# print("Standard deviation of residuals:", residuals.std())



### immporting necessary libraries

In [6]:
!pip install kaggle
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tools.sm_exceptions import ConvergenceWarning
import warnings



### Udating features to dataset for proper time-series analysis

In [8]:

# Ignore convergence warnings
warnings.simplefilter("ignore", ConvergenceWarning)

# Load training dataset with parsed dates
train_data = pd.read_csv('../Data/SBI Train data.csv', parse_dates=['Date'], dayfirst=True)

# Set the index to the Date column
train_data.index = pd.DatetimeIndex(train_data.index).to_period('M')

# Feature Engineering: Add day of week and month
train_data['day_of_week'] = train_data.index.dayofweek
train_data['month'] = train_data.index.month

# Add lagged value of the Close price and moving averages
train_data['lagged_close'] = train_data['Close'].shift(1)
train_data['moving_avg_3'] = train_data['Close'].rolling(window=3).mean()
train_data['moving_avg_7'] = train_data['Close'].rolling(window=7).mean()

# Add Volume as a feature (scaling might help)
train_data['volume'] = train_data['Volume']

# Drop rows with NaN values after applying the rolling window and lagging
train_data.dropna(inplace=True)

# Standardize the features
scaler = StandardScaler()
exog_features = ['day_of_week', 'month', 'lagged_close', 'moving_avg_3', 'moving_avg_7', 'volume']
train_data[exog_features] = scaler.fit_transform(train_data[exog_features])

# Split the data into training and testing sets
train_size = int(len(train_data) * 0.8)
train, validation = train_data.iloc[:train_size], train_data.iloc[train_size:]


### Training and savinng model

In [11]:
# Train the SARIMAX model
order = (2, 1, 2)
seasonal_order = (1, 1, 1, 12)

model = SARIMAX(train['Close'], exog=train[exog_features], order=order, seasonal_order=seasonal_order,enforce_invertibility=False)
model_fit = model.fit(disp=False)

#insert your folder name where you want the dataset to be downloaded instead of .kaggle
os.system('kaggle datasets download -d shristirwt/sarimax-model -p/.kaggle')
os.system('kaggle datasets download -d shristirwt/scaler-model -p/.kaggle')

# Save the model to a file using pickle
with open(r'C:\Users\SHRISTI\.kaggle\sarimax_model.pkl', 'wb') as f:
    pickle.dump(model_fit, f)

# Optionally save the scaler as well
with open(r'C:\Users\SHRISTI\.kaggle\scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Model and scaler saved successfully.")


Model and scaler saved successfully.


### Loading saved model

In [14]:
# Load the model and scaler from the files
with open(r'C:\Users\SHRISTI\.kaggle\sarimax_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

with open(r'C:\Users\SHRISTI\.kaggle\scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)


### Loading and processing Test data

In [17]:
# Load the test dataset
test_data = pd.read_csv('../Data/SBI Test data.csv', parse_dates=['Date'], dayfirst=True)

# Set the index to the Date column
test_data.set_index('Date', inplace=True)

# Apply the same feature engineering on the test data
test_data['day_of_week'] = test_data.index.dayofweek
test_data['month'] = test_data.index.month
test_data['lagged_close'] = test_data['Close'].shift(1)
test_data['moving_avg_3'] = test_data['Close'].rolling(window=3).mean()
test_data['moving_avg_7'] = test_data['Close'].rolling(window=7).mean()

# Add Volume as a feature
test_data['volume'] = test_data['Volume']

# Drop rows with NaN values
test_data.dropna(inplace=True)

# Standardize the features in the test dataset using the loaded scaler
test_data[exog_features] = loaded_scaler.transform(test_data[exog_features])

### Predicting share prices using model

In [20]:
# Forecasting on the test data using the loaded model
forecast_test = loaded_model.forecast(steps=len(test_data), exog=test_data[exog_features])

# Calculate RMSE for forecast
rmse_test = np.sqrt(mean_squared_error(test_data['Close'], forecast_test))
print(f"Test Data RMSE: {rmse_test}")

# Check residuals diagnostics (optional)
test_prices = test_data['Close'].values
residuals_test = test_prices - forecast_test
print("Mean of residuals:", residuals_test.mean())
print("Standard deviation of residuals:", residuals_test.std())

Test Data RMSE: 4.883649507349637
Mean of residuals: 0.06489726947015648
Standard deviation of residuals: 4.8849520783077764
