# Stock Price Prediction Using RNNs — **Solutions**

This notebook contains **filled answers** for the starter assignment. 
We will:

1. Load and aggregate AMZN, GOOGL, IBM, MSFT data

2. Analyze and visualize closing prices and volumes

3. Process data into **sliding windows** suitable for RNNs

4. Build simple baseline models (a naive last-value predictor and linear regression on windows)

5. Provide an RNN (Keras LSTM) solution **template** you can run if TensorFlow is available

6. Evaluate with MAE/MAPE/RMSE and discuss results


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Default figure size applied to every plot in this notebook.
plt.rcParams['figure.figsize'] = (10,4)

# Directory that holds one CSV per ticker.
data_dir = Path('/mnt/data')

# Ticker -> CSV path; every file is named '<TICKER>_stocks_data.csv'.
files = {
    ticker: data_dir / f'{ticker}_stocks_data.csv'
    for ticker in ('AMZN', 'GOOGL', 'IBM', 'MSFT')
}

# Stop immediately if any expected input file is absent.
for k, v in files.items():
    assert v.exists(), f"Missing file for {k}: {v}"
print('Files found:', list(files))


## 1. Data Loading and Preparation

In [None]:
def load_one(symbol, path):
    """Read one ticker's CSV and return it in a fixed column layout.

    Parameters
    ----------
    symbol : str
        Ticker label written into the ``Symbol`` column of every row.
    path : path-like or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pd.DataFrame
        Rows sorted by ``Date`` (parsed to datetime) with columns
        ``Date, Symbol, Open, High, Low, Close, Adj Close, Volume``.
        Any of the six numeric columns absent from the file is added and
        filled with NaN.

    Raises
    ------
    ValueError
        If the file has no ``Date`` column.
    """
    frame = pd.read_csv(path)

    # Without dates the rows cannot be ordered, so reject the file outright.
    if 'Date' not in frame.columns:
        raise ValueError(f"'Date' column missing in {symbol}")

    frame['Date'] = pd.to_datetime(frame['Date'])
    frame = frame.sort_values('Date')

    # Guarantee a uniform schema across tickers; absent columns become NaN.
    numeric_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    absent = [col for col in numeric_cols if col not in frame.columns]
    for col in absent:
        frame[col] = np.nan

    frame['Symbol'] = symbol
    return frame[['Date', 'Symbol'] + numeric_cols]

# Load every ticker, then stack the per-symbol frames into one long table
# with a fresh 0..n-1 index.
dfs = [load_one(ticker, csv_path) for ticker, csv_path in files.items()]
raw = pd.concat(dfs, ignore_index=True).reset_index(drop=True)
raw.head()

### 1.1.2 Handle Missing Values

In [None]:
# Count NaNs per column before imputation so the effect of the fill is visible.
mv = raw.isna().sum()
print('Missing values before fill:\n', mv)

# Forward-fill, then back-fill, the numeric columns within each symbol so that
# values are never carried over from one ticker into another.
num_cols = ['Open','High','Low','Close','Adj Close','Volume']
# GroupBy.ffill / GroupBy.bfill return frames indexed like `raw`, so the
# assignment is aligned by row label and stays correct even if the rows of
# `raw` are not ordered by symbol.
raw[num_cols] = raw.groupby('Symbol')[num_cols].ffill()
raw[num_cols] = raw.groupby('Symbol')[num_cols].bfill()

# A column that is entirely NaN for a symbol has nothing to fill from and
# therefore still shows up in this second count.
mv_after = raw.isna().sum()
print('\nMissing values after fill:\n', mv_after)

## 1.2 Analysis and Visualisation

In [None]:
# One figure per (series, symbol): all closing-price charts first, then all
# volume charts. Each entry is (column to plot / y-axis label, title suffix).
for column, title_suffix in (('Close', 'Closing Price'), ('Volume', 'Volume')):
    for sym, g in raw.groupby('Symbol'):
        plt.figure()
        plt.plot(g['Date'], g[column])
        plt.title(f'{sym} {title_suffix}')
        plt.xlabel('Date')
        plt.ylabel(column)
        plt.show()

## 1.3 Data Processing

In [None]:
# We'll create supervised windows per symbol on Closing price only
def make_windows(series, window=20, horizon=1):
    """Turn a sequence into supervised (input window, target) pairs.

    Sample ``i`` uses ``series[i : i + window]`` as input and the following
    ``horizon`` values as target.

    Parameters
    ----------
    series : array-like
        Ordered observations; converted with ``np.asarray``.
    window : int
        Number of consecutive past values in each input row (>= 1).
    horizon : int
        Number of future values in each target (>= 1).

    Returns
    -------
    X : np.ndarray
        Shape ``(n_samples, window)`` for 1-D input.
    y : np.ndarray
        Shape ``(n_samples,)`` when ``horizon == 1``, otherwise
        ``(n_samples, horizon)``. The sample axis is always kept, including
        when only one window fits.

    When the series is too short for a single window, both arrays are
    returned with zero samples but the correct trailing shape.

    Raises
    ------
    ValueError
        If ``window`` or ``horizon`` is smaller than 1.
    """
    if window < 1 or horizon < 1:
        raise ValueError("window and horizon must both be >= 1")

    values = np.asarray(series)
    trailing = values.shape[1:]
    n_samples = len(values) - window - horizon + 1

    if n_samples <= 0:
        # Too short for even one window: keep well-formed (0, ...) shapes so
        # downstream slicing such as X[:, -1] still works.
        X = np.empty((0, window) + trailing, dtype=values.dtype)
        y = np.empty((0, horizon) + trailing, dtype=values.dtype)
    else:
        X = np.stack([values[i:i + window] for i in range(n_samples)])
        y = np.stack([values[i + window:i + window + horizon] for i in range(n_samples)])

    # Drop singleton axes from the target, but never the sample axis: an
    # unrestricted squeeze() would collapse a single-sample result to 0-d.
    squeeze_axes = tuple(ax for ax in range(1, y.ndim) if y.shape[ax] == 1)
    if squeeze_axes:
        y = y.squeeze(axis=squeeze_axes)
    return X, y

# Number of past closes per input row, and number of steps predicted ahead.
WINDOW, H = 20, 1

# Build the windowed dataset separately for every symbol.
data_by_sym = {}
for sym, g in raw.groupby('Symbol'):
    close = g['Close'].astype(float).to_numpy()
    X, y = make_windows(close, window=WINDOW, horizon=H)
    # 'dates' holds the symbol's full date column; it is not trimmed to match
    # the (shorter) window arrays.
    data_by_sym[sym] = dict(X=X, y=y, dates=g['Date'].values)

# Quick shape check: symbol -> (X shape, y shape).
{ticker: (entry['X'].shape, entry['y'].shape) for ticker, entry in data_by_sym.items()}

In [None]:
def train_val_test_split(X, y, train=0.7, val=0.15):
    """Split X and y into contiguous train / validation / test segments.

    The order of the samples is preserved (no shuffling): the first
    ``int(n * train)`` samples form the training set, the next
    ``int(n * val)`` the validation set, and whatever remains the test set.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``.
    """
    total = len(X)
    train_end = int(total * train)
    val_end = train_end + int(total * val)

    # Three back-to-back slices that together cover every sample once.
    segments = (
        slice(None, train_end),
        slice(train_end, val_end),
        slice(val_end, None),
    )
    return tuple((X[seg], y[seg]) for seg in segments)

# Order-preserving train / val / test split for every symbol.
splits = {}
for ticker, entry in data_by_sym.items():
    splits[ticker] = train_val_test_split(entry['X'], entry['y'])

# Shapes of the input arrays in (train, val, test) order.
{ticker: tuple(X_part.shape for X_part, _ in parts) for ticker, parts in splits.items()}

## 2. Baseline Models

In [None]:
from math import sqrt

def metrics(y_true, y_pred):
    """Compute MAE, RMSE and MAPE between targets and predictions.

    Both inputs are flattened to 1-D before comparison.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth values and the corresponding predictions.

    Returns
    -------
    dict
        ``{'MAE': ..., 'RMSE': ..., 'MAPE_%': ...}``; MAPE is expressed in
        percent.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    abs_err = np.abs(y_true - y_pred)

    mae = np.mean(abs_err)
    rmse = sqrt(np.mean((y_true - y_pred)**2))

    # Guard the MAPE denominator by magnitude. Adding the epsilon to the
    # signed value instead would hit an exact zero at y_true == -1e-8 and
    # shrink the denominator for every negative target.
    denom = np.maximum(np.abs(y_true), 1e-8)
    mape = np.mean(abs_err / denom) * 100
    return {'MAE': mae, 'RMSE': rmse, 'MAPE_%': mape}

def naive_last(X):
    """Persistence forecast: predict the final value of each input window.

    ``X`` is indexed as a 2-D array of shape (samples, window); the result
    has one prediction per row.
    """
    most_recent = X[:, -1]
    return most_recent

# Score the last-value baseline on each symbol's held-out test split.
results = {
    sym: metrics(yte, naive_last(Xte))
    for sym, (_train, _val, (Xte, yte)) in splits.items()
}

results

In [None]:
# Linear Regression (no external libs)
def fit_linreg(X, y):
    """Fit a least-squares linear model with an intercept.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix of shape (n_samples, n_features).
    y : array-like
        Targets, one per sample; reshaped to a column internally.

    Returns
    -------
    np.ndarray
        Coefficient column of shape (n_features + 1, 1); row 0 is the
        intercept, the remaining rows follow the column order of ``X``.
    """
    # add bias
    Xb = np.c_[np.ones((X.shape[0],1)), X]
    # Solve min ||Xb @ theta - y|| directly on the design matrix. Forming
    # Xb.T @ Xb (the normal equations) squares the condition number, which
    # loses the solution when columns are nearly collinear, as overlapping
    # lagged windows of one series are.
    theta, *_ = np.linalg.lstsq(Xb, np.asarray(y).reshape(-1, 1), rcond=None)
    return theta

def predict_linreg(theta, X):
    """Apply fitted linear-regression coefficients to new inputs.

    ``theta`` holds the intercept in its first row followed by one weight
    per column of ``X``. Returns a flat array with one prediction per row
    of ``X``.
    """
    n_rows = X.shape[0]
    intercept_col = np.ones((n_rows, 1))
    design = np.c_[intercept_col, X]
    predictions = design @ theta
    return predictions.ravel()

# Fit the linear model on each symbol's training windows and score it on the
# held-out test split (the validation split is not used here).
linreg_results = {}
for sym, (train_set, _val_set, test_set) in splits.items():
    Xtr, ytr = train_set
    Xte, yte = test_set
    theta = fit_linreg(Xtr, ytr)
    linreg_results[sym] = metrics(yte, predict_linreg(theta, Xte))

linreg_results

## 3. RNN Model (LSTM) — Template

In [None]:
# This block requires TensorFlow/Keras. If not installed, skip running it.
# The broad `except` at the bottom is deliberate: it lets the rest of the
# notebook run end to end when TensorFlow is missing or training fails.
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    lstm_histories = {}
    lstm_results = {}

    EPOCHS = 10
    BATCH = 32
    SEED = 42

    # Prefer the Keras helper that seeds Python, NumPy and the backend at
    # once; fall back to the TensorFlow-only seed on versions without it.
    seed_everything = getattr(keras.utils, 'set_random_seed', tf.random.set_seed)

    for sym, ((Xtr,ytr),(Xv,yv),(Xte,yte)) in splits.items():
        # Re-seed per symbol so every model starts from repeatable weights.
        seed_everything(SEED)

        # Standardize with statistics from the TRAIN split only (no leakage
        # from validation/test). Raw price levels would otherwise saturate
        # the LSTM's tanh/sigmoid gates.
        mu, sigma = Xtr.mean(), Xtr.std()
        if not sigma > 0:  # constant (or empty) training data
            sigma = 1.0

        # reshape to (samples, timesteps, features)
        Xtr_r = ((Xtr - mu) / sigma)[..., None]
        Xv_r  = ((Xv  - mu) / sigma)[..., None]
        Xte_r = ((Xte - mu) / sigma)[..., None]
        ytr_s = (ytr - mu) / sigma
        yv_s  = (yv  - mu) / sigma

        model = keras.Sequential([
            layers.Input(shape=(Xtr_r.shape[1], 1)),
            layers.LSTM(64, return_sequences=False),
            layers.Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])

        h = model.fit(Xtr_r, ytr_s, validation_data=(Xv_r, yv_s), epochs=EPOCHS, batch_size=BATCH, verbose=0)
        # Note: the recorded losses are in standardized units, not prices.
        lstm_histories[sym] = h.history

        # Map predictions back to price units so the metrics are comparable
        # with the baselines.
        yhat = model.predict(Xte_r, verbose=0).ravel() * sigma + mu
        lstm_results[sym] = metrics(yte, yhat)

    print('LSTM results:', lstm_results)
except Exception as e:
    print('TensorFlow not available or training failed:', e)

## 4. Evaluation Summary

In [None]:
import pandas as pd
# One row per symbol and one column per metric, for each baseline.
baseline_df = pd.DataFrame(results).transpose()
linreg_df = pd.DataFrame(linreg_results).transpose()

# Both frames share the same metric names, so tag each column with its model.
summary = baseline_df.join(
    linreg_df,
    lsuffix='_Naive',
    rsuffix='_LinReg',
)
summary