# Time Series Analysis

# Table of Contents

1. Requirements and imports
2. Database Connection and Setup
3. Retrieve Data from MySQL
4. Sentiment Analysis (for all companies)
5. Retrieve Stockprice and left join with Sentiment (for AAPL)
6. Descriptive Statistics (for AAPL)
7. ARIMA Model (for all companies)
8. Neural Network Model (for AAPL)
9. LSTM (for all Companies)

# Requirements and imports

In [None]:
# !pip install -r requirements.txt

In [None]:
# imports
import datetime

import itertools

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
import keras_tuner as kt

import matplotlib.pyplot as plt

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import numpy as np

import os
import pandas as pd

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import pymysql 

import seaborn as sns

from skimpy import skim

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sqlalchemy import create_engine 

from statsmodels.tsa.arima.model import ARIMA

from summarytools import dfSummary

import warnings

In [None]:
# Suppress all warnings for the rest of the notebook so that library warnings do
# not clutter the cell outputs.
# NOTE(review): this is a blanket filter, so warnings raised while fitting the
# models further down are hidden too; consider narrowing it by category/module.
warnings.filterwarnings('ignore')

# Database Connection and Setup

### 1. MySQL Connection

In [None]:
# Read the MySQL connection settings from environment variables.
# Fail early with a readable message when a required variable is missing; before,
# an unset MYSQLPORT surfaced as an opaque TypeError raised by int(None).
_missing_mysql_vars = [
    var_name
    for var_name in ('MYSQLHOST', 'MYSQLUSR', 'MYSQLDB')
    if os.getenv(var_name) is None
]
if _missing_mysql_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_mysql_vars)
    )

hostname = os.getenv('MYSQLHOST')
# Fall back to the standard MySQL port when MYSQLPORT is unset or empty
port = int(os.getenv('MYSQLPORT') or 3306)
username = os.getenv('MYSQLUSR')
password = os.getenv('MYSQLPASS')
database_name = os.getenv('MYSQLDB')
# CA certificate used to verify the server when connecting over TLS
ca_cert_path = '../ca.pem'

# Create MySQL connection object
connection = pymysql.connect(
    host=hostname,
    port=port,
    user=username,
    password=password,
    database=database_name,
    ssl={'ca': ca_cert_path}
)

In [None]:
# Create the SQLAlchemy engine for MySQL.
# URL.create() escapes special characters in the credentials (a password that
# contains '@', ':' or '/' corrupts a hand-formatted URL), and connect_args hands
# pymysql the same CA certificate as the raw connection so both use TLS.
from sqlalchemy.engine import URL

engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=username,
        password=password,
        host=hostname,
        port=port,
        database=database_name,
    ),
    connect_args={"ssl": {"ca": ca_cert_path}},
)

### 2. MongoDB Connection

In [None]:
# Read the MongoDB connection string and database name from environment variables
MONGOURI = os.getenv('MONGOURI')
MONGODB = os.getenv('MONGODB')

# Open the client, pinning the server API version to '1'
client = MongoClient(MONGOURI, server_api=ServerApi('1'))

# Select database by name.
# If the database does not exist, a new one will be created.
db = client[MONGODB]  

## MongoDB collections setup

In [None]:


# Collections used by this notebook (selected, or created if they do not exist)
arima_coll = db["arima"]
lstm_coll = db["lstm"]
sentiment_coll = db["sentiment"]

# Remove every document from the ARIMA and LSTM forecast collections
res = arima_coll.delete_many({})
rest = lstm_coll.delete_many({})

# Remove every document from the sentiment collection as well.
# Clearing the collections on each run keeps retrieval simple later on: only the
# results of the latest run are stored. Without clearing, a timestamp would be
# needed to pick out the observations of a specific run.
result = sentiment_coll.delete_many({})

# Retrieve Data from MySQL

## Company Tickers

In [None]:
# Fetch the ticker symbol of every company in the Company table
query = """
SELECT 
    ticker
FROM 
    Company;
"""

company_df = pd.read_sql(query, engine)

# Plain Python list of tickers; the per-company model loops iterate over it
tickers = company_df['ticker'].tolist()

# Stocktweet

In [None]:
# Load the complete Stocktweet table (all columns, all rows)

query = """
SELECT * FROM Stocktweet;
"""

# Fetch data into a pandas DataFrame
stocktweet_df = pd.read_sql(query, engine)

In [None]:
stocktweet_df.head()

# Sentiment Analysis

In [None]:
# Download the VADER lexicon resource through NLTK
# (presumably needed by SentimentIntensityAnalyzer in the next cell — TODO confirm)
nltk.download('vader_lexicon')

In [None]:
# Initialize the VADER Sentiment Intensity Analyzer.
# `sia` is a notebook-level object that calculateSentiment() reads.
sia = SentimentIntensityAnalyzer()


In [None]:
def calculateSentiment(row):
    """Return the compound polarity score of the text stored under row['tweet'].

    The text is scored with the notebook-level `sia` analyzer; only the
    'compound' entry of the returned score dictionary is kept.
    """
    tweet_text = row['tweet']
    return sia.polarity_scores(tweet_text)['compound']

# Score every tweet (row-wise apply) and store the compound score in a new column
stocktweet_df['sentiment_score'] = stocktweet_df.apply(calculateSentiment, axis=1)

# Preview the scored tweets
stocktweet_df.head()


In [None]:
# Average the tweet-level sentiment scores into one value per (date, ticker) pair;
# as_index=False keeps 'date' and 'ticker' as ordinary columns for the later merge
stocktweet_grouped_df = stocktweet_df.groupby(['date', 'ticker'], as_index=False)['sentiment_score'].mean()
stocktweet_grouped_df.head()

In [None]:
stocktweet_grouped_df.dtypes

In [None]:
# Parse the tweet dates, which are given as day/month/year, into datetime values
stocktweet_grouped_df['date'] = pd.to_datetime(stocktweet_grouped_df['date'], format='%d/%m/%Y')

In [None]:
stocktweet_grouped_df.dtypes

In [None]:
stocktweet_grouped_df.head()

## Save Sentiment scores into MongoDB

In [None]:
# Save the daily per-ticker sentiment scores into MongoDB

# Convert the DataFrame to a list of dictionaries, one per row
stocktweet_grouped_dict = stocktweet_grouped_df.to_dict(orient='records')

# Insert all documents into the sentiment collection in a single call
res = sentiment_coll.insert_many(stocktweet_grouped_dict)

# Retrieve Stockprice and left join with Sentiment for AAPL

In [None]:
# Retrieve the stock price rows (OHLC, adjusted close, volume) for AAPL from MySQL
query = """
SELECT 
    Stockprice.ticker,
    Stockprice.Date,
    Stockprice.Open,
    Stockprice.High,
    Stockprice.Low,
    Stockprice.Close,
    Stockprice.AdjClose,
    Stockprice.Volume
    
FROM 
    Stockprice
WHERE
    Stockprice.ticker = 'AAPL'
"""

# Fetch data into a pandas DataFrame
stockprice_df = pd.read_sql(query, engine)
stockprice_df.head()

In [None]:
stockprice_df.dtypes

In [None]:
# Parse the price dates (year-month-day) into datetime values, then re-check dtypes
stockprice_df['Date'] = pd.to_datetime(stockprice_df['Date'], format='%Y-%m-%d')
stockprice_df.dtypes

In [None]:
stockprice_df.head()

# Apache Spark Preprocessing

In [None]:
# The functions module is imported under the alias F instead of importing
# `min`/`max` by name: those names would replace Python's builtins for every
# later cell in the notebook (e.g. any cell calling max() on a list).
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

# Start a Spark session
spark = SparkSession.builder.appName("TimeSeriesPreprocessing").getOrCreate()

try:
    # Convert Pandas DataFrame to PySpark DataFrame
    sdf = spark.createDataFrame(stockprice_df)
    sdf = sdf.withColumn("Date", F.col("Date").cast("date"))

    # 1. Daily aggregates of the Close price.
    #    The price frame has no "value" column (its columns are ticker, Date, Open,
    #    High, Low, Close, AdjClose, Volume), so Close is aggregated instead.
    daily_data = sdf.groupBy(F.date_trunc("day", F.col("Date")).alias("date")).agg(
        F.avg("Close").alias("daily_mean"),
        F.min("Close").alias("daily_min"),
        F.max("Close").alias("daily_max"),
    )

    # 2. Fill missing daily means with the previous day's mean.
    #    Spark columns have no fillna(); coalesce() returns the first non-null value.
    window_spec = Window.orderBy("date")
    daily_data = daily_data.withColumn(
        "daily_mean_filled",
        F.coalesce(F.col("daily_mean"), F.lag("daily_mean", 1).over(window_spec)),
    )

    # 3. Show the preprocessed data in chronological order
    daily_data.orderBy("date").show()
finally:
    # Release the Spark session even when a step above fails
    spark.stop()

In [None]:
# Rename 'Date' to 'date' so the price frame uses the same key name as
# stocktweet_grouped_df, which the left join is performed on
stockprice_df.rename(columns={'Date': 'date'}, inplace=True)

In [None]:
# Left join on both 'ticker' and 'date': every price row is kept, and
# sentiment_score is NaN on rows that have no matching sentiment entry
stockprice_merged_df = pd.merge(stockprice_df, stocktweet_grouped_df, on=['ticker', 'date'], how='left')
stockprice_merged_df.head()

In [None]:
# Replace missing values with 0, so rows without a sentiment match get a score of 0.
# NOTE(review): fillna(0) applies to every column, so a missing price would also
# become 0 — confirm the price columns never contain NaN.
stockprice_merged_df.fillna(0, inplace=True)
stockprice_merged_df.head()

In [None]:
# Drop the 'ticker' column (the query above selected AAPL rows only)
stockprice_merged_df = stockprice_merged_df.drop(columns=['ticker'])

In [None]:
# Use the date as the index of the merged frame.
# This mutates the frame in place: re-running this cell without re-running the
# merge raises a KeyError because 'date' is no longer a column.
stockprice_merged_df.set_index('date', inplace=True)
stockprice_merged_df.head()

# Descriptive Statistics for AAPL

In [None]:
# Short alias for the merged frame (same object, not a copy)
df = stockprice_merged_df

In [None]:
df.shape

In [None]:
df.describe(include="all")

In [None]:
skim(df)

In [None]:
dfSummary(df)

# Correlation between Close Price and Sentiment Score for AAPL

In [None]:
# Scatter plot of the sentiment score (y) against the Close price (x)
sns.scatterplot(data=df, x="Close", y="sentiment_score")

# ARIMA Model (all companies)

For each ticker, this section calculates the 1-day, 3-day and 7-day forecasts and saves them to MongoDB.

The saved data will be used by the Dashboard notebook.

In [None]:
# Function to evaluate ARIMA model with given order
def evaluate_arima_model(order, train_data=None, test_data=None):
    """Fit ARIMA(order) on the training series and return its MSE on the test series.

    Parameters
    ----------
    order : tuple
        The (p, d, q) order handed to ARIMA.
    train_data, test_data : optional
        Series to fit on and to score against. When omitted, the notebook-level
        `train` and `test` variables are used, which is how the tuning loop
        calls this function.

    Returns
    -------
    float
        Mean squared error of a len(test)-step forecast, or float("inf") when
        the model cannot be fitted or scored, so a failing order never wins the
        search.
    """
    try:
        fit_series = train if train_data is None else train_data
        eval_series = test if test_data is None else test_data
        model_fit = ARIMA(fit_series, order=order).fit()
        predictions = model_fit.forecast(steps=len(eval_series))
        return mean_squared_error(eval_series, predictions)
    except Exception:
        # Deliberately best-effort: many (p, d, q) combinations fail to fit.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so the long grid search can still be interrupted.
        return float("inf")

# ----------------------------------------------------------------------
# Hyperparameter grid
# ----------------------------------------------------------------------
# The candidate (p, d, q) orders are identical for every ticker, so they are
# built once instead of on every loop iteration.
p_values = range(0, 5)
d_values = range(0, 5)
q_values = range(0, 5)
pdq_combinations = list(itertools.product(p_values, d_values, q_values))

for ticker in tickers:
    # Retrieve stockprice for a Company from MySQL
    query = """
        SELECT 
            Stockprice.Date,
            Stockprice.Close
        FROM 
            Stockprice
        WHERE
            Stockprice.ticker = '{}';
    """.format(ticker)

    # Fetch data into a pandas DataFrame
    stockprice_df = pd.read_sql(query, engine)
    stockprice_df.set_index('Date', inplace=True)
    print(ticker)

    # Chronological train-test split (first 80% / last 20%).
    # `train` and `test` are notebook-level names read by evaluate_arima_model().
    train_size = int(len(stockprice_df) * 0.8)
    train, test = stockprice_df[:train_size], stockprice_df[train_size:]

    # Hyperparameter tuning: keep the order with the lowest test MSE
    best_score, best_order = float("inf"), None

    for order in pdq_combinations:
        error = evaluate_arima_model(order)
        if error < best_score:
            best_score, best_order = error, order
        print(f"ARIMA{order} MSE={error:.3f}")

    # Every candidate failed (e.g. no price rows for this ticker): nothing to store
    if best_order is None:
        print(f"No ARIMA order could be fitted for {ticker}; skipping")
        continue

    print(f"Best ARIMA{best_order} MSE={best_score:.3f}")

    # Fit and forecast using the best order
    # NOTE(review): the model is fitted on `train` only, so the horizons below are
    # counted from the end of the training window, not from the latest price.
    model = ARIMA(train, order=best_order)
    model_fit = model.fit()

    # One 7-step forecast contains the 1-, 3- and 7-day-ahead values
    horizon_forecast = model_fit.forecast(steps=7)
    forecast_1d_value = float(horizon_forecast.iloc[0])
    forecast_3d_value = float(horizon_forecast.iloc[2])
    forecast_7d_value = float(horizon_forecast.iloc[6])

    print("FORECAST 1D ")
    print(forecast_1d_value)
    print("FORECASTS 3D ")
    print(forecast_3d_value)
    print("FORECASTS 7D ")
    print(forecast_7d_value)

    # Store the value at each horizon. Previously "3D" and "7D" stored the first
    # forecast step, i.e. the same value as "1D".
    doc = {"ticker": ticker,
           "1D": forecast_1d_value,
           "3D": forecast_3d_value,
           "7D": forecast_7d_value}

    # Insert forecast into MongoDB Collection
    result = arima_coll.insert_one(doc)

    # Evaluate the model on test data
    forecast_values = model_fit.forecast(steps=len(test))
    mse = mean_squared_error(test, forecast_values)
    print(f"Mean Squared Error: {mse}")

    # Plot the full Close series with the forecast over the test window
    plt.figure(figsize=(10, 6))
    plt.plot(stockprice_df.index, stockprice_df['Close'], label='Close price')
    plt.plot(test.index, forecast_values, label='Forecasted Close price', color='red')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.title('Close price')
    plt.legend()
    plt.show()

# Neural Network Model (for AAPL)

In [None]:
stockprice_merged_df.head()

In [None]:
df = stockprice_merged_df

# Close price only, shaped as a single-feature column vector
data = df['Close'].values.reshape(-1, 1)

# Normalize the data to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)

# Build sliding windows: each sample holds the previous `time_step` scaled
# values and its target is the value that immediately follows the window
time_step = 60  # Use past 60 days to predict the next value
window_ends = range(time_step, len(scaled_data))

X = np.array([scaled_data[end - time_step:end, 0] for end in window_ends])
y = np.array([scaled_data[end, 0] for end in window_ends])

# LSTM layers expect input shaped [samples, time_steps, features]
X = np.reshape(X, (X.shape[0], X.shape[1], 1))

# Chronological split (no shuffling): the last 20% of the windows form the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Build the LSTM model: two stacked LSTM layers followed by two dense layers
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    LSTM(units=50, return_sequences=False),
    Dense(units=25),
    Dense(units=1),
])

# Compile and fit the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, batch_size=64, epochs=50)

# Predict on the test windows and map the result back to the price scale
predictions = scaler.inverse_transform(model.predict(X_test))




## LSTM Model Evaluation and Plot

In [None]:
# Bring the test targets back to the original price scale
y_test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))

# Evaluate the model performance using RMSE
mse_value = mean_squared_error(y_test_actual, predictions)
rmse = np.sqrt(mse_value)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Plot the actual vs predicted stock prices on a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test_actual, label='Actual Stock Price')
ax.plot(predictions, label='Predicted Stock Price')
ax.set_title('Stock Price Prediction - Actual vs Predicted')
ax.set_xlabel('Time')
ax.set_ylabel('Stock Price')
ax.legend()
plt.show()

# LSTM on all features (all Companies)

In [None]:
def create_dataset(data, time_step=60, forecast_days=(1, 3, 7), target_col=3):
    """Build sliding-window samples with one target per forecast horizon.

    Parameters
    ----------
    data : 2-D array, rows are consecutive observations and columns are features.
    time_step : number of past rows that make up one input sample.
    forecast_days : offsets (in rows, counted from index i) of the target values.
    target_col : column of `data` that holds the value to predict.

    Returns
    -------
    (X, y) : X has shape [samples, time_step, features]; y has shape
    [samples, len(forecast_days)].
    """
    # np.max rather than the builtin max: the builtin name can be shadowed by a
    # same-named function imported elsewhere in the notebook.
    longest_horizon = int(np.max(forecast_days))
    X, y = [], []
    for i in range(time_step, len(data) - longest_horizon):
        X.append(data[i - time_step:i, :])  # features of the previous `time_step` rows
        y.append([data[i + forecast_day, target_col] for forecast_day in forecast_days])
    return np.array(X), np.array(y)


def inverse_transform_column(scaler, scaled_values, column_index, n_features):
    """Map scaled values of ONE feature back to that feature's original scale.

    The scaler was fitted on `n_features` columns, so the values are written into
    column `column_index` of a zero-padded array, inverse-transformed, and the
    same column is read back.
    """
    padded = np.zeros((len(scaled_values), n_features))
    padded[:, column_index] = scaled_values
    return scaler.inverse_transform(padded)[:, column_index]


# NOTE(review): only the first ticker is processed although the section title says
# "all Companies" — confirm whether the [:1] slice is a leftover from development.
for ticker in tickers[:1]:
    # Retrieve stockprice for a Company from MySQL
    query = """
    SELECT 
        Stockprice.ticker,
        Stockprice.Date,
        Stockprice.Open,
        Stockprice.High,
        Stockprice.Low,
        Stockprice.Close,
        Stockprice.AdjClose,
        Stockprice.Volume
    FROM 
        Stockprice
    WHERE
        Stockprice.ticker = '{}'
    """.format(ticker)
    print(ticker)

    # Fetch data into a pandas DataFrame
    stockprice_df = pd.read_sql(query, engine)

    stockprice_df['Date'] = pd.to_datetime(stockprice_df['Date'], format='%Y-%m-%d')

    # Rename column Date to perform left join
    stockprice_df.rename(columns={'Date': 'date'}, inplace=True)

    # Performing a left join on both 'ticker' and 'date'
    stockprice_merged_df = pd.merge(stockprice_df, stocktweet_grouped_df, on=['ticker', 'date'], how='left')

    # Rows without a sentiment match get a sentiment score of 0
    stockprice_merged_df.fillna(0, inplace=True)

    stockprice_merged_df = stockprice_merged_df.drop(columns=['ticker'])

    stockprice_merged_df.set_index('date', inplace=True)

    # Selecting the features
    data = stockprice_merged_df[['Open', 'High', 'Low', 'Close', 'AdjClose', 'Volume', 'sentiment_score']]
    close_idx = data.columns.get_loc('Close')  # position of the prediction target

    # Scale the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data)
    n_features = scaled_data.shape[1]

    # Create the dataset
    time_step = 60  # Look back 60 days
    forecast_days = [1, 3, 7]  # Predict 1-day, 3-days, and 7-days ahead for Close price
    X, y = create_dataset(scaled_data, time_step, forecast_days, target_col=close_idx)

    # Split the dataset into training and testing sets (80% training, 20% testing).
    # X already has the [samples, time_steps, features] shape that LSTM layers expect.
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Step 1: Define a function to build the LSTM model for hyperparameter tuning
    def build_model(hp):
        """Build a single-LSTM-layer model; tunes the unit count and the optimizer."""
        model = Sequential()

        # Hyperparameter: number of LSTM units
        model.add(LSTM(units=hp.Int('units', min_value=50, max_value=200, step=60),
                       return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])))

        # Output layer: one value per forecast horizon
        model.add(Dense(units=len(forecast_days)))

        # Hyperparameter: which optimizer to use
        model.compile(optimizer=hp.Choice('optimizer', values=['adam', 'rmsprop']),
                      loss='mean_squared_error')

        return model

    # Step 2: Use Keras Tuner to find the best hyperparameters.
    # The project name is per ticker: with one shared project, the tuner would
    # resume the previous ticker's finished search and reuse its results.
    tuner = kt.Hyperband(build_model,
                         objective='val_loss',
                         max_epochs=10,
                         hyperband_iterations=2,
                         directory='my_dir',
                         project_name=f'lstm_tuning_{ticker}')

    # Step 3: Perform the hyperparameter search
    tuner.search(X_train, y_train, epochs=10, validation_split=0.2, batch_size=32)

    # Step 4: Get the best model and hyperparameters
    best_model = tuner.get_best_models(num_models=1)[0]
    best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

    print("Best Hyperparameters:")
    print(best_hyperparameters)

    # Step 5: Continue training the best model found by the search
    best_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

    # Step 6: Make predictions with the best model
    predictions = best_model.predict(X_test)

    # Inverse transform predicted and actual Close prices back to the price scale.
    # The values must sit in the Close column before inverse-transforming; placing
    # them in the last column and reading the Close column returns a constant.
    predictions_transformed = [
        inverse_transform_column(scaler, predictions[:, i], close_idx, n_features)
        for i in range(predictions.shape[1])  # one entry per horizon (1, 3, 7 days)
    ]
    y_test_transformed = [
        inverse_transform_column(scaler, y_test[:, i], close_idx, n_features)
        for i in range(y_test.shape[1])
    ]

    # Evaluate the model performance using RMSE for 1-day, 3-day, and 7-day predictions
    rmse_1day = np.sqrt(mean_squared_error(y_test_transformed[0], predictions_transformed[0]))
    rmse_3day = np.sqrt(mean_squared_error(y_test_transformed[1], predictions_transformed[1]))
    rmse_7day = np.sqrt(mean_squared_error(y_test_transformed[2], predictions_transformed[2]))

    print(f'Root Mean Squared Error (1-day): {rmse_1day}')
    print(f'Root Mean Squared Error (3-day): {rmse_3day}')
    print(f'Root Mean Squared Error (7-day): {rmse_7day}')

    # NOTE(review): index [0] is the prediction for the FIRST test window, not the
    # most recent one — confirm which value the dashboard should display.
    doc = {"ticker": ticker,
           "1D": predictions_transformed[0][0],
           "3D": predictions_transformed[1][0],
           "7D": predictions_transformed[2][0]}

    print(doc)

    print("PREDICTIONS TRANSFORMED")
    print(predictions_transformed)

    # Insert forecast into MongoDB Collection
    # result = lstm_coll.insert_one(doc)

    # Plot the actual vs predicted Close prices, one panel per forecast horizon
    plt.figure(figsize=(10, 8))

    horizon_labels = [('1-Day', '1-day'), ('3-Days', '3-days'), ('7-Days', '7-days')]
    for panel, (title_word, label_word) in enumerate(horizon_labels):
        plt.subplot(3, 1, panel + 1)
        plt.plot(y_test_transformed[panel], label=f'Actual Close Price ({label_word} ahead)')
        plt.plot(predictions_transformed[panel], label=f'Predicted Close Price ({label_word} ahead)')
        plt.title(f'{title_word} Ahead Close Price Prediction')
        plt.xlabel('Time')
        plt.ylabel('Close Price')
        plt.legend()

    plt.tight_layout()
    plt.show()