# Stock Price Prediction using Spectrograms and Vision Transformers

## Project Overview

This repository implements a pipeline to predict stock price movements using spectrograms generated from financial time-series data. The model uses Vision Transformers (ViTs) trained on spectrograms to capture temporal and frequency patterns in stock prices.

---

## Project Steps

### 1. **Data Acquisition**
- Download daily adjusted close prices for S&P 500 stocks from Yahoo Finance.
- Timeframe: From 2000 to the most recent date.

### **Training and Testing Split**

- **Training data**
  - Data from 2000 to 2014 was used for training.
  - A total of 46,875 time series segments were sampled for training, with an 80-20 split between training and validation.
- **Test data**
  - Data from 2016 to 2019 was used for testing.
  - A total of 15,625 time series segments were sampled for testing.


In [1]:
import yfinance as yf
import pandas as pd

# Download historical data for a list of tickers
def download_sp500_data(tickers, start_date, end_date):
    """Download price history for ``tickers`` and keep each ticker's 'Close' column.

    A single ``yf.download`` call is made for all tickers (grouped by ticker,
    with ``auto_adjust=True``). The result is a dict mapping every ticker in
    ``tickers`` to its 'Close' series with missing values dropped.
    """
    raw_prices = yf.download(
        tickers,
        start=start_date,
        end=end_date,
        group_by='ticker',
        auto_adjust=True,
    )
    closes_by_ticker = {}
    for symbol in tickers:
        closes_by_ticker[symbol] = raw_prices[symbol]['Close'].dropna()
    return closes_by_ticker

# Parameters
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META']  # Replace with a larger list
start_date = "2000-01-01"
end_date = "2023-12-31"

# Download the data
sp500_data = download_sp500_data(tickers, start_date, end_date)

[*********************100%***********************]  5 of 5 completed


In [2]:
# Create fixed-length segments
def create_segments(data, window_size=100, overlap=0.5):
    """Slice every series into fixed-length, optionally overlapping windows.

    Args:
        data: Mapping of ticker -> series exposing ``to_numpy()``.
        window_size: Number of observations in each segment (>= 1).
        overlap: Fraction of a window shared with the next one; must satisfy
            ``0 <= overlap < 1``.

    Returns:
        A list of ``{'ticker': ticker, 'segment': ndarray}`` dicts, ordered by
        ticker and then by start position. Series shorter than
        ``window_size`` contribute no segments.

    Raises:
        ValueError: If ``window_size`` or ``overlap`` is out of range, or if
            the combination yields a step smaller than one observation.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must satisfy 0 <= overlap < 1, got {overlap}")

    # `1 - overlap` is not exact in binary floating point
    # (1 - 0.9 == 0.09999999999999998), so a bare int() can truncate the step
    # one too low, or down to 0. The small nudge absorbs that noise while
    # keeping floor semantics for genuinely fractional steps.
    step = int(window_size * (1 - overlap) + 1e-9)
    if step < 1:
        raise ValueError(
            f"window_size={window_size} with overlap={overlap} gives a step of 0"
        )

    segments = []
    for ticker, series in data.items():
        values = series.to_numpy()
        for start in range(0, len(values) - window_size + 1, step):
            segments.append({'ticker': ticker, 'segment': values[start:start + window_size]})
    return segments

# Generate segments (50% overlap to increase the number of samples)
segments = create_segments(sp500_data, window_size=100, overlap=0.5)
print(f"Numero totale di segmenti generati: {len(segments)}")

Numero totale di segmenti generati: 510


In [7]:
import numpy as np
import pywt
import matplotlib.pyplot as plt
import os

# Create the directory where the spectrogram images are saved.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run unchanged on another machine. `output_dir` is also rebound by later
# cells in this notebook.
output_dir = "/Users/roberto/Desktop/UNI_DATASCIENCE/FDS/FDS_final/spectrograms"
os.makedirs(output_dir, exist_ok=True)

# Generate spectrograms for the segments
def generate_and_save_spectrograms(segments, wavelet='cmor', scale_range=(1, 64), save_dir=None):
    """Compute a CWT scalogram for every segment and save each one as a PNG.

    Args:
        segments: Iterable of dicts with a 'ticker' label and a 1-D 'segment'
            array.
        wavelet: Wavelet name passed to ``pywt.cwt``.
        scale_range: ``(start, stop)`` pair; the scales are
            ``np.arange(start, stop)``, so ``stop`` itself is excluded.
        save_dir: Directory that receives the PNG files. When ``None`` the
            notebook-level ``output_dir`` variable is used, as before.

    Each image is written to ``{dir}/{ticker}_segment_{idx}.png`` where
    ``idx`` is the position of the segment in ``segments``.
    """
    # NOTE(review): the recorded cell output shows a warning raised while the
    # wavelet is constructed from the bare name 'cmor'. PyWavelets documents
    # complex Morlet names in the form 'cmorB-C' (e.g. 'cmor1.5-1.0'). The
    # default is kept for backward compatibility; consider passing an explicit name.

    # Resolve the destination once: later cells rebind the global `output_dir`,
    # so callers can pin the directory explicitly via `save_dir`.
    target_dir = output_dir if save_dir is None else save_dir
    os.makedirs(target_dir, exist_ok=True)

    # The scales do not depend on the segment, so build them once.
    scales = np.arange(scale_range[0], scale_range[1])

    for idx, segment_data in enumerate(segments):
        ticker = segment_data['ticker']
        segment = segment_data['segment']

        # Continuous Wavelet Transform (CWT); the returned frequencies are not used.
        coefficients, _ = pywt.cwt(segment, scales, wavelet)

        # Render the magnitude of the coefficients
        plt.figure(figsize=(8, 6))
        plt.imshow(
            np.abs(coefficients),
            extent=[0, len(segment), scale_range[0], scale_range[1]],
            cmap='jet',
            aspect='auto',
            origin='lower'
        )
        plt.colorbar(label='Magnitudo')
        plt.title(f"Spettrogramma - {ticker} - Segment {idx}")
        plt.xlabel("Tempo")
        plt.ylabel("Scala")

        # Save the image and release the figure so memory does not grow per segment
        filename = f"{target_dir}/{ticker}_segment_{idx}.png"
        plt.savefig(filename)
        plt.close()

# Generate the spectrograms for all segments created above
generate_and_save_spectrograms(segments)

  wavelet = DiscreteContinuousWavelet(wavelet)


In [None]:
import os
import random
import shutil

# Define directories
# NOTE(review): base_dir is an absolute, machine-specific path, and this cell
# rebinds `output_dir`, a name the spectrogram-generation cell above also uses.
base_dir = "/Users/roberto/Desktop/UNI_DATASCIENCE/FDS/FDS_final/spectrograms"
output_dir = "data"

# Define split ratios
# NOTE: test_ratio is not passed to split_data() below; the test share is
# whatever remains after the train and validation shares.
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Define categories
# NOTE(review): split_data() reads images from base_dir/<category>; the
# spectrogram cell above writes its PNGs directly into the spectrogram folder,
# so these sub-folders presumably have to be populated separately — confirm.
categories = ["positive", "negative"]  # Adjust as needed

# Create output directories
for split in ["train", "val", "test"]:
    for category in categories:
        os.makedirs(os.path.join(output_dir, split, category), exist_ok=True)

# Split images
def split_data(base_dir, output_dir, categories, train_ratio, val_ratio, seed=None):
    """Copy each category's images into train/val/test sub-folders.

    Args:
        base_dir: Directory containing one sub-folder per category.
        output_dir: Destination root; files are copied to
            ``output_dir/<split>/<category>/``.
        categories: Names of the category sub-folders to process.
        train_ratio: Fraction of each category assigned to "train".
        val_ratio: Fraction assigned to "val"; the remainder goes to "test".
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used, as before.

    Only regular, non-hidden files are copied: entries such as ``.DS_Store``
    and sub-directories are skipped.
    """
    shuffler = random if seed is None else random.Random(seed)

    for category in categories:
        category_dir = os.path.join(base_dir, category)

        # Sort before shuffling so a given seed always produces the same split,
        # independent of the order in which the OS lists the directory.
        images = sorted(
            name
            for name in os.listdir(category_dir)
            if not name.startswith(".")
            and os.path.isfile(os.path.join(category_dir, name))
        )
        shuffler.shuffle(images)

        # 0.7 + 0.2 == 0.8999999999999999 in binary floating point, so a bare
        # int() would put the validation boundary one item too early (10 files
        # -> 7/1/2 instead of 7/2/1). The small nudge absorbs that noise.
        total = len(images)
        train_split = int(total * train_ratio + 1e-9)
        val_split = int(total * (train_ratio + val_ratio) + 1e-9)

        # Make sure every destination exists even when called on its own.
        for split in ("train", "val", "test"):
            os.makedirs(os.path.join(output_dir, split, category), exist_ok=True)

        for idx, image in enumerate(images):
            if idx < train_split:
                split = "train"
            elif idx < val_split:
                split = "val"
            else:
                split = "test"

            src = os.path.join(category_dir, image)
            dst = os.path.join(output_dir, split, category, image)
            shutil.copy(src, dst)

split_data(base_dir, output_dir, categories, train_ratio, val_ratio)

## Another Test:

In [None]:
# Preprocess and segment data
def preprocess_and_segment(data, window_size=100):
    """Min-max scale each series and cut it into non-overlapping windows.

    Args:
        data: Mapping of ticker -> pandas Series.
        window_size: Number of observations per segment (>= 1).

    Returns:
        A list of ``{"ticker": ticker, "segment": Series}`` dicts. A trailing
        remainder shorter than ``window_size`` is discarded.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    segments = []
    for ticker, series in data.items():
        # Series.ffill() replaces the deprecated fillna(method="ffill").
        series = series.ffill()

        # NOTE(review): min/max are taken over the whole series, so every
        # window is scaled with statistics that include later observations.
        low = series.min()
        span = series.max() - low
        if span == 0:
            # Constant series: avoid 0/0 -> NaN and map every value to 0.
            scaled_series = series - low
        else:
            scaled_series = (series - low) / span

        for start in range(0, len(scaled_series) - window_size + 1, window_size):
            segment = scaled_series.iloc[start:start + window_size]
            segments.append({"ticker": ticker, "segment": segment})
    return segments

# Create 100-day segments
# `sp500_data` is the ticker -> price-series dict built by the download cell
# at the top of this notebook; the bare name `data` was never defined at
# notebook level, so the original call raised NameError on a fresh kernel.
segments = preprocess_and_segment(sp500_data, window_size=100)
print(f"Generated {len(segments)} segments.")

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import pywt
import matplotlib.pyplot as plt
import os

In [3]:
# --- 1. Data Acquisition ---
def download_data(tickers, start_date, end_date):
    """
    Download adjusted close prices for a list of tickers from Yahoo Finance.

    Args:
        tickers: List of ticker symbols to request in a single download call.
        start_date: Start of the requested range, passed to ``yf.download`` as ``start``.
        end_date: End of the requested range, passed to ``yf.download`` as ``end``.

    Returns:
        Dict mapping each ticker to its 'Close' series with missing values
        dropped. Prices are requested with ``auto_adjust=True``.
    """
    data = yf.download(tickers, start=start_date, end=end_date, group_by='ticker', auto_adjust=True)
    return {ticker: data[ticker]['Close'].dropna() for ticker in tickers}

# Example tickers (use S&P 500 for actual implementation)
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META']
# Training and test periods are disjoint calendar ranges.
# NOTE(review): the date-range check further down in this notebook prints
# 2015-12-30 as the last training date for end "2015-12-31", which suggests
# the `end` date is exclusive — confirm against the yfinance documentation.
train_start, train_end = "2000-01-01", "2015-12-31"
test_start, test_end = "2016-01-01", "2019-12-31"

# Download training and test data
train_data = download_data(tickers, train_start, train_end)
test_data = download_data(tickers, test_start, test_end)

# --- 2. Data Preprocessing ---
def preprocess_data(data, window_size=100):
    """
    Preprocess data by normalizing and splitting into non-overlapping segments.

    Args:
        data: Mapping of ticker -> pandas Series.
        window_size: Number of observations per segment (>= 1).

    Returns:
        A list of ``{'ticker': ticker, 'segment': Series}`` dicts. A trailing
        remainder shorter than ``window_size`` is discarded.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    segments = []
    for ticker, series in data.items():
        # Forward fill missing values; Series.ffill() replaces the deprecated
        # fillna(method='ffill') that triggers the warning in the recorded output.
        series = series.ffill()

        # Min-max normalize to [0, 1].
        # NOTE(review): the statistics cover the whole series, so each window
        # is scaled using values from later dates as well.
        lowest = series.min()
        value_range = series.max() - lowest
        if value_range == 0:
            # Constant series: avoid 0/0 -> NaN and map every value to 0.
            scaled_series = series - lowest
        else:
            scaled_series = (series - lowest) / value_range

        # Create non-overlapping segments of `window_size` observations
        for i in range(0, len(scaled_series) - window_size + 1, window_size):
            segment = scaled_series.iloc[i:i + window_size]
            segments.append({'ticker': ticker, 'segment': segment})
    return segments

# Preprocess data: build the segment lists for the training and test periods
# using preprocess_data's default window size.
train_segments = preprocess_data(train_data)
test_segments = preprocess_data(test_data)

# --- 3. Spectrogram Generation ---
def generate_spectrogram(segment, output_dir, idx, wavelet='cmor'):
    """
    Render one segment as a wavelet time-frequency image and save it as a PNG.

    The saved array has 16 rows that repeat the raw series, followed by the
    absolute CWT coefficients for scales 1..63. The figure is written to
    ``{output_dir}/segment_{idx}.png``; ``output_dir`` is created if missing.
    """
    values = segment.to_numpy()

    # Continuous wavelet transform; only the coefficients are kept
    cwt_result = pywt.cwt(values, np.arange(1, 64), wavelet)
    magnitude = np.abs(cwt_result[0])

    # Intensity stripe (the series tiled into 16 rows) stacked with the magnitudes
    stripe = np.tile(values, (16, 1))
    image = np.vstack([stripe, magnitude])

    # Draw the figure
    plt.figure(figsize=(6, 4))
    plt.imshow(
        image,
        extent=[0, len(values), 1, 64],
        cmap='jet',
        aspect='auto',
        origin='lower',
    )
    plt.colorbar(label='Magnitude')
    plt.title(f"Spectrogram - Segment {idx}")
    plt.xlabel("Time")
    plt.ylabel("Scale")

    # Write the PNG, then close the figure
    os.makedirs(output_dir, exist_ok=True)
    target_path = f"{output_dir}/segment_{idx}.png"
    plt.savefig(target_path)
    plt.close()

# Generate spectrograms for the training data, then for the test data.
# Each split is written to its own folder; after the loop `output_dir` is
# left bound to the test folder, exactly as with two separate loops.
for output_dir, split_segments in (
    ("train_spectrograms", train_segments),
    ("test_spectrograms", test_segments),
):
    for idx, seg in enumerate(split_segments):
        generate_spectrogram(seg['segment'], output_dir, idx)

# --- 4. Prepare for Training/Test Split ---
def split_train_val(data, train_ratio=0.8, seed=None):
    """
    Split data into randomly shuffled training and validation sets.

    Args:
        data: Sequence of items (a list or a NumPy array).
        train_ratio: Fraction of the items assigned to the training set.
        seed: Optional seed for a reproducible split. When ``None`` the global
            NumPy random state is used, as before.

    Returns:
        ``(train, val)``. The input is not modified: the shuffle is applied to
        a permutation of indices rather than to ``data`` itself, so callers
        that rely on the original order (for example files that were saved
        using the item positions) are unaffected.
    """
    count = len(data)
    if seed is None:
        order = np.random.permutation(count)
    else:
        order = np.random.default_rng(seed).permutation(count)

    # Keep the container type: arrays stay arrays, everything else becomes a list.
    if isinstance(data, np.ndarray):
        shuffled = data[order]
    else:
        shuffled = [data[i] for i in order]

    # The small nudge absorbs float noise (e.g. 100 * 0.29 == 28.999999999999996)
    # so int() does not drop one item from the training set.
    split_idx = int(count * train_ratio + 1e-9)
    return shuffled[:split_idx], shuffled[split_idx:]

# Split into train/val for training data
# NOTE(review): this splits the in-memory segment dicts only; the spectrogram
# PNGs generated above are named by segment position and are not moved or
# relabelled by this step.
train_set, val_set = split_train_val(train_segments)

# --- Summary of Dataset ---
print(f"Training Set: {len(train_set)} segments")
print(f"Validation Set: {len(val_set)} segments")
print(f"Test Set: {len(test_segments)} segments")

[*********************100%***********************]  5 of 5 completed
[*********************100%***********************]  5 of 5 completed
  series = series.fillna(method='ffill')
  wavelet = DiscreteContinuousWavelet(wavelet)


Training Set: 125 segments
Validation Set: 32 segments
Test Set: 50 segments


In [4]:
import pandas as pd

# Fetch S&P 500 tickers from Wikipedia
def get_sp500_tickers(yahoo_format=True):
    """Return the S&P 500 constituent symbols listed on Wikipedia.

    Args:
        yahoo_format: When true (default), dots in symbols are replaced by
            dashes (e.g. ``BRK.B`` -> ``BRK-B``), the spelling Yahoo Finance
            uses for share classes, so the list can be passed straight to the
            download helpers in this notebook. Pass ``False`` to get the
            symbols exactly as they appear in the Wikipedia table.

    Returns:
        List of ticker symbol strings taken from the 'Symbol' column of the
        first table on the page.
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    table = pd.read_html(url, header=0)[0]
    symbols = table['Symbol']
    if yahoo_format:
        # regex=False: the dot is a literal character, not a wildcard.
        symbols = symbols.str.replace('.', '-', regex=False)
    return symbols.tolist()

# Rebinds `tickers` (previously the five example symbols) to the full list.
# NOTE(review): no download call follows in the cells shown here, so
# train_data/test_data presumably still hold the five example tickers — confirm.
tickers = get_sp500_tickers()
print(f"Total S&P 500 tickers: {len(tickers)}")

Total S&P 500 tickers: 503


In [5]:
# Adjust the segment creation to include overlap
def preprocess_data_with_overlap(data, window_size=100, overlap=0.5):
    """
    Preprocess data by normalizing and splitting into overlapping segments.

    Args:
        data: Mapping of ticker -> pandas Series.
        window_size: Number of observations per segment (>= 1).
        overlap: Fraction of a window shared with the next one; must satisfy
            ``0 <= overlap < 1``.

    Returns:
        A list of ``{'ticker': ticker, 'segment': Series}`` dicts.

    Raises:
        ValueError: If ``window_size`` or ``overlap`` is out of range, or if
            the combination yields a step smaller than one observation.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if not 0 <= overlap < 1:
        raise ValueError(f"overlap must satisfy 0 <= overlap < 1, got {overlap}")

    # `1 - overlap` is inexact in binary floating point
    # (1 - 0.9 == 0.09999999999999998); without the small nudge int() can
    # truncate the step one too low, or to 0, which makes range() fail.
    step = int(window_size * (1 - overlap) + 1e-9)
    if step < 1:
        raise ValueError(
            f"window_size={window_size} with overlap={overlap} gives a step of 0"
        )

    segments = []
    for ticker, series in data.items():
        # Forward fill missing values; Series.ffill() replaces the deprecated
        # fillna(method='ffill').
        series = series.ffill()

        # Min-max normalize to [0, 1].
        # NOTE(review): the statistics cover the whole series, so each window
        # is scaled using values from later dates as well.
        lowest = series.min()
        value_range = series.max() - lowest
        if value_range == 0:
            # Constant series: avoid 0/0 -> NaN and map every value to 0.
            scaled_series = series - lowest
        else:
            scaled_series = (series - lowest) / value_range

        # Create overlapping segments
        for i in range(0, len(scaled_series) - window_size + 1, step):
            segment = scaled_series.iloc[i:i + window_size]
            segments.append({'ticker': ticker, 'segment': segment})
    return segments

# Example with 50% overlap
# NOTE: this rebinds `train_segments`, which earlier held the non-overlapping
# segments produced by preprocess_data.
train_segments = preprocess_data_with_overlap(train_data, window_size=100, overlap=0.5)
print(f"Number of training segments: {len(train_segments)}")

Number of training segments: 310


  series = series.fillna(method='ffill')


In [7]:
pip install tensorflow 

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,

In [10]:
# Report the first and last available date of every training series
for ticker, series in train_data.items():
    first_day = series.index.min()
    last_day = series.index.max()
    print(f"{ticker}: Start - {first_day}, End - {last_day}")

AAPL: Start - 2000-01-03 00:00:00, End - 2015-12-30 00:00:00
MSFT: Start - 2000-01-03 00:00:00, End - 2015-12-30 00:00:00
GOOGL: Start - 2004-08-19 00:00:00, End - 2015-12-30 00:00:00
AMZN: Start - 2000-01-03 00:00:00, End - 2015-12-30 00:00:00
META: Start - 2012-05-18 00:00:00, End - 2015-12-30 00:00:00


In [8]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [9]:

# Example of data augmentation
# NOTE(review): horizontal_flip mirrors images left-to-right and rotation_range
# tilts them; if the x-axis of these images is time, both alter the temporal
# layout — confirm this is intended for spectrogram inputs.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)

# Augment images in batches during training
# NOTE(review): this rebinds `train_data`, which other cells use for the dict
# of downloaded price series (e.g. the date-range check). The recorded output
# shows FileNotFoundError for 'data/train', so the folder has to be created
# and populated before this cell can run.
train_data = datagen.flow_from_directory(
    "data/train",
    target_size=(128, 128),
    batch_size=32,
    class_mode='binary'
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/train'