<a href="https://colab.research.google.com/github/stereifberger/trader_news_tone/blob/main/Trader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminary

## GitHub Integration

In [None]:
# Clone the project repository and change into the checkout.
# The /content path is hard-coded, so this cell only works where the clone lands under /content.
!git clone "https://github.com/stereifberger/trader_news_tone"
%cd /content/trader_news_tone

In [None]:
# Set the global git identity, then stage, commit and push everything in the working tree.
# NOTE(review): `git add .` stages every file in the directory — confirm generated data files should be committed.
!git config --global user.email "sterei@outlook.com"
!git config --global user.name "stereifberger"
!git add .
!git commit -m "Added saving and loading datasets"
!git push

## Libraries

In [None]:
# Install the Alpaca trading client and the NewsAPI client.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
!pip install alpaca-trade-api
!pip install newsapi-python
#!pip install openai

In [13]:
import alpaca_trade_api as tradeapi
from alpaca_trade_api.rest import TimeFrame
from alpaca_trade_api.rest import REST
from alpaca_trade_api.rest import TimeFrame
import time
import matplotlib.pyplot as plt
import pickle
import importlib
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import yfinance as yf
import newsapi
from newsapi import NewsApiClient
from transformers import BertTokenizer, BertModel
import random
from datetime import datetime, timedelta
from keywords_2 import keywords
import numpy as np
#import openai
#from openai import OpenAI

## API Keys, Saving and Loading Data

In [None]:
# Alpaca REST client pointed at the paper-trading endpoint, plus a NewsAPI client.
# The key strings are placeholders; supply real credentials at runtime instead of committing them.
# NOTE(review): assigning to `newsapi` rebinds the name of the `newsapi` module imported in the imports cell.
api = REST('PUBLIC_ALPACA', 'SECRET_ALPACA', 'https://paper-api.alpaca.markets')
newsapi = NewsApiClient(api_key='NEWS_API')

In [7]:
# File paths of the pickles used to persist intermediate data.
# Two paths are absolute under /content; the other two are relative and resolve
# against the current working directory.
stock_dataframe_path = "/content/stock_dataframes.pkl"
avaiable_stocks_path = "avaiable_stocks.pkl"
news_path = 'news.pkl'
dataset_path = "/content/trader_news_tone/dataset.pkl"

In [None]:
#Save stock_dataframes (pandas dataframe)
#with open(stock_dataframe_path, 'wb') as f:
#    pickle.dump(stock_dataframes, f)

In [8]:
# Load stock_dataframes from the pickle at stock_dataframe_path.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load pickles you produced yourself.
with open(stock_dataframe_path, 'rb') as f:
    stock_dataframes = pickle.load(f)

# News

## Fetch News

In [15]:
# NewsAPI client queried by fetch_news; 'KEY' is a placeholder for the real API key.
newsapi_client = NewsApiClient(api_key='KEY')

In [16]:
def fetch_news(num_articles=5, hours_back=24, batch_size=100):
    """Fetch recent finance/stock news from NewsAPI and return it as a DataFrame.

    The look-back period is divided into equally sized time windows and one
    request (of at most ``batch_size`` articles) is sent per window, walking
    from the oldest window towards now.

    Args:
        num_articles: Number of articles wanted. If more are collected, a
            random sample of this size (without replacement) is returned.
        hours_back: Length of the look-back period in hours, ending now.
        batch_size: ``page_size`` passed to each NewsAPI request; must be >= 1.

    Returns:
        pandas.DataFrame with one row per article and at most ``num_articles``
        rows. It holds fewer rows (possibly none) when the whole period has
        been scanned without finding enough articles.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Define the time window (past 'hours_back' hours)
    end_date = datetime.now()
    start_date = end_date - timedelta(hours=hours_back)

    # One window per expected request. Ceiling division clamped to 1: the
    # previous floor division was 0 whenever num_articles < batch_size and
    # crashed with ZeroDivisionError (this included the default arguments).
    num_windows = max(1, -(-num_articles // batch_size))
    window = timedelta(hours=hours_back / num_windows)

    all_articles_list = []
    current_start = start_date

    # Stop once enough articles are collected OR the period is exhausted.
    # Without the second condition the loop re-queried an empty window forever
    # when the API returned fewer articles than requested.
    while len(all_articles_list) < num_articles and current_start < end_date:
        # Define the current end date for the batch
        current_end = min(current_start + window, end_date)

        # Format the dates as 'YYYY-MM-DDTHH:MM:SS'
        current_start_str = current_start.strftime('%Y-%m-%dT%H:%M:%S')
        current_end_str = current_end.strftime('%Y-%m-%dT%H:%M:%S')

        # Fetch a batch of financial news articles
        all_articles = newsapi_client.get_everything(q='finance OR stock',
                                                     language='en',
                                                     from_param=current_start_str,
                                                     to=current_end_str,
                                                     sort_by='publishedAt',
                                                     page_size=batch_size)

        # Append the batch to the list
        all_articles_list.extend(all_articles['articles'])

        # The next window starts where this one ended
        current_start = current_end

        print(f"Fetched {len(all_articles_list)} articles so far.")

    # Convert the accumulated articles into a DataFrame
    df = pd.DataFrame(all_articles_list)

    # Randomly sample 'num_articles' from the pool, without replacement
    if len(df) > num_articles:
        df = df.sample(n=num_articles, replace=False)

    return df

In [None]:
# Fetch news data: up to 2000 articles from the last 720 hours, 100 articles per request
news_df = fetch_news(2000, 720, 100)

## Map to stocks

In [18]:
# Register tqdm's pandas integration; it provides the progress_apply method used for the stock mapping.
tqdm.pandas(desc="Mapping to stocks")

def find_stock(content, keyword_dict):
    """Return the stocks whose keywords occur in an article's text.

    Args:
        content: Article text. Any non-string value (for example ``None`` or
            NaN for an article without a body) matches nothing instead of
            raising ``TypeError``.
        keyword_dict: Mapping of stock identifier to an iterable of keyword
            strings.

    Returns:
        list: Keys of ``keyword_dict``, in dictionary order, for which at
        least one keyword is a substring of ``content``. The match is
        case-sensitive. Empty list when nothing matches.
    """
    if not isinstance(content, str):
        return []

    return [
        stock
        for stock, stock_keywords in keyword_dict.items()
        if any(keyword in content for keyword in stock_keywords)
    ]

In [None]:
# Tag every article with the list of stocks whose keywords occur in its 'content' text.
news_df['stocks'] = news_df['content'].progress_apply(lambda x: find_stock(x, keywords))

## Embed News

In [None]:
# Load FinBERT tokenizer and model by their model id 'yiyanghkust/finbert-tone'.
# Both are module-level globals read by get_embedding.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertModel.from_pretrained('yiyanghkust/finbert-tone')

In [21]:
def get_embedding(text):
    """Embed ``text`` as the mean of the model's last hidden states.

    Uses the module-level ``tokenizer`` and ``model``. The text is tokenized
    with truncation and padding, run through the model without gradient
    tracking, averaged over dimension 1 of ``last_hidden_state``, squeezed and
    returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

def generate_news_embeddings(news_df):
    """Add an 'embedding' column computed from the 'content' column.

    The frame is modified in place (the new column is written onto the
    argument) and the same object is returned. A tqdm progress bar labelled
    "Generating embeddings" tracks the per-row calls to ``get_embedding``.
    """
    # (Re-)register progress_apply with the description for this step.
    tqdm.pandas(desc="Generating embeddings")

    embeddings = news_df['content'].progress_apply(get_embedding)
    news_df['embedding'] = embeddings

    return news_df

In [None]:
# Generate embeddings. generate_news_embeddings writes the 'embedding' column onto
# news_df itself, so news_with_embeddings and news_df refer to the same frame.
news_with_embeddings = generate_news_embeddings(news_df)

## Generate Keywords

In [None]:
# Replace with your OpenAI API key
# NOTE(review): the `OpenAI` import is commented out in the imports cell, so this line
# raises NameError unless that import is re-enabled.
client = OpenAI(api_key="KEY")
def generate_company_keywords(company_names):
  """Ask the chat model to complete a keyword dictionary for the given companies.

  A prompt containing three worked examples is extended with one empty
  ``'<company>': ,`` entry per company and sent to the module-level ``client``
  as a single chat completion request (model "gpt-4o-mini").

  Args:
    company_names: Iterable of company names. Elements that are themselves
      lists or tuples are flattened one level, so a chunk wrapped in an extra
      list is handled the same way as a flat list of names.

  Returns:
    The text content of the first completion choice.
  """
  prompt = f"""
  Complete my python dictionary for all companies with as many keywords as you can, relevant to that companies market performance:
  'Apple': ['apple', 'iphone', 'macbook', 'semiconductor', 'ipad', 'pc', 'microsoft', 'ai', 'china', 'google'],
  'Google': ['google', 'search', 'android', 'alphabet', 'apple', 'microsoft', 'openai', 'antitrust'],
  'Tesla': ['tesla', 'elon musk', 'electric vehicle', 'model 3', 'bmw', 'car', 'vw'],
  """

  # Flatten one level so every company name yields exactly one prompt entry.
  flat_names = []
  for entry in company_names:
    if isinstance(entry, (list, tuple)):
      flat_names.extend(entry)
    else:
      flat_names.append(entry)

  # One placeholder line per company. The previous nested loop over the same
  # list repeated every company len(company_names) times.
  for company in flat_names:
    prompt += f"'{company}': ,\n"

  completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": prompt}
    ]
  )

  return completion.choices[0].message.content

In [None]:
# Split asset_names into chunks of 20 and request keywords chunk by chunk,
# appending every response to keywords.txt.
# NOTE(review): asset_names is assigned in the "Get active Assets" section further down,
# so that section has to be executed before this cell.
# NOTE(review): each chunk is wrapped in an extra list ([asset_names[i:i+20]]), so
# generate_company_keywords receives a one-element list holding the chunk — confirm that is intended.
i = 0
asset_junks = []

while i < len(asset_names):
    asset_junks.append([asset_names[i:i+20]])
    i += 20

for assets in tqdm(asset_junks):
  generated_keywords = generate_company_keywords(assets)
  # Append mode: re-running the cell adds to the existing file instead of replacing it.
  with open(f"keywords.txt", 'a') as f:
    f.write(generated_keywords)

# Fetch Market Data

In [23]:
# Convert 'publishedAt' to datetime format
news_df['publishedAt'] = pd.to_datetime(news_df['publishedAt'])

# Keep only entries published at hour 8 or later (drops hour < 8).
# NOTE(review): the hour is taken in whatever timezone 'publishedAt' carries — presumably UTC; confirm against market hours.
news_df = news_df[news_df['publishedAt'].dt.hour >= 8]

# Filter out weekends (Saturday=5, Sunday=6)
news_df = news_df[~news_df['publishedAt'].dt.weekday.isin([5, 6])]


In [24]:
def fetch_alpaca_day_data(stock, published_date, timeframe):
    """Fetch bars for ``stock`` from the publication day up to the following day.

    Args:
        stock: Symbol passed to ``api.get_bars``.
        published_date: Date/datetime; only its calendar day is used.
        timeframe: Bar resolution passed through to ``api.get_bars``.

    Returns:
        The bars DataFrame, with a timezone-naive index localized to UTC.
        An empty DataFrame when no bars came back or when the request (or the
        index handling) raised any exception.
    """
    day_format = '%Y-%m-%d'
    start_date = published_date.strftime(day_format)
    end_date = (published_date + timedelta(days=1)).strftime(day_format)  # the next day

    try:
        bars = api.get_bars(stock, timeframe, start=start_date, end=end_date).df

        if not bars.empty:
            # Make a naive index timezone-aware (the data is assumed to be UTC).
            if bars.index.tz is None:
                bars.index = bars.index.tz_localize('UTC')
            return bars
    except Exception:
        # Best effort: any failure is treated like "no data" so the caller skips this stock.
        pass

    return pd.DataFrame()


In [None]:
def fetch_yahoo_data(stock):
    """Look up gross profit, total revenue and EBITDA for ``stock`` via yfinance.

    Returns:
        dict with the keys 'profit', 'sales' and 'ebitda'. Each value is the
        first entry of the matching row of ``Ticker.financials``, or ``None``
        when that row is absent. If anything raises, all three values are
        ``None``.
    """
    # Result key -> row label in the financials table.
    row_labels = {
        'profit': 'Gross Profit',
        'sales': 'Total Revenue',
        'ebitda': 'EBITDA',
    }

    try:
        financials = yf.Ticker(stock).financials

        stock_data = {}
        for field, label in row_labels.items():
            if label in financials.index:
                stock_data[field] = financials.loc[label].iloc[0]
            else:
                stock_data[field] = None
    except Exception:
        # Best effort: report every figure as missing on any failure.
        stock_data = dict.fromkeys(row_labels)

    return stock_data

In [None]:
def generate_market_data(news_df, X, Y, Z):
    """Build one labelled minute-bar frame per (article, stock) pair.

    For every article and every stock tagged on it, the publication day's
    minute bars are fetched and restricted to the window
    ``[publishedAt - X minutes, publishedAt + Y minutes]``. Pairs whose window
    holds fewer than ``X + Y`` bars are skipped.

    Args:
        news_df: DataFrame with the columns 'publishedAt', 'stocks' (a list of
            symbols, or the string form of such a list) and 'embedding'.
        X: Minutes of market data to keep before the publication time.
        Y: Minutes after the publication time; also the number of trailing
            bars excluded from the feature rows.
        Z: Threshold, in percent, that the price change must exceed for a
            positive label.

    Returns:
        dict mapping ``(stock, published_time)`` to a DataFrame. Each frame
        holds one feature row per bar (all bars except the trailing ``Y``)
        followed by a final row whose only non-null data field is 'target':
        1 if the close rose by more than ``Z`` percent between
        ``close.iloc[-Y]`` and ``close.iloc[-1]``, else 0.
    """
    market_data_dict = {}

    for _, row in tqdm(news_df.iterrows(), total=len(news_df)):
        published_time = pd.to_datetime(row['publishedAt'])

        # Ensure the published_time is timezone-aware (assuming it's in UTC)
        if published_time.tz is None:
            published_time = published_time.tz_localize('UTC')

        stock_list = row['stocks']
        embedding = row['embedding']  # Get the embedding for this news entry

        # Ensure that stock_list is a list; if it's a string, convert it to a list by splitting on commas
        if isinstance(stock_list, str):
            stock_list = stock_list.strip("[]").replace("'", "").split(", ")

        window_start = published_time - timedelta(minutes=X)
        window_end = published_time + timedelta(minutes=Y)

        for stock in stock_list:
            stock = stock.strip()

            # Fetch Alpaca data for the entire publication day
            alpaca_day_data = fetch_alpaca_day_data(stock, published_time, tradeapi.TimeFrame.Minute)

            # Skip if the Alpaca data is empty
            if alpaca_day_data.empty:
                continue

            # Keep the bars from X minutes before to Y minutes after publication
            in_window = (alpaca_day_data.index >= window_start) & (alpaca_day_data.index <= window_end)
            filtered_data = alpaca_day_data[in_window]

            # Skip windows that cannot yield a label. Checking this before the
            # Yahoo request avoids a network call whose result would be discarded.
            if filtered_data.empty or len(filtered_data) < X + Y:
                continue

            # Fetch Yahoo Finance data
            yahoo_data = fetch_yahoo_data(stock)

            # One feature row per bar, excluding the trailing Y bars
            current_stock_data = []
            for _, bar in filtered_data[:-Y].iterrows():
                current_stock_data.append({
                    'stock': stock,
                    'publishedAt': row['publishedAt'],
                    'open': bar['open'],
                    'high': bar['high'],
                    'low': bar['low'],
                    'close': bar['close'],
                    'volume': bar['volume'],
                    'profit': yahoo_data['profit'],
                    'sales': yahoo_data['sales'],
                    'ebitda': yahoo_data['ebitda'],
                    'embedding': embedding
                })

            # Percentage change of the close over the trailing bars. The
            # parentheses matter: without them the expression reduced to
            # `future_close - 100` because `last / last * 100` is evaluated first.
            last_close = filtered_data['close'].iloc[-Y]
            future_close = filtered_data['close'].iloc[-1]
            price_increase = (future_close - last_close) / last_close * 100

            # Add a final row with empty values except for 'target'
            current_stock_data.append({
                'stock': stock,
                'publishedAt': row['publishedAt'],
                'open': None,
                'high': None,
                'low': None,
                'close': None,
                'volume': None,
                'profit': None,
                'sales': None,
                'ebitda': None,
                'embedding': None,
                # int() works for Python and NumPy booleans alike
                'target': int(price_increase > Z)
            })

            # Store the DataFrame in the dictionary with key as (stock, publishedAt)
            market_data_dict[(stock, published_time)] = pd.DataFrame(current_stock_data)

            # Pause between stored samples (throttles the external requests).
            time.sleep(3)

    return market_data_dict

# Example usage
X = 30  # Last X minutes of stock data
Y = 20  # Y minutes after the last market data to check price increase
Z = 1   # Z percent price increase threshold

# Assuming news_df is already loaded with relevant columns.
# Only the first 100 articles are processed here.
market_data_dict = generate_market_data(news_df.head(100), X, Y, Z)

In [None]:
# Save market_data_dict to dataset_path
with open(dataset_path, 'wb') as f:
    pickle.dump(market_data_dict, f)

In [None]:
# Load market_data_dict from dataset_path
# NOTE(review): pickle.load can execute arbitrary code from the file — only load pickles you produced yourself.
with open(dataset_path, 'rb') as f:
    market_data_dict = pickle.load(f)

In [None]:
# Minimum length the embedding is padded to (longer embeddings are left unchanged)
max_embedding_length = 128  # Set this to your desired length

# One record per (stock, timestamp) frame is collected here
data = []

# Traverse through the dictionary to process each dataframe
for (stock, timestamp), df in market_data_dict.items():
    # Sort rows by index label
    df = df.sort_index()

    # Positions of the last two rows of the frame
    feature_end_index = len(df)-2
    target_index = len(df)-1  # Position of the final row, from which 'target' is read

    # len(df) - target_index is always 1, so this loop runs exactly once with idx == 0
    for idx in range(len(df) - target_index):
        # Slice each feature series from idx up to, but not including, feature_end_index.
        # NOTE(review): the row at feature_end_index itself is left out of the series
        # although the embedding is read from it — confirm this is intended.
        open_series = df.iloc[idx:feature_end_index]['open'].tolist()
        high_series = df.iloc[idx:feature_end_index]['high'].tolist()
        low_series = df.iloc[idx:feature_end_index]['low'].tolist()
        close_series = df.iloc[idx:feature_end_index]['close'].tolist()
        volume_series = df.iloc[idx:feature_end_index]['volume'].tolist()
        profit_series = df.iloc[idx:feature_end_index]['profit'].tolist()
        sales_series = df.iloc[idx:feature_end_index]['sales'].tolist()
        ebitda_series = df.iloc[idx:feature_end_index]['ebitda'].tolist()

        # Fetch the target from the final row
        target_value = df.iloc[target_index]['target']

        # Embedding stored on the second-to-last row
        embedding_vector = df.iloc[feature_end_index]['embedding']

        # Only keep samples that have an embedding
        if embedding_vector is not None:
            # Convert embedding to a numpy array and pad it
            embedding_array = np.array(embedding_vector)
            # Zero-pad at the end up to max_embedding_length; never truncates
            padded_embedding = np.pad(embedding_array, (0, max(0, max_embedding_length - len(embedding_array))), 'constant')

            # Add to data list
            data.append({
                'stock_under_timestamp': f"{stock}_{timestamp}",
                'open': open_series,
                'high': high_series,
                'low': low_series,
                'close': close_series,
                'volume': volume_series,
                'profit': profit_series,
                'sales': sales_series,
                'ebitda': ebitda_series,
                'embedding': padded_embedding,  # Use padded embedding
                'target': target_value
            })

# Creating DataFrame
final_df = pd.DataFrame(data)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Drop the identifier column so only model inputs and the target remain.
# Not idempotent: running this cell a second time raises KeyError because the column is already gone.
final_df = final_df.drop('stock_under_timestamp', axis=1)

In [None]:
def pad_vector_column(df, column_name, max_length):
  """Return the vectors of one column, each brought to exactly ``max_length`` items.

  Shorter vectors are zero-padded at the end with ``np.pad`` (yielding a NumPy
  array); vectors of length ``max_length`` or more are sliced to their first
  ``max_length`` items (keeping their original type). The DataFrame itself is
  not modified.
  """
  def _fit(vector):
    missing = max_length - len(vector)
    if missing > 0:
      return np.pad(vector, (0, missing), 'constant')
    return vector[:max_length]

  return [_fit(vector) for vector in df[column_name]]

# Longest vector found in the 'embedding' column
max_embedding_length = final_df['embedding'].apply(len).max()  # Find the maximum length of embedding

# Pad (or cut) the 'open', 'high', 'low', 'close', 'volume', 'profit', 'sales', 'ebitda' vectors
# to that same length.
# NOTE(review): the market series are sized to the embedding length, not to the longest market series — confirm this is intended.
columns_to_pad = ['open', 'high', 'low', 'close', 'volume', 'profit', 'sales', 'ebitda']
for column in columns_to_pad:
    final_df[column] = pad_vector_column(final_df, column, max_embedding_length)

In [None]:
def replace_nan_none_with_zero(df, columns_to_process):
    """Replace missing entries (None / NaN) inside vector columns with 0.

    Every listed column is expected to hold one vector per row. The columns
    are overwritten on ``df`` itself, and the same frame is returned.
    """
    def _zero_fill(vector):
        return [0 if (x is None or pd.isnull(x)) else x for x in vector]

    for column in columns_to_process:
        df[column] = df[column].apply(_zero_fill)

    return df

# Zero-fill missing values in every market-data vector column of final_df.
columns_to_process = ['open', 'high', 'low', 'close', 'volume', 'profit', 'sales', 'ebitda']
final_df = replace_nan_none_with_zero(final_df, columns_to_process)


In [None]:
# `new` is a second name for the same DataFrame object, not a copy:
# column assignments made through `new` also show up in `final_df`.
new = final_df

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object (re-fitted for each column in the loop below)
scaler = StandardScaler()

# Standardize every market-data column using statistics pooled over all rows
for column in ['open', 'high', 'low', 'close', 'volume', 'profit', 'sales', 'ebitda']:
  # Flatten the list of vectors into a single array
  all_values = [value for sublist in new[column] for value in sublist]

  # Reshape the array to fit the scaler
  all_values_reshaped = np.array(all_values).reshape(-1, 1)

  # Fit and transform the scaler on all values in the column
  scaled_values = scaler.fit_transform(all_values_reshaped)

  # Cut the flat scaled array back into vectors of the original lengths
  new_values = []
  start_index = 0
  for vector in new[column]:
    end_index = start_index + len(vector)
    new_values.append(scaled_values[start_index:end_index].flatten().tolist())
    start_index = end_index

  # Replace the original column with the scaled values
  new[column] = new_values



In [None]:
def normalize_vector_column(df, column_name):
  """Min-max scale every value of a vector column into the range [0, 1].

  The minimum and maximum are taken over all values of all vectors in the
  column. When every value is identical (zero range), each entry becomes 0.
  Returns a new list of lists; ``df`` is not modified.
  """
  pooled = [value for vector in df[column_name] for value in vector]

  min_val = np.min(pooled)
  max_val = np.max(pooled)
  value_range = max_val - min_val

  def _scale(x):
    return (x - min_val) / value_range if value_range != 0 else 0

  return [[_scale(x) for x in vector] for vector in df[column_name]]

# Normalize the 'open', 'high', 'low', 'close', 'volume', 'profit', 'sales', 'ebitda' vectors to [0, 1].
# This min-max rescaling runs on `new`, i.e. on top of the StandardScaler output of the previous cell.
columns_to_normalize = ['open', 'high', 'low', 'close', 'volume', 'profit', 'sales', 'ebitda']
for column in columns_to_normalize:
  new[column] = normalize_vector_column(new, column)

In [None]:
def convert_vector_to_long(df, columns_to_convert):
  """Cast every element of the given vector columns to ``int``.

  ``int()`` truncates toward zero. The columns are overwritten on ``df``
  itself, and the same frame is returned.
  """
  def _as_ints(vector):
    return list(map(int, vector))

  for column in columns_to_convert:
    df[column] = df[column].apply(_as_ints)

  return df

# NOTE(review): the last list element is the bare name `embedding`, not the string 'embedding';
# unless a variable of that name exists this line raises NameError before anything is converted.
# NOTE(review): int() truncates toward zero, so values that were rescaled to [0, 1] would all
# become 0 or 1 — confirm this conversion is wanted at all.
columns_to_convert = ['open', 'high', 'low', 'close', 'volume', 'profit', 'sales', 'ebitda', embedding]
new = convert_vector_to_long(new, columns_to_convert)


## Get Active Assets

In [None]:
# Ask Alpaca (through the `api` client) for all assets whose status is 'active'.
active_assets = api.list_assets(status='active')  # you could leave out the status to also get the inactive ones

In [None]:
# Names of the active assets; the keyword-generation cell in "Generate Keywords" reads this list.
asset_names = [asset.name for asset in active_assets]

# Training and Evaluation Functions

In [None]:
class AsymmetricBCEWithLogitsLoss(nn.Module):
    """Binary cross-entropy on logits with an extra weight on negative-label elements.

    The element-wise BCE-with-logits loss is multiplied by ``alpha`` wherever
    the target equals 0 and left unchanged elsewhere, then averaged. With
    ``alpha > 1`` a confident positive prediction on a negative sample (a
    false positive) costs more; note that the factor is applied to every
    negative-label element, not only to the misclassified ones.

    Args:
        pos_weight: Optional tensor forwarded as ``pos_weight`` to
            ``binary_cross_entropy_with_logits``. It is stored as a plain
            attribute, so the caller must place it on the right device.
        alpha: Multiplier for the loss of elements whose target is 0.
    """

    def __init__(self, pos_weight=None, alpha=1.0):
        super().__init__()
        self.pos_weight = pos_weight
        self.alpha = alpha

    def forward(self, inputs, targets):
        """Return the mean alpha-weighted loss for raw logits ``inputs`` and 0/1 ``targets``."""
        # Element-wise loss; the reduction happens after the alpha weighting.
        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, pos_weight=self.pos_weight, reduction='none'
        )

        # Scale the loss of negative-label elements by alpha.
        modulated_loss = torch.where(targets == 0, self.alpha * bce_loss, bce_loss)

        return modulated_loss.mean()

In [None]:
def calculate_confirmatory_factor(y_true, y_pred, predictions, precision):
    """Relate the achieved precision to the base rate of positive labels.

    Args:
        y_true: Tensor of true labels; entries equal to 1 count as positives.
        y_pred: Not used by the computation.
        predictions: Not used by the computation.
        precision: Precision of the classifier.

    Returns:
        (base_prob, confirmatory_factor): the share of 1s in ``y_true`` and
        ``precision / base_prob``. The factor is ``inf`` when ``base_prob`` is
        not greater than 0.
    """
    # Prior P(H): proportion of positives among the true labels.
    positive_mask = (y_true == 1).float()
    base_prob = positive_mask.mean().item()

    # No positives at all: the ratio is undefined, report infinity.
    if not base_prob > 0:
        return base_prob, float('inf')

    return base_prob, precision / base_prob

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class StockDataset(Dataset):
    """Torch dataset over a DataFrame that holds one sample per row.

    Each item is a dict with
      * 'input': the 'open', 'high', 'low', 'profit', 'sales' and 'ebitda'
        vectors (as float tensors) followed by the 'embedding' vector, all
        concatenated into one 1-D tensor. The 'close' and 'volume' columns
        are not part of the input.
      * 'target': the row's 'target' value as a float tensor.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Market-data vectors, in the order they appear in the input tensor.
        series_columns = ('open', 'high', 'low', 'profit', 'sales', 'ebitda')
        pieces = [torch.tensor(row[name], dtype=torch.float) for name in series_columns]

        # The embedding goes last and keeps the dtype torch infers for it.
        pieces.append(torch.tensor(row['embedding']))

        return {
            'input': torch.cat(pieces),
            'target': torch.tensor(row['target'], dtype=torch.float),
        }

# Create your dataset
stock_dataset = StockDataset(final_df)

# Create a DataLoader instance (reshuffled on every pass because shuffle=True)
batch_size = 16  # or any size you prefer
stock_dataloader = DataLoader(stock_dataset, batch_size=batch_size, shuffle=True)

# Iterate over DataLoader. The loop only unpacks each batch and does no training;
# after it finishes, `inputs` and `targets` hold the last batch.
for batch in stock_dataloader:
    inputs = batch['input']  # batched inputs
    targets = batch['target']  # batched targets

    # Now you can pass these inputs and targets to your model
    # e.g., outputs = model(inputs)

In [None]:
import os
# Debugging aid: sets the CUDA_LAUNCH_BLOCKING environment variable to '1' for this process.
# NOTE(review): presumably this has to be set before CUDA is first initialized to take effect — confirm cell order.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
def train_and_evaluate(stock_dataframes, X, Y, Z, model_type, epochs, batch_size, learning_rate, architecture_params):
    """Train a model on 80% of the samples and evaluate it on the held-out 20%.

    Args:
        stock_dataframes: DataFrame with one sample per row; the last column
            is the binary target, all other columns are features.
        X, Y, Z: Accepted for interface compatibility with grid_search; not
            used by the training itself.
        model_type: Model identifier passed to ``get_model``.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size of the training DataLoader.
        learning_rate: Learning rate of the Adam optimizer.
        architecture_params: Dict with the keys 'encoder_params' and
            'decoder_params', both forwarded to ``get_model``.

    Returns:
        Tuple ``(test_loss, accuracy, precision, recall, f1, base_prob,
        confirmatory_factor)`` measured on the test split.
    """
    # Prepare data
    data = stock_dataframes.to_numpy()

    # Split every sample into its features and its target
    features, targets = [], []
    for stock_data in data:
        if len(stock_data.shape) == 1:
            # If stock_data is a single row (vector), separate the features and target manually
            stock_features = stock_data[:-1]  # All elements except the last
            stock_target = stock_data[-1]     # The last element is the target
        else:
            # Normal case where stock_data has multiple rows
            stock_features = stock_data[:-1]  # All rows except the last
            stock_target = stock_data[-1][-1]  # Last row, last element is the target

        features.append(stock_features)
        targets.append(stock_target)

    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

    # Convert the SPLIT data to tensors. Previously the complete feature and
    # target lists were converted for both train and test, so the model was
    # evaluated on the very samples it had been trained on.
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)  # Add dimension for binary classification
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)  # Add dimension for binary classification

    # Create DataLoader for batching
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    X_test = X_test.to(device)
    y_test = y_test.to(device)

    # Initialize model
    model = get_model(
        model_type,
        encoder_params=architecture_params['encoder_params'],
        decoder_params=architecture_params['decoder_params'],
        device=device
    )
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Calculate the positive class weight for imbalanced datasets
    num_ones = (y_train == 1).sum().item()
    num_zeros = (y_train == 0).sum().item()
    if num_ones > 0:
        pos_weight = torch.tensor([num_zeros / num_ones], dtype=torch.float32).to(device)
    else:
        # No positive sample in the training split: fall back to an unweighted
        # loss instead of dividing by zero.
        pos_weight = None

    # Loss on raw logits; alpha scales the loss of negative-label samples
    criterion = AsymmetricBCEWithLogitsLoss(pos_weight=pos_weight, alpha=1.0)
    criterion = criterion.to(device)

    # Training loop with batching
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X, 1)

            # Ensure the output has the same shape as the target by squeezing the extra dimension
            outputs = torch.squeeze(outputs, dim=-1)  # Squeeze the last dimension (which is 1)

            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_loss = epoch_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}')

    # Evaluation on test set
    # NOTE(review): model.eval() is intentionally left disabled as before; with it off,
    # dropout (if any) stays active during evaluation — confirm this is wanted.
    #model.eval()
    with torch.no_grad():
        test_outputs = model(X_test, 1)
        #test_outputs = torch.sigmoid(test_outputs)
        # Ensure the output has the same shape as the target by squeezing the extra dimension
        test_outputs = torch.squeeze(test_outputs, dim=-1)  # Squeeze the last dimension (which is 1)
        test_loss = criterion(test_outputs, y_test).item()

        # Binarize predictions with a threshold of 0.5.
        # NOTE(review): the threshold is applied to the raw model outputs — confirm whether
        # they are logits or probabilities.
        threshold = 0.5
        predictions = (test_outputs > threshold).float()
        print(test_outputs[:20])

        precision = precision_score(y_test.cpu(), predictions.cpu())
        recall = recall_score(y_test.cpu(), predictions.cpu())
        f1 = f1_score(y_test.cpu(), predictions.cpu())
        accuracy = ((predictions == y_test).float().mean()).item()

        # Calculate additional metrics (base_prob, confirmatory_factor)
        base_prob, confirmatory_factor = calculate_confirmatory_factor(y_test, test_outputs, predictions, precision)

    # Cleanup
    del model, optimizer, criterion
    torch.cuda.empty_cache()

    return test_loss, accuracy, precision, recall, f1, base_prob, confirmatory_factor

# Define parameter grid for grid search
# (earlier variant with a single 'architecture' entry, kept for reference)
#param_grid = {
#    'X': [10],  # Look-back window sizes
#    'Y': [5],  # Prediction horizon
#    'Z': [1],  # Price increase thresholds
#    'model_type': ['transformer'],  # Model types
#    'epochs': [10],
#    'batch_size': 64,
#    'learning_rate': 0.001,
#    'architecture': [{'input_dim': 768, 'num_heads': 2, 'num_layers': 2, 'output_dim': 1}]
#}
# List-valued entries are iterated by grid_search; 'batch_size' and 'learning_rate' are single values.
param_grid = {
    'X': [10],  # Look-back window sizes
    'Y': [5],   # Prediction horizon
    'Z': [1],   # Price increase thresholds
    'model_type': ['transformer'],  # Model types
    'epochs': [20],
    'batch_size': 128,
    'learning_rate': 0.001,
    'encoder_params': [{'input_dim': 768, 'emb_dim': 240, 'num_heads': 8, 'hidden_dim': 2048, 'num_layers': 6, 'dropout': 0}],
    'decoder_params': [{'output_dim': 1, 'emb_dim': 240, 'num_heads': 8, 'hidden_dim': 2048, 'num_layers': 6}]
}

# Run grid search
# NOTE(review): grid_search and get_model are defined in cells further down, so those cells must have run first.
grid_search(new, param_grid)

In [None]:
def grid_search(stock_dataframes, param_grid):
    """Run train_and_evaluate for every combination in ``param_grid``.

    Iterates over the lists under 'X', 'Y', 'Z', 'model_type', 'epochs' and
    'architecture'; 'batch_size' and 'learning_rate' are single values. Every
    result is printed as it arrives, and all results are written to
    'training_results.csv'. Returns nothing.
    """
    # Lazily enumerate the combinations in the same order as nested loops would.
    combinations = (
        (X, Y, Z, model_type, epochs, architecture_params)
        for X in param_grid['X']
        for Y in param_grid['Y']
        for Z in param_grid['Z']
        for model_type in param_grid['model_type']
        for epochs in param_grid['epochs']
        for architecture_params in param_grid['architecture']
    )

    results = []
    for X, Y, Z, model_type, epochs, architecture_params in combinations:
        outcome = train_and_evaluate(
            stock_dataframes, X, Y, Z, model_type, epochs,
            param_grid['batch_size'], param_grid['learning_rate'], architecture_params
        )
        test_loss, accuracy, precision, recall, f1, base_prob, confirmatory_factor = outcome

        # Key order determines the column order of the CSV.
        result = dict([
            ('confirmatory_factor', confirmatory_factor),
            ('precision', precision),
            ('recall', recall),
            ('base_prob', base_prob),
            ('f1', f1),
            ('accuracy', accuracy),
            ('X', X),
            ('Y', Y),
            ('Z', Z),
            ('model_type', model_type),
            ('epochs', epochs),
            ('test_loss', test_loss),
        ])
        results.append(result)
        print(result)

    df_results = pd.DataFrame(results)
    df_results.to_csv('training_results.csv', index=False)

In [None]:
def grid_search(stock_dataframes, param_grid):
    """Train and evaluate one model per parameter combination and collect the metrics.

    Args:
        stock_dataframes: Dataset handed unchanged to ``train_and_evaluate``.
        param_grid: Dict with list-valued entries 'X', 'Y', 'Z', 'model_type',
            'epochs', 'encoder_params' and 'decoder_params' (all combinations
            are tried) and the single values 'batch_size' and 'learning_rate'.

    Returns:
        pandas.DataFrame with one row per combination (metrics plus the X, Y,
        Z, model_type and epochs that produced them). The same table is also
        written to 'training_results.csv', and every row is printed as soon
        as it is available.
    """
    results = []

    for X in param_grid['X']:
        for Y in param_grid['Y']:
            for Z in param_grid['Z']:
                for model_type in param_grid['model_type']:
                    for epochs in param_grid['epochs']:
                        for encoder_params in param_grid['encoder_params']:
                            for decoder_params in param_grid['decoder_params']:
                                test_loss, accuracy, precision, recall, f1, base_prob, confirmatory_factor = train_and_evaluate(
                                    stock_dataframes, X, Y, Z, model_type, epochs,
                                    param_grid['batch_size'], param_grid['learning_rate'],
                                    {'encoder_params': encoder_params, 'decoder_params': decoder_params}
                                )
                                result = {
                                    'confirmatory_factor': confirmatory_factor,
                                    'precision': precision,
                                    'recall': recall,
                                    'base_prob': base_prob,
                                    'f1': f1,
                                    'accuracy': accuracy,
                                    'X': X,
                                    'Y': Y,
                                    'Z': Z,
                                    'model_type': model_type,
                                    'epochs': epochs,
                                    'test_loss': test_loss
                                }
                                results.append(result)
                                print(result)

    df_results = pd.DataFrame(results)
    df_results.to_csv('training_results.csv', index=False)

    # Hand the table back so callers can inspect it without re-reading the CSV.
    return df_results


# Models

In [None]:
class LSTMEncoderDecoder(nn.Module):
    """LSTM encoder, single-step LSTM decoder and a linear output head.

    The encoder reads the input sequence. The final hidden state of its top
    layer is fed to the decoder as a sequence of length one, and the decoder
    output is projected to ``output_dim`` raw logits (no sigmoid is applied).

    Args:
        input_dim (int): Number of features per time step.
        hidden_dim (int): Hidden size of both LSTMs.
        num_layers (int): Number of stacked layers in both LSTMs.
        output_dim (int): Size of the output of the linear head.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMEncoderDecoder, self).__init__()
        # Both LSTMs are batch-first: [batch, seq_len, features].
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # No sigmoid here

    def forward(self, x):
        """Return raw logits for ``x``.

        Args:
            x (torch.Tensor): ``[batch, seq_len, input_dim]`` (or unbatched
                ``[seq_len, input_dim]``).

        Returns:
            torch.Tensor: ``[batch, output_dim]`` (or ``[output_dim]`` for
            unbatched input).
        """
        _, (hidden, _) = self.encoder(x)
        # hidden[-1] is the top layer's final state: [batch, hidden_dim].
        # The decoder is batch_first, so the length-1 sequence axis belongs
        # right before the feature axis. Inserting it at dim 0 would present
        # the whole batch as ONE sequence and let every sample's output
        # depend on the samples preceding it in the batch.
        summary = hidden[-1].unsqueeze(-2)
        output, _ = self.decoder(summary)
        output = self.fc(output.squeeze(-2))  # No sigmoid here
        return output  # Return raw logits

In [None]:
class TransformerModel(nn.Module):
    """Transformer encoder with a linear + sigmoid head on the last time step.

    Args:
        input_dim (int): Feature size per time step; also used as the
            transformer ``d_model``.
        num_heads (int): Number of attention heads.
        num_layers (int): Number of stacked encoder layers.
        output_dim (int): Size of the output of the linear head.
    """

    def __init__(self, input_dim, num_heads, num_layers, output_dim):
        super(TransformerModel, self).__init__()
        # batch_first=True so that the encoder's layout agrees with the
        # `x[:, -1, :]` last-time-step selection in forward(). With the
        # default (sequence-first) layout, batch-first input makes attention
        # run across the samples of a batch instead of across time steps.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=num_heads, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(input_dim, output_dim)
        # NOTE(review): this model returns probabilities, whereas
        # LSTMEncoderDecoder returns raw logits. Confirm that the loss used
        # for training matches whichever model is selected.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs for ``x``.

        Args:
            x (torch.Tensor): ``[batch_size, sequence_length, input_dim]``.

        Returns:
            torch.Tensor: ``[batch_size, output_dim]`` with values in (0, 1).
        """
        # x shape: [batch_size, sequence_length, input_dim]
        x = self.encoder(x)  # Pass the entire sequence through the encoder

        # Select the output from the last time step
        x = x[:, -1, :]  # Shape: [batch_size, input_dim]

        # Apply the fully connected layer and sigmoid activation
        x = self.fc(x)  # Shape: [batch_size, output_dim]
        return self.sigmoid(x)


In [None]:
def get_model(model_type, **kwargs):
    """Instantiate the model class selected by ``model_type``.

    NOTE: a later cell defines another ``get_model`` with a different
    signature; whichever of the two cells ran last is the one in effect.

    Args:
        model_type (str): ``'lstm'`` or ``'transformer'``.
        **kwargs: Forwarded verbatim to the selected model's constructor.

    Returns:
        ``LSTMEncoderDecoder`` for ``'lstm'``, ``TransformerModel`` for
        ``'transformer'``.

    Raises:
        ValueError: If ``model_type`` is neither of the two supported names.
    """
    if model_type == 'lstm':
        return LSTMEncoderDecoder(**kwargs)
    if model_type == 'transformer':
        return TransformerModel(**kwargs)
    # Neither supported name matched.
    raise ValueError("Unknown model type: Choose either 'lstm' or 'transformer'")

In [None]:
def get_model(model_type, encoder_params, decoder_params, device):
    """Build the model selected by ``model_type``.

    Args:
        model_type (str): ``'lstm'`` or ``'transformer'``.
        encoder_params (dict): Keyword arguments for ``LSTMEncoderDecoder``
            (``'lstm'``) or for ``TransformerEncoder`` (``'transformer'``).
        decoder_params (dict): Keyword arguments for ``TransformerDecoder``;
            not used for ``'lstm'``.
        device: Passed to ``Seq2SeqTransformer``; not used for ``'lstm'``.

    Returns:
        ``LSTMEncoderDecoder`` for ``'lstm'``, ``Seq2SeqTransformer`` wrapping
        a freshly built encoder and decoder for ``'transformer'``.

    Raises:
        ValueError: If ``model_type`` is neither of the two supported names.
    """
    if model_type == 'lstm':
        # The LSTM architecture is configured by the encoder parameters alone.
        return LSTMEncoderDecoder(**encoder_params)
    if model_type == 'transformer':
        # Encoder is constructed before the decoder, then both are wrapped.
        return Seq2SeqTransformer(
            TransformerEncoder(**encoder_params),
            TransformerDecoder(**decoder_params),
            device,
        )
    raise ValueError("Unknown model type: Choose either 'lstm' or 'transformer'")

In [None]:
class TransformerEncoder(nn.Module):
    """Linear input projection followed by a stack of transformer encoder layers.

    Args:
        input_dim (int): Number of continuous features per time step.
        emb_dim (int): Projection size; used as the transformer ``d_model``.
        num_heads (int): Number of attention heads.
        hidden_dim (int): Width of each layer's feed-forward sub-network.
        num_layers (int): Number of stacked encoder layers.
        dropout (float): Dropout probability inside each encoder layer.
    """

    def __init__(self, input_dim, emb_dim, num_heads, hidden_dim, num_layers, dropout):
        super().__init__()
        # Continuous features are projected linearly rather than looked up
        # in an embedding table.
        self.linear_input = nn.Linear(input_dim, emb_dim)
        layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)

    def forward(self, src):
        """Encode ``src``.

        The projected input has its first two axes swapped before it reaches
        the encoder stack, so batch-first input ``[batch_size, seq_len,
        input_dim]`` yields ``[seq_len, batch_size, emb_dim]``.
        """
        seq_first = self.linear_input(src).permute(1, 0, 2)
        return self.transformer_encoder(seq_first)


class TransformerDecoder(nn.Module):
    """Token embedding + learned positional term, decoder stack, linear head.

    Args:
        output_dim (int): Number of target token ids; also the size of the
            prediction produced for each target position.
        emb_dim (int): Embedding size; used as the transformer ``d_model``.
        num_heads (int): Number of attention heads.
        hidden_dim (int): Width of each layer's feed-forward sub-network.
        num_layers (int): Number of stacked decoder layers.
        max_seq_len (int): Number of positions held by the positional
            parameter.
    """

    def __init__(self, output_dim, emb_dim, num_heads, hidden_dim, num_layers, max_seq_len=100):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # Trainable positional term, initialised to zeros.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, emb_dim))
        layer = nn.TransformerDecoderLayer(
            d_model=emb_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
        )
        self.transformer_decoder = nn.TransformerDecoder(layer, num_layers=num_layers)
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def forward(self, tgt, memory):
        """Decode ``tgt`` against ``memory``.

        Args:
            tgt (torch.Tensor): Integer token ids, ``[batch_size, seq_len]``.
            memory (torch.Tensor): Encoder output handed to the decoder stack.

        Returns:
            torch.Tensor: ``[batch_size, seq_len, output_dim]``.
        """
        token_emb = self.embedding(tgt)
        positions = self.positional_encoding[:, :tgt.size(1), :].to(tgt.device)
        # The decoder stack works on [seq_len, batch_size, emb_dim].
        seq_first = (token_emb + positions).permute(1, 0, 2)
        decoded = self.transformer_decoder(seq_first, memory)
        # Back to [batch_size, seq_len, emb_dim] before the output projection.
        return self.fc_out(decoded.permute(1, 0, 2))

class Seq2SeqTransformer(nn.Module):
    """Wrap an encoder and a decoder into a single module.

    Args:
        encoder (nn.Module): Called as ``encoder(src)``.
        decoder (nn.Module): Called as ``decoder(tgt, memory)``.
        device: Stored on the instance as ``self.device``; it is no longer
            used to place tensors in ``forward``.
    """

    def __init__(self, encoder, decoder, device):
        super(Seq2SeqTransformer, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg_len):
        """Encode ``src`` and decode ``trg_len`` positions from an all-zero target.

        Args:
            src (torch.Tensor): Encoder input; its first axis is used as the
                batch size of the target.
            trg_len (int): Number of target positions to decode.

        Returns:
            Whatever ``decoder(tgt, memory)`` returns.
        """
        memory = self.encoder(src)
        # Allocate the placeholder target where the input actually lives.
        # The device captured at construction time can be stale (e.g. after
        # the model was moved with .to()/.cuda()) and would then cause a
        # device mismatch inside the decoder.
        tgt = torch.zeros((src.size(0), trg_len), device=src.device, dtype=torch.long)
        output = self.decoder(tgt, memory)
        return output

In [None]:
# Stand-alone instantiation of the encoder-decoder transformer.
# NOTE(review): the original call left out the first positional argument
# (input_dim), which is a SyntaxError. 5 is taken from the 'input_dim' value
# in param_grid['architecture'] -- TODO confirm it equals the number of input
# features per time step.
encoder_tra = TransformerEncoder(5, 150, 5, 150, 1, dropout=0.1)
decoder_tra = TransformerDecoder(14, 150, 1, 150, 3)
tra_ed_model = Seq2SeqTransformer(encoder_tra, decoder_tra, device)

# Implementation

In [None]:
# Define parameter grid for grid search
param_grid = {
    'X': [10],  # Look-back window sizes
    'Y': [5],  # Prediction horizon
    'Z': [1],  # Price increase thresholds
    'model_type': ['transformer'],  # Model types
    'epochs': [40],
    'batch_size': 64,
    'learning_rate': 0.001,
    'architecture': [{'input_dim': 5, 'num_heads': 5, 'num_layers': 2, 'output_dim': 1}],
    # grid_search() indexes param_grid['encoder_params'] and
    # param_grid['decoder_params']; without these two entries it raises
    # KeyError. The keys are the constructor arguments of TransformerEncoder
    # and TransformerDecoder.
    # NOTE(review): values mirror the stand-alone instantiation in the
    # "Models" section, with input_dim copied from 'architecture' above --
    # TODO confirm they are the intended search space.
    'encoder_params': [{'input_dim': 5, 'emb_dim': 150, 'num_heads': 5,
                        'hidden_dim': 150, 'num_layers': 1, 'dropout': 0.1}],
    'decoder_params': [{'output_dim': 14, 'emb_dim': 150, 'num_heads': 1,
                        'hidden_dim': 150, 'num_layers': 3}],
}

# Run grid search
grid_search(final_df, param_grid)

KeyError: 'size'