<a href="https://colab.research.google.com/github/singhtejn/Stock_AI_ML/blob/main/GPT_model_stock_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch

In [15]:
import torch
from torch.utils.data import Dataset

class StockDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {'input_ids': self.features[idx], 'labels': self.labels[idx]}



In [12]:
from transformers import GPT2Tokenizer
import torch

# Data preprocessing for GPT-2
from transformers import GPT2Tokenizer

def preprocess_data(df, seq_length):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Add a padding token if not already present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    data = df['Close'].values  # Assuming 'Close' price is used for forecasting
    data_str = [str(x) for x in data]  # Convert numeric data to string for tokenization

    # Tokenize the data
    tokenized_data = tokenizer(data_str, padding='max_length', truncation=True, max_length=seq_length, return_tensors='pt')

    return tokenized_data

# Example usage
def fetch_data(tickers, start, end):
    dfs = []
    for ticker in tickers:
        df = yf.download(ticker, start=start, end=end)
        df['Ticker'] = ticker
        df.reset_index(inplace=True)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

start_date = '2019-08-01'
end_date = '2024-07-31'
tickers = ['AAPL', 'MSFT', 'GOOGL']
df = fetch_data(tickers, start=start_date, end=end_date)
df = df[['Date', 'Close']]
tokenized_data = preprocess_data(df, seq_length=10)


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [14]:
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments

# Example usage
def main():
    start_date = '2019-08-01'
    end_date = '2024-07-31'
    tickers = ['AAPL', 'MSFT', 'GOOGL']
    df = fetch_data(tickers, start=start_date, end=end_date)
    df = df[['Date', 'Close']]

    tokenized_data = preprocess_data(df, seq_length=10)

    # Convert tokenized data to dataset format
    dataset = StockDataset(tokenized_data)

    # Load GPT-2 model
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Define TrainingArguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    # Train the model
    trainer.train()

if __name__ == "__main__":
    main()


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [6]:
def generate_forecast(model, tokenizer, input_sequence):
    inputs = tokenizer(input_sequence, return_tensors='pt')
    outputs = model.generate(inputs['input_ids'], max_length=20, num_return_sequences=1)
    forecast = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return forecast

# Example usage
input_sequence = df['Close'].tail(10).values.tolist()
input_sequence_str = [str(x) for x in input_sequence]
forecast = generate_forecast(model, tokenizer, input_sequence_str)
print(forecast)


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [7]:
def main():
    # Fetch data and preprocess
    tickers = ['AAPL', 'MSFT', 'GOOGL']
    df = fetch_data(tickers, start=start_date, end=end_date)
    df = df[['Date', 'Close']]
    tokenized_data = preprocess_data(df, seq_length=10)

    # Train GPT-2 model
    model_name = 'gpt2'
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = torch.utils.data.TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'])
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=4,
        num_train_epochs=3,
        logging_dir='./logs',
        evaluation_strategy="steps",
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    # Generate forecasts
    input_sequence = df['Close'].tail(10).values.tolist()
    input_sequence_str = [str(x) for x in input_sequence]
    forecast = generate_forecast(model, tokenizer, input_sequence_str)
    print(f"Forecast: {forecast}")

if __name__ == "__main__":
    main()


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [23]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Fetch historical OHLCV data for given stock tickers
def fetch_data(tickers, start, end):
    import yfinance as yf
    dfs = []
    for ticker in tickers:
        df = yf.download(ticker, start=start, end=end)
        df['Ticker'] = ticker
        df.reset_index(inplace=True)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# Preprocess data and create labels
def preprocess_data(df, seq_length=10):
    df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']].copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

    # Normalize the data
    scaler = MinMaxScaler()
    df[['Open', 'High', 'Low', 'Close', 'Volume']] = scaler.fit_transform(df[['Open', 'High', 'Low', 'Close', 'Volume']])

    # Create labels based on price changes
    def create_labels(data, window=5):
        labels = []
        for i in range(len(data) - window):
            start_price = data['Close'].iloc[i]
            end_price = data['Close'].iloc[i + window]
            change = (end_price - start_price) / start_price
            if change >= 0.02:
                labels.append(1)  # Buy
            elif change <= -0.02:
                labels.append(2)  # Sell
            else:
                labels.append(0)  # Hold
        return labels

    labels = create_labels(df)

    # Prepare sequences
    sequences = []
    for i in range(len(df) - seq_length):
        sequence = df.iloc[i:i + seq_length].values
        sequences.append(sequence)

    # Ensure the length of features and labels are consistent
    if len(sequences) > len(labels):
        sequences = sequences[:len(labels)]
    elif len(sequences) < len(labels):
        labels = labels[:len(sequences)]

    return np.array(sequences), np.array(labels)

# Define custom dataset
class StockDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {'features': self.features[idx], 'labels': self.labels[idx]}

# Define model
class StockPredictor(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(StockPredictor, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.lstm = nn.LSTM(input_size=64, hidden_size=hidden_dim, batch_first=True)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=4)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # Change shape to (batch, channels, sequence_length) for CNN
        x = self.cnn(x)
        x = x.permute(0, 2, 1)  # Change shape back to (batch, sequence_length, channels) for LSTM
        x, _ = self.lstm(x)
        x, _ = self.attention(x, x, x)
        x = x[:, -1, :]  # Use the last hidden state for classification
        x = self.fc(x)
        return x

def main():
    # Define tickers and date range
    tickers = ['DRREDDY.NS', 'HINDALCO.NS', 'JSWSTEEL.NS']
    start_date = '2019-08-01'
    end_date = '2024-07-31'

    # Fetch and preprocess data
    df = fetch_data(tickers, start=start_date, end=end_date)
    features, labels = preprocess_data(df, seq_length=10)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Create datasets and dataloaders
    train_dataset = StockDataset(X_train, y_train)
    test_dataset = StockDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Initialize model, loss function, and optimizer
    input_dim = features.shape[2]  # Number of features
    hidden_dim = 128  # You can tune this
    output_dim = 3  # Number of classes (Buy, Sell, Hold)
    model = StockPredictor(input_dim, hidden_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    model.train()
    for epoch in range(10):  # Number of epochs
        for batch in train_loader:
            optimizer.zero_grad()
            features, labels = batch['features'], batch['labels']
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            features, labels = batch['features'], batch['labels']
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print(classification_report(true_labels, predictions))
    print(f"Accuracy: {accuracy_score(true_labels, predictions)}")

if __name__ == "__main__":
    main()


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
  change = (end_price - start_price) / start_price


              precision    recall  f1-score   support

           0       0.44      0.75      0.56       261
           1       0.43      0.50      0.46       258
           2       0.00      0.00      0.00       219

    accuracy                           0.44       738
   macro avg       0.29      0.42      0.34       738
weighted avg       0.31      0.44      0.36       738

Accuracy: 0.43902439024390244


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
