# Installation

In [1]:
!python -m pip install --upgrade pip

[0m

In [2]:
!pip install einops pandas pytest tqdm scipy pyarrow

[0m

## Flash-FFT-Conv

It's recommended to restart the kernel after installing FlashFFTConv to ensure a successful installation.

In [3]:
!git clone https://github.com/HazyResearch/flash-fft-conv.git
%cd flash-fft-conv
%cd csrc/flashfftconv
!python setup.py install
%cd ../..
!python setup.py install
%cd ../

fatal: destination path 'flash-fft-conv' already exists and is not an empty directory.
/workspace/l2/lyra/examples/flash-fft-conv
/workspace/l2/lyra/examples/flash-fft-conv/csrc/flashfftconv


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Found arch: sm_90 from existing torch installation
running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
running bdist_egg
running egg_info
writing monarc

Sanity Check for FlashFFTConv

## Restart the kernel to ensure successful FlashFFTConv installation

In [4]:
# Optional sanity check: uncomment to run FlashFFTConv's pytest suites after installing.
# NOTE(review): the tests/ paths are relative - presumably the working directory
# must be the flash-fft-conv checkout for them to resolve; verify before enabling.
# !pytest -s -q tests/test_conv1d.py
# !pytest -s -q tests/test_flashfftconv.py

# Imports

In [5]:
import numpy as np
import pandas as pd
import json
import math
import os
import random
import string
import sys
from datetime import datetime
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from flashfftconv import FlashDepthWiseConv1d
from einops import rearrange, repeat
from tqdm.auto import tqdm
from scipy.stats import spearmanr


# Make the directory two levels above the current working directory importable,
# so that the `from lyra import ...` statements below can be resolved.
current_dir = os.path.abspath('')
parent_dir = os.path.join(current_dir, '..', '..')
parent_dir_normalized = os.path.normpath(parent_dir)
# Guard the append: re-running this cell must not pile up duplicate sys.path entries.
if parent_dir_normalized not in sys.path:
    sys.path.append(parent_dir_normalized)

from lyra import utils
from lyra.utils import data_encoders
from lyra.utils.data_encoders import one_hot_encode_protein, ProteinDataset
from lyra.nn import lyra_base
from lyra.nn.lyra_base import Lyra, lyra_example_proteins_config



# Alias for the dropout layer class; not referenced again in the cells visible here.
dropout_fn = nn.Dropout1d
# Enable cuDNN's benchmark mode (auto-tuning of convolution algorithms).
torch.backends.cudnn.benchmark = True
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
def preprocess_data(df):
    """Turn a two-column frame into model inputs and targets.

    The first column is handed to ``one_hot_encode_protein``; the second
    column is converted to a ``torch.float32`` tensor.

    Returns:
        ``(encoded_sequences, labels)``.
    """
    # Columns are addressed by position, so their names do not matter.
    columns = [df.iloc[:, idx].values for idx in range(2)]
    encoded = one_hot_encode_protein(columns[0])
    targets = torch.tensor(columns[1], dtype=torch.float32)
    return encoded, targets

In [7]:
# Mini-batch size for the data loaders (same value as create_dataloaders' default).
batch_size = 512

def load_data(file_path, has_header=False):
    """Read a CSV file into a DataFrame.

    When ``has_header`` is true the first row supplies the column names.
    Otherwise every row is treated as data and the columns are named
    ``sequence`` and ``label``.
    """
    read_options = {} if has_header else {'header': None, 'names': ['sequence', 'label']}
    return pd.read_csv(file_path, **read_options)

def create_dataloaders(train_file, test_file, has_header=False, batch_size=512, num_workers=8):
    """Build the train and test DataLoaders for one task.

    Args:
        train_file: path of the training CSV.
        test_file: path of the test CSV.
        has_header: forwarded to ``load_data``; True when the CSVs have a header row.
        batch_size: number of samples per batch for both loaders.
        num_workers: worker processes per loader. Defaults to 8, the value
            that used to be hard-coded.

    Returns:
        ``(train_loader, test_loader)``. Only the training loader shuffles.
    """
    train_df = load_data(train_file, has_header)
    test_df = load_data(test_file, has_header)

    # Preprocess and one-hot encode the data
    train_encoded, train_labels = preprocess_data(train_df)
    test_encoded, test_labels = preprocess_data(test_df)

    # Create train and test datasets with preprocessed data
    train_dataset = ProteinDataset(train_encoded, train_labels)
    test_dataset = ProteinDataset(test_encoded, test_labels)

    # Pinned host memory only speeds up host-to-GPU copies, so request it only
    # when CUDA is present; this avoids a pointless pin_memory=True on CPU-only runs.
    shared_options = dict(
        batch_size=batch_size,
        pin_memory=torch.cuda.is_available(),
        num_workers=num_workers,
    )

    # Create Data Loaders
    train_loader = DataLoader(train_dataset, shuffle=True, **shared_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **shared_options)

    return train_loader, test_loader

# File paths: task name -> (training CSV, test CSV)
datasets = {
    "GFP": ("../datasets/relso_data/GFP_train_data.csv", "../datasets/relso_data/GFP_test_data.csv"),
    "GB1_WU": ("../datasets/relso_data/GB1_WU_train_data.csv", "../datasets/relso_data/GB1_WU_test_data.csv"),
    "Gifford": ("../datasets/relso_data/gifford_train_data.csv", "../datasets/relso_data/gifford_test_data.csv")
}

# Create a dictionary of data loaders: task name -> (train_loader, test_loader).
# Only the Gifford files are read with a header row. The notebook-level
# batch_size is forwarded explicitly so that editing it actually takes effect
# (previously it was defined but never passed on).
dataloaders = {}
for task, (train_file, test_file) in datasets.items():
    dataloaders[task] = create_dataloaders(
        train_file,
        test_file,
        has_header=(task == "Gifford"),
        batch_size=batch_size,
    )

In [8]:
# Pull a single batch from the GFP training loader (index 0 of the
# (train_loader, test_loader) pair) and report the tensor shapes.
train_features, train_labels = next(iter(dataloaders['GFP'][0]))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([512, 237, 20])
Labels batch shape: torch.Size([512])


# Model Instantiation

This model instantiation is provided as a sanity check; the training loop in the next section instantiates a new model for each task.

In [9]:
# Build a Lyra model from the example protein configuration and move it to `device`.
model = Lyra(**lyra_example_proteins_config).to(device)

In [10]:
# Show the module hierarchy
print(model)

# Tally the elements of every parameter that receives gradients
num_params = 0
for param in model.parameters():
    if param.requires_grad:
        num_params += param.numel()
print(f"Total trainable parameters: {num_params}")

Lyra(
  (encoder): Linear(in_features=20, out_features=64, bias=True)
  (pgc1): PGC(
    (in_proj): Linear(in_features=64, out_features=32, bias=True)
    (in_norm): RMSNorm()
    (conv): Conv1d(16, 16, kernel_size=(3,), stride=(1,), padding=(1,), groups=16)
    (flash_conv): FlashDepthWiseConv1d()
    (out_proj): Linear(in_features=16, out_features=64, bias=True)
    (out_norm): RMSNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (pgc2): PGC(
    (in_proj): Linear(in_features=64, out_features=256, bias=True)
    (in_norm): RMSNorm()
    (conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,), groups=128)
    (flash_conv): FlashDepthWiseConv1d()
    (out_proj): Linear(in_features=128, out_features=64, bias=True)
    (out_norm): RMSNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (s4d): S4D(
    (kernel): S4DKernel()
    (activation): GELU(approximate='none')
    (dropout): DropoutNd()
    (flashfftconv): FlashFFTConv()
    (output_linear): Sequential(
   

# Training and Evaluation Loops

In [11]:
# Random 8-character alphanumeric tag for this run (not referenced again in this cell).
rand_ID =  ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8))

# Number of passes over the training set, shared by every task.
num_epochs = 500


def _batch_spearman(predictions, targets):
    """Return the Spearman correlation between one batch of predictions and targets.

    Args:
        predictions: model outputs for the batch (any device, may carry gradients).
        targets: labels for the same batch.

    Returns:
        The ``statistic`` field of ``scipy.stats.spearmanr``; NaN when SciPy
        cannot define a correlation for the batch.
    """
    # detach()/cpu() leave tensors that are already gradient-free / on the host unchanged,
    # so no explicit is_cuda branching is needed.
    return spearmanr(predictions.detach().cpu().numpy(), targets.detach().cpu().numpy()).statistic


def _mean_skipping_nan(values):
    """Average ``values`` while ignoring NaN entries.

    Returns NaN when no non-NaN entry is left (including an empty input), so a
    single undefined batch correlation no longer wipes out the epoch average.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[~np.isnan(arr)]
    return float(arr.mean()) if arr.size else float('nan')


for model_config in [lyra_example_proteins_config]:
    for task, (train_loader, test_loader) in dataloaders.items():
        print()
        print(model_config)
        print(f"Training on {task} dataset")
        # Start below every attainable correlation so that a run whose best score
        # is negative is reported as such instead of as 0.0000.
        max_spearman_corr = float('-inf')
        # A fresh model, loss and optimizer are created for every task.
        model = Lyra(**model_config).to(device)
        criterion = nn.MSELoss().to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001,weight_decay=0.01)
        pbar = tqdm(range(num_epochs), desc=f"Epoch 0: Train Corr: 0.0000, Val Corr: 0.0000, Loss: 0.0000")

        for epoch in pbar:
            # Lists for storing batch-wise Spearman correlation
            batch_train_spearman_corr = []
            batch_val_spearman_corr = []

            # Training phase
            model.train()
            for sequences, labels in train_loader:
                sequences = sequences.to(device)
                # Add a trailing dimension so labels line up with the model output.
                labels = labels.to(device).unsqueeze(1)

                # Forward pass
                outputs = model(sequences)
                loss = criterion(outputs, labels)

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Batch-wise Spearman correlation
                batch_train_spearman_corr.append(_batch_spearman(outputs, labels))

            # Average Spearman correlation over all training batches
            avg_train_spearman_corr = _mean_skipping_nan(batch_train_spearman_corr)

            # Validation phase
            # NOTE(review): the "validation" score is computed on test_loader and the best
            # epoch is picked on that same split, so the reported maximum is optimistic.
            model.eval()
            with torch.no_grad():
                for sequences, labels in test_loader:
                    sequences = sequences.to(device)
                    labels = labels.to(device).unsqueeze(1)

                    outputs = model(sequences)

                    # Batch-wise Spearman correlation
                    batch_val_spearman_corr.append(_batch_spearman(outputs, labels))

            # Average Spearman correlation over all validation batches
            avg_val_spearman_corr = _mean_skipping_nan(batch_val_spearman_corr)

            # Update maximum Spearman correlation (a NaN average never compares as >=)
            if avg_val_spearman_corr >= max_spearman_corr:
                max_spearman_corr = avg_val_spearman_corr
                # Save the model if desired

            # The loss shown is that of the last training batch of the epoch.
            pbar.set_description(f"Epoch {epoch}: Train Corr: {avg_train_spearman_corr:.4f}, Val Corr: {avg_val_spearman_corr:.4f}, Loss: {loss.item():.4f}")

        print(f'{task} Maximum Spearman Correlation: {max_spearman_corr:.4f}')
        


{'d_input': 20, 'd_output': 1, 'd_model': 64, 'dropout': 0.2}
Training on GFP dataset


Epoch 0: Train Corr: 0.0000, Val Corr: 0.0000, Loss: 0.0000:   0%|          | 0/500 [00:00<?, ?it/s]

GFP Maximum Spearman Correlation: 0.8592

{'d_input': 20, 'd_output': 1, 'd_model': 64, 'dropout': 0.2}
Training on GB1_WU dataset


Epoch 0: Train Corr: 0.0000, Val Corr: 0.0000, Loss: 0.0000:   0%|          | 0/500 [00:00<?, ?it/s]

GB1_WU Maximum Spearman Correlation: 0.6030

{'d_input': 20, 'd_output': 1, 'd_model': 64, 'dropout': 0.2}
Training on Gifford dataset


Epoch 0: Train Corr: 0.0000, Val Corr: 0.0000, Loss: 0.0000:   0%|          | 0/500 [00:00<?, ?it/s]

Gifford Maximum Spearman Correlation: 0.4823
