# 20240215 
## SF
- adapting deepstarr architecture for US MPRA ATAC differentially accessible data.
- goal: train a simple CNN classifier to distiguish US-accessible (1) from Ctrl-accessible (0)
- Model would learn features of us-only open-chromatin sequences v. control-only open chromatin sequences.
- I may need to reformulate this goal as there are other chromatin elements that might be sensitive to US exposure, but may already be accessible.
    - As well, there may be many sequences that are only active in the control setting and retain accessibility during US exposure, but have repressed activity during ultrasound treatment

### Notes

- some issue with F1 score from torcheval.metrics package. May need to change.
- I have stolen the parameters from the deepstarr model. Some of these parameters might need to change.
- I have stolen some features from the legnet model, including the onecycler learning rate scheduler. In this case, the make learning rate will be the static learning rate used from deepSTARR. In application, the learning rate may be even smaller, but not larger than the learning rate parameter from the deepSTARR model.
- I like the parameter dictionary as a way to specify values. I have added many key:value pairs to accomodate information for the US learning task, such as the sequence size, the number of channels (in and out), the poolsize (hard-coded in deepstarr), and the weight decay, which is a regularization parameter for the one-cycler learning rate scheduler used in legnet. 


https://github.com/bernardo-de-almeida/DeepSTARR/blob/main/DeepSTARR/DeepSTARR_training.ipynb

In [3]:
#from Bio.SeqIO.FastaIO import SimpleFastaParser
from collections import OrderedDict
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.utils.data import  Dataset
#from torcheval.metrics import classification#binary_f1_score

import random
import numpy as np
import pandas as pd
import torch.nn.functional as F

In [6]:
import sys
sys.path.append("/wynton/home/ahituv/fongsl/micromamba/envs/torch/lib/python3.11/site-packages/torcheval/metrics/classification")

In [7]:
import f1_score

In [8]:

device = (
    "cuda"
    if torch.cuda.is_available()
#    else "mps"
 #   if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


# neural network

In [37]:
params = {'batch_size': 128,
          'epochs': 100,
          'early_stop': 10,
          'kernel_size1': 7, # originally 7
          'kernel_size2': 3, # 3
          'kernel_size3': 5, # 5
          'kernel_size4': 3, # 3
          'lr': 0.002,
          'num_filters1': 256, #SF changed from num_filters to num_filters1
          'num_filters2': 60,
          'num_filters3': 60,
          'num_filters4': 120,
          'n_conv_layer': 4,
          'n_add_layer': 2,
          'dropout_prob': 0.4,
          'dense_neurons1': 256,
          'dense_neurons2': 256,
          'pad':'same', 
          'seq_size':271,  # SF added
          'n_channels':4,  # SF added
          'out_ch':1,  # SF added
          'pool_size':2, # SF added
          "weight_decay":0.01 # SFadded
         }

## DeepSTARR in Pytorch

In [38]:
def print_sizes(model, input_tensor):
    output = input_tensor
    for m in model.children():
        output = m(output)
        print(m, output.shape)
    return output


In [42]:
"""
SF converted to pytorch from keras, tensorflow deepstarr model:
https://github.com/bernardo-de-almeida/DeepSTARR/blob/main/DeepSTARR/DeepSTARR_training.ipynb

def DeepSTARR(params=params):
    
    lr = params['lr']
    dropout_prob = params['dropout_prob']
    n_conv_layer = params['n_conv_layer']
    n_add_layer = params['n_add_layer']
    
    # body
    input = kl.Input(shape=(249, 4))
    x = kl.Conv1D(params['num_filters'], kernel_size=params['kernel_size1'],
                  padding=params['pad'],
                  name='Conv1D_1st')(input)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D(2)(x)

    for i in range(1, n_conv_layer):
        x = kl.Conv1D(params['num_filters'+str(i+1)],
                      kernel_size=params['kernel_size'+str(i+1)],
                      padding=params['pad'],
                      name=str('Conv1D_'+str(i+1)))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = MaxPooling1D(2)(x)
    
    x = Flatten()(x)
    
    # dense layers
    for i in range(0, n_add_layer):
        x = kl.Dense(params['dense_neurons'+str(i+1)],
                     name=str('Dense_'+str(i+1)))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = Dropout(dropout_prob)(x)
    bottleneck = x
    
    # heads per task (developmental and housekeeping enhancer activities)
    tasks = ['Dev', 'Hk']
    outputs = []
    for task in tasks:
        outputs.append(kl.Dense(1, activation='linear', name=str('Dense_' + task))(bottleneck))

    model = keras.models.Model([input], outputs)
    model.compile(keras.optimizers.Adam(lr=lr),
                  loss=['mse', 'mse'], # loss
                  loss_weights=[1, 1], # loss weigths to balance
                  metrics=[Spearman]) # additional track metric

    return model, params
"""

class NeuralNetwork(nn.Module):
    def __init__(self,
                 param_dict):  # added param dict
        super().__init__()

        # SF added
        self.param_dict = param_dict
        self.in_ch = self.param_dict["seq_size"]*self.param_dict["n_channels"]
        self.in_ch = self.param_dict["n_channels"]
        self.out_ch = self.param_dict["out_ch"]
        

        # first layer
        self.first_hidden = nn.Sequential(
            # in channels, out channels, kernel size
            nn.Conv1d(
                self.in_ch, self.param_dict["num_filters1"], self.param_dict["kernel_size1"]),
            nn.BatchNorm1d(self.param_dict["num_filters1"]),
            nn.ReLU(),
            nn.MaxPool1d(self.param_dict["pool_size"])

        )

        # other layers
        blocks = []

        for i in range(1, self.param_dict["n_conv_layer"]):
            stack = nn.Sequential(
                # in channels, out channels, kernel size
                nn.Conv1d(
                    self.param_dict[f"num_filters{str(i)}"],
                    self.param_dict[f"num_filters{str(i+1)}"],
                    self.param_dict[f"kernel_size{str(i+1)}"]),
                nn.BatchNorm1d(self.param_dict[f"num_filters{str(i+1)}"]),
                nn.ReLU(),
                nn.MaxPool1d(self.param_dict["pool_size"])

            )
            blocks.append(stack)
        self.hidden = nn.Sequential(*blocks)

        fcs = []
        # fully connected layers

        self.output = nn.Sequential(
                        nn.Flatten(),
                        nn.Linear(self.param_dict[f"num_filters{str(i+1)}"],
                                   self.param_dict['dense_neurons1']),
                        nn.BatchNorm1d(self.param_dict['dense_neurons1']),
                        nn.ReLU(),
                        nn.Linear(self.param_dict['dense_neurons1'],
                                   self.param_dict['dense_neurons2']),
                        nn.BatchNorm1d(self.param_dict['dense_neurons2']),
                        nn.ReLU(),
                        
                        nn.Linear(self.param_dict['out_ch'],
                             self.param_dict['out_ch'])

        )

    def forward(self, x):
        x=self.first_hidden(x)
        x=self.hidden(x)

        x=self.output(x)
        return x

model = NeuralNetwork(params).to(device)
print(model)

NeuralNetwork(
  (first_hidden): Sequential(
    (0): Conv1d(4, 256, kernel_size=(7,), stride=(1,))
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (hidden): Sequential(
    (0): Sequential(
      (0): Conv1d(256, 60, kernel_size=(3,), stride=(1,))
      (1): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv1d(60, 60, kernel_size=(5,), stride=(1,))
      (1): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (0): Conv1d(60, 120, kernel_size=(3,), stride=(1,))
      (1): BatchNorm1d(120, eps=1e-05, momen

In [43]:
for i in model.parameters():
    print(i.shape)

torch.Size([256, 4, 7])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([60, 256, 3])
torch.Size([60])
torch.Size([60])
torch.Size([60])
torch.Size([60, 60, 5])
torch.Size([60])
torch.Size([60])
torch.Size([60])
torch.Size([120, 60, 3])
torch.Size([120])
torch.Size([120])
torch.Size([120])
torch.Size([256, 120])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256, 256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([1, 1])
torch.Size([1])


# load data, make predictions

In [11]:
DATA="/wynton/group/ahituv/data/US-MPRA/ATAC-seq/hepg2.training.nojoint.class.tsv"
OUT_CH = 1
val_fold=3
test_fold=5

# datamodule

In [12]:
# from datamodule
# open the dataframe
df = pd.read_csv(DATA,
         sep='\t', nrows=10)

# rename columns - in parts
seq_cols= ['seq_id', 'seq']
mean_value_cols = [f"mean_value{i+1}" for i in np.arange(0,OUT_CH)]
fold_rev_cols =  ['fold_num', 'rev']
seq_cols+= mean_value_cols+fold_rev_cols
df.columns = seq_cols[0:len(df.columns)]  # SF updated with second column


if "rev" in df.columns:
    df = df[df.rev == 0]
    
train = df[~df["fold_num"].isin([val_fold, test_fold])]
valid = df[df["fold_num"] == val_fold]
test = df[df["fold_num"] == test_fold]

# one hot

## read fasta

In [13]:
def makeDfFromFasta(fa_file):
    """SF: make pd dataframe from fasta file, 
        sequence is all uppercase
        requires biopython
    
    """
    # parse fasta file
    fa_dict = OrderedDict()  # dict to store names, and sequences
    
    with open(fa_file, "r") as reader:
        for value in SimpleFastaParser(reader):
            fa_dict[value[0]] = value[1].upper() # save sequences as upper case 
    
    # make dataframe from fasta
    fa_df = pd.concat(fa_dict.values())
    
    fa_df.columns = ["seq_id", "seq"]  # rename columns
    
    return fa_df
    

## one hot functions

In [14]:
# Taken from https://github.com/const-ae/Neural_Network_DNA_Demo/blob/master/helper/SequenceHelper.py
def to_categorical(y, nb_classes=None):
    '''Convert class vector (integers from 0 to nb_classes)
    to binary class matrix, for use with categorical_crossentropy
    '''
    y = np.asarray(y, dtype='int32')
    if not nb_classes:
        nb_classes = np.max(y) + 1
    Y = np.zeros((len(y), nb_classes))
    for i in range(len(y)):
        if y[i] != -1:
            Y[i, y[i]] = 1.
    return Y


def parse_alpha_to_seq(sequence):
    """replace nucleotides with values"""
    
    output = np.arange(len(sequence))
    for i in range(0, len(sequence)):
        snippet = sequence[i].upper()
        if snippet == 'A':
            output[i] = 0
        elif snippet == 'C':
            output[i] = 1
        elif snippet == 'T':
            output[i] = 2
        elif snippet == 'G':
            output[i] = 3
        elif snippet == 'N':
            output[i] = -1
        else:
            raise AssertionError("Cannot handle snippet: " + snippet)
    return output


def do_one_hot_encoding(sequence, seq_length, f=parse_alpha_to_seq):
    
    # make an empty matrix of zeros
    X = np.zeros((sequence.shape[0], seq_length, 4))

    # one-hot encode each sequence
    for idx in range(0, len(sequence)):
        p = parse_alpha_to_seq(sequence.iloc[idx])
        
        # make matrix into long form, where each row is a nucleotide position, each cell is the identity. 
        X[idx]= to_categorical(p, 4)
    return X

## reading fastas, preparing datas

In [15]:
# function to load sequences and enhancer activity
def prepare_input(df=None, fa_file=None, label_file=None, label_df=None):
    
    """Convert sequences to one-hot encoding matrix, 
        return sequences, matrix of sequences, and torch tensors x, y
    """
    
    #fa = str("Sequences_" + set + ".fa")
    
    if fa_file:
        df = makeDfFromFasta(fa_file)

    # get length of first sequence
    seq_len = len(df["seq"].iloc[0])

    # Convert sequence to one hot encoding matrix
    seq_matrix_A = do_one_hot_encoding(df["seq"], seq_len,f = parse_alpha_to_seq)
    # print(seq_matrix_A.shape)
    
    X = np.nan_to_num(seq_matrix_A) # Replace NaN with zero and infinity with large finite numbers
    X_reshaped = X.reshape((X.shape[0], X.shape[1], X.shape[2]))

    #
    if label_file:
        label_df = pd.read_csv(label_file, sep='\t')
    Y = label_df[label_df.columns[1:]]  # first column will be the sequence id

    print('df.seq.shape', df["seq"].shape, "seq.matrix.shape", seq_matrix_A.shape, 
          "X_reshaped", X_reshaped.shape, "Y.shape", Y.shape)
    
    return df["seq"], seq_matrix_A, X_reshaped, Y


# Prepping actual data

In [16]:
# from datamodule
# open the dataframe
df = pd.read_csv(DATA,
                 sep='\t', nrows=100)

# rename columns - in parts
seq_cols = ['seq_id', 'seq']
mean_value_cols = [f"mean_value{i+1}" for i in np.arange(0, OUT_CH)]
fold_rev_cols = ['fold_num', 'rev']
seq_cols += mean_value_cols+fold_rev_cols
df.columns = seq_cols[0:len(df.columns)]  # SF updated with second column


# get train, test, split
train = df[~df["fold_num"].isin([val_fold, test_fold])]
valid = df[df["fold_num"] == val_fold]
test = df[df["fold_num"] == test_fold]


label_df = train[["seq_id", "mean_value1"]]
X_train_sequence, X_train_seq_matrix, X_train, Y_train = prepare_input(train, None, None, label_df)
label_df_v = valid[["seq_id", "mean_value1"]]
X_valid_sequence, X_valid_seq_matrix, X_valid, Y_valid  = prepare_input(valid, None, None, label_df_v)
label_df_t = valid[["seq_id", "mean_value1"]]
X_test_sequence, X_test_seq_matrix, X_test, Y_test = prepare_input(test, None, None, label_df_t)


df.seq.shape (80,) seq.matrix.shape (80, 271, 4) X_reshaped (80, 271, 4) Y.shape (80, 1)
df.seq.shape (12,) seq.matrix.shape (12, 271, 4) X_reshaped (12, 271, 4) Y.shape (12, 1)
df.seq.shape (8,) seq.matrix.shape (8, 271, 4) X_reshaped (8, 271, 4) Y.shape (12, 1)


# loss function, optimizer, etc.

In [17]:
loss_fn = nn.CrossEntropyLoss()  # changed from MSE
f1_fn = f1_score.BinaryF1Score()

optimizer = torch.optim.AdamW(model.parameters(), 
                              lr=params['lr'], #  / 25,
                              weight_decay=params['weight_decay']
                             )

lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, # type: ignore
                                                max_lr=params['lr'],
                                                three_phase=False, 
                                                total_steps=10, #self.trainer.estimated_stepping_batches, # type: ignore
                                                pct_start=0.3,
                                                cycle_momentum=False)


# test

In [40]:
n_epochs = 10
batch_size = 10
X = torch.from_numpy(X_train).float()
y_pred = Y_train
for epoch in range(n_epochs):
    for i in range(0, len(X), batch_size):
        Xbatch = X[i:i+batch_size].transpose(1,2).contiguous() # transpose to [batchsize, n channels(4 nucleotides), seq_len (271)]
        y_pred = model(Xbatch)
        ybatch = y[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x1680 and 256x256)

In [44]:
print_sizes(model, Xbatch)

Sequential(
  (0): Conv1d(4, 256, kernel_size=(7,), stride=(1,))
  (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
) torch.Size([10, 256, 132])
Sequential(
  (0): Sequential(
    (0): Conv1d(256, 60, kernel_size=(3,), stride=(1,))
    (1): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (1): Sequential(
    (0): Conv1d(60, 60, kernel_size=(5,), stride=(1,))
    (1): BatchNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (2): Sequential(
    (0): Conv1d(60, 120, kernel_size=(3,), stride=(1,))
    (1): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): R

RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x1680 and 120x256)