## **Cargamos el archivo con las secuencias genómicas en un DataFrame de pandas**

In [2]:
import pandas as pd

# Load the genomic sequences table from its parquet file into a DataFrame.
df = pd.read_parquet(path='drive/MyDrive/TFM/genomic_sequences.parquet')

In [3]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['gene_id', 'sequence', 'chromosome', 'pos_min', 'pos_max'], dtype='object')

## **Instalamos varias librerías para tener el versionado adecuado para DNABERT-S**

In [None]:
# Environment setup: install the packages this notebook relies on for DNABERT-S.
# NOTE(review): only transformers and triton are pinned; the remaining packages
# resolve to whatever version is current when the cell runs.
!pip install einops
# Remove the transformers version already present, then install the pinned 4.27 release.
!pip uninstall transformers -y
!pip install transformers==4.27
!pip install peft
!pip install omegaconf
!pip install evaluate
!pip install accelerate
!pip install textaugment

# triton is pinned to a specific development build.
!pip install triton==2.0.0.dev20221202

## **Para cargar el modelo de DNABERT-S**

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# Hub repository that both the tokenizer and the model are loaded from.
checkpoint = "zhihan1996/DNABERT-S"

# trust_remote_code=True allows the custom Python modules shipped with the
# repository to be downloaded and executed.
# NOTE(review): no `revision` is pinned, so a newer upstream revision of that
# code would be picked up on the next run.
load_kwargs = {"trust_remote_code": True}

tokenizer = AutoTokenizer.from_pretrained(checkpoint, **load_kwargs)
model = AutoModel.from_pretrained(checkpoint, **load_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/268 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


configuration_bert.py:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


bert_layers.py:   0%|          | 0.00/40.8k [00:00<?, ?B/s]

bert_padding.py:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

flash_attn_triton.py:   0%|          | 0.00/42.7k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

In [None]:
# Report whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

In [None]:
# Smoke test: embed one short DNA sequence and check the embedding size.
dna = "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC"
inputs = tokenizer(dna, return_tensors = 'pt')["input_ids"]

# Use the first GPU when one is available, otherwise stay on the CPU.
# `device` is bound on both paths so that the inputs and the model always end
# up on the same device.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Running on CPU instead.")

inputs = inputs.to(device)
model = model.to(device)

# Inference only: disabling autograd avoids building and retaining the
# computation graph, which is never used here.
with torch.no_grad():
    # First model output: per-token hidden states for the single input sequence.
    hidden_states = model(inputs)[0]

    # Mean-pool over the token axis to get one vector for the whole sequence.
    embedding_mean = torch.mean(hidden_states[0], dim=0)

# The result lives on `device`; call embedding_mean.cpu() if a CPU copy is needed.
print(embedding_mean.shape)  # expected to be 768-dimensional

In [None]:
# Display the mean-pooled embedding computed for the example sequence.
embedding_mean

tensor([-2.0554e-01,  4.0855e-01,  4.4909e-01, -1.4615e-01, -2.7086e-02,
         7.5123e-02,  5.3581e-02, -1.3794e-01,  3.2083e-02, -5.8664e-03,
        -6.7087e-02, -1.4298e-02, -9.1565e-02, -1.0859e-01, -7.9353e-02,
        -9.8941e-02, -1.3011e-01,  9.4138e-02, -1.8121e-01, -6.2716e-02,
         5.5823e-02,  1.9506e-01,  2.6716e-01,  1.0631e-02, -2.4274e-01,
        -4.9960e-02, -1.5599e-01, -4.3167e-01,  1.2520e-02,  1.7133e-01,
        -2.7432e-01,  2.1653e-01,  7.7834e-03, -1.3915e-01,  6.2556e-02,
        -1.4707e-01, -2.1774e-01, -1.0778e-01,  1.0720e-01, -1.0582e-01,
        -1.1520e-01, -1.3581e-01, -4.2992e-02,  4.0663e-02, -1.2506e-01,
         2.1287e-01, -1.8168e-01,  1.7185e-02,  2.8777e-01,  7.2997e-02,
         3.1417e-01,  2.3020e-02, -2.5531e-01, -8.8146e-02, -9.8629e-02,
        -1.1756e-01, -1.8990e-01, -4.8138e-02, -1.4833e-01, -1.3213e-01,
         1.2290e-01, -1.9211e-01,  1.5523e-01,  4.1268e-01,  2.1663e-01,
         1.0780e-01,  1.2220e-01, -1.0195e-01, -5.0

In [None]:
# Select the device used for embedding extraction and move the model onto it.
# Falls back to the CPU (with a notice) instead of failing when the runtime
# has no GPU attached.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Running on CPU instead.")
model = model.to(device)

In [None]:
# Garbage collector: used to release memory between iterations so that the
# session does not run out of memory and terminate.
import gc

# NOTE: a skip-if-already-exported guard (pathlib.Path(...).exists() on
# 'drive/MyDrive/TFM/Code/embeddings/DNABERT-S/<gene_id>.pt') was sketched for
# this loop but is currently disabled.


def _group_chunks(chunks, group_size=10):
    """Split a sequence of chunks into consecutive groups of `group_size`.

    Groups are taken in order. When more than `group_size` but fewer than
    `2 * group_size` chunks remain, that tail is split into two halves instead
    of one full group plus a very small one, so that a tiny trailing group does
    not get the same weight as a full group when the per-group embeddings are
    averaged later. Examples with group_size=10: 12 chunks -> groups of 6 and 6;
    21 chunks -> groups of 10, 5 and 6.

    Args:
        chunks: sliceable sequence (e.g. the tuple returned by torch.chunk).
        group_size: nominal number of chunks per group.

    Returns:
        List of slices of `chunks`, in the original order.
    """
    groups = []
    for start in range(0, len(chunks), group_size):
        remaining = len(chunks) - start
        if group_size < remaining < 2 * group_size:
            half = remaining // 2
            groups.append(chunks[start:start + half])
            groups.append(chunks[start + half:])
            break
        groups.append(chunks[start:start + group_size])
    return groups


# Number of genes available for embedding extraction.
longitud = df.shape[0]

# Iterate over the selected genes, computing and storing their embeddings.
for i in range(7017,7020):

    row = df.iloc[i]
    key = row['gene_id']      # identifier of the i-th gene
    value = row['sequence']   # sequence of the i-th gene

    # Tokenize the sequence so it can be fed to the model.
    inputs = tokenizer(value, return_tensors = 'pt')["input_ids"]
    inputs = inputs.to(device)
    cont = inputs.shape[1]  # number of tokens

    # Ceil-divide by 512 (the maximum length the model is fed per call, per the
    # original author) and split the token ids along the sequence axis.
    num_chunks = (cont + 511) // 512
    print(i, key, num_chunks)
    total_chunks = torch.chunk(inputs, num_chunks, dim=1)

    # One output file is written per group of chunks; genes that end up with
    # several files still have to be averaged once all files exist.
    grouped_chunks = _group_chunks(total_chunks, 10)

    # Dropping the reference lets the tuple be reclaimed.
    del total_chunks
    gc.collect()

    for j, chunks in enumerate(grouped_chunks):
        # Inference only: without no_grad every forward pass would keep its
        # autograd graph (and activations) alive, inflating memory usage.
        with torch.no_grad():
            # model(chunk)[0] is [1, sequence_length, 768]; mean-pool over the
            # token axis right away so only one vector per chunk is retained.
            chunk_means = [torch.mean(model(chunk)[0][0], dim=0) for chunk in chunks]

            # Group embedding = mean of the per-chunk mean embeddings.
            embedding_mean = torch.mean(torch.stack(chunk_means), dim=0)

        # Store the group embedding as <gene_id>_<group index>.pt
        torch.save(embedding_mean, 'drive/MyDrive/TFM/Code/embeddings/temp/'+str(key)+'_'+str(j)+'.pt')
        print(i, key, j, 'stored')

        # Drop references so the tensors can be freed before the next group.
        del chunk_means
        del embedding_mean

        gc.collect()

7017 10393 40
7017 10393 0 stored
7017 10393 1 stored
7017 10393 2 stored
7017 10393 3 stored
7018 10394 2
7018 10394 0 stored
7019 10395 196
7019 10395 0 stored
7019 10395 1 stored
7019 10395 2 stored
7019 10395 3 stored
7019 10395 4 stored
7019 10395 5 stored
7019 10395 6 stored
7019 10395 7 stored
7019 10395 8 stored
7019 10395 9 stored
7019 10395 10 stored
7019 10395 11 stored
7019 10395 12 stored
7019 10395 13 stored
7019 10395 14 stored
7019 10395 15 stored
7019 10395 16 stored
7019 10395 17 stored
7019 10395 18 stored
7019 10395 19 stored


In [None]:
from pathlib import Path
# Spot-check that one embedding file exists; the cell evaluates to True/False.
# NOTE(review): the export loop writes to embeddings/temp/ while this path is
# under embeddings/DNABERT-S/ — presumably files are moved there afterwards; confirm.
str_path = 'drive/MyDrive/TFM/Code/embeddings/DNABERT-S/10393_3.pt'
file_path = Path(str_path)
file_path.exists()

In [None]:
# Load the stored tensor onto the CPU, whatever device it was saved from.
# torch.load unpickles the file, so only load files you produced yourself.
tensor = torch.load(str_path, map_location='cpu')

In [None]:
# Display the tensor loaded from the embedding file.
tensor

tensor([-1.2319e-01,  1.2367e-01,  6.5402e-02, -1.0106e-01,  6.1205e-02,
        -2.0445e-01, -8.3707e-02,  1.4638e-01,  6.0772e-02,  4.8595e-02,
        -1.3998e-01, -7.2560e-02, -3.0277e-02, -2.8970e-02, -2.0980e-02,
         5.7754e-02, -7.9941e-02,  5.8035e-02, -1.1116e-01, -1.3570e-01,
         4.7770e-02, -8.6519e-02,  5.1719e-02, -1.3287e-01,  8.2450e-02,
        -3.5475e-03,  3.4144e-01,  4.6602e-02, -2.0118e-01, -9.2102e-03,
         6.7845e-02, -4.1314e-02, -1.5088e-01,  1.4819e-01, -4.5138e-02,
        -2.8631e-01, -3.9554e-02, -8.0464e-02,  1.2799e-01,  8.0614e-02,
        -1.5400e-01,  1.4917e-01,  4.6156e-02,  1.4938e-02,  7.7285e-02,
         1.2317e-01,  4.8708e-02,  7.3432e-02,  2.3081e-01,  1.6440e-01,
         1.6313e-02,  5.0843e-03,  4.5999e-02, -5.5917e-02,  8.0311e-02,
         1.5288e-01,  7.5230e-02, -1.4070e-01, -5.3667e-02,  7.4343e-02,
        -1.1786e-01, -5.4768e-02, -2.7766e-01, -4.1068e-01,  5.1069e-02,
         1.3759e-01,  1.5371e-02, -7.2793e-02, -1.3

In [None]:
longitud = df.shape[0]

# From row 7000 onwards, report every gene that already has a
# '<gene_id>.pt' file in the DNABERT-S-mean folder.
for j in range(7000, longitud):
    key = df['gene_id'].iat[j]
    str_path = f'drive/MyDrive/TFM/Code/embeddings/DNABERT-S-mean/{key}.pt'
    file_path = Path(str_path)
    if not file_path.exists():
        continue
    print(j, key, 'stored')

7000 10368 stored
7001 10369 stored
7002 10370 stored
7003 10371 stored
7004 10376 stored
7005 10379 stored
7006 10380 stored
7007 10381 stored
7008 10382 stored
7009 10383 stored
7010 10384 stored
7011 10385 stored
7012 10388 stored
7013 10389 stored
7014 10390 stored
7015 10391 stored
7016 10392 stored
