# APRs y Embeddings: Análisis de los datasets
---

# Drive, librerías, etc.

In [None]:
pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from matplotlib.colors import ListedColormap
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
import h5py
import pickle
from Bio import SeqIO
import gzip
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix,  roc_curve, auc, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

Importamos el Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Generamos el dataset de AMYPRO con la secuencia completa de aminoácidos y las secuencias peptídicas particulares de cada región APR.

Ruta inicial de las bases de CPAD y amypro

In [None]:
ruta_apr_amyprot = '/content/drive/MyDrive/aggregation_itba/data/En uso/amypro_selected.tsv'

In [None]:
ruta_apr_cpad = '/content/drive/MyDrive/aggregation_itba/data/En uso/cpad.csv'

In [None]:
# Load the AmyPro table (tab-separated) and the CPAD table (comma-separated) into pandas DataFrames.
# Disabled legacy load: df_desorden = pd.read_csv(ruta_desorden_amyprot, sep='\t')
df_amypro = pd.read_csv(ruta_apr_amyprot, sep='\t')
df_cpad = pd.read_csv(ruta_apr_cpad)

In [None]:
# Establecer el ancho máximo de columna
pd.set_option('display.max_colwidth', 15)  # Puedes ajustar el valor según tu preferencia

In [None]:
unique_uniprot_ids_amypro = df_amypro["uniprot_id"].unique()
print(unique_uniprot_ids_amypro)

['Q08972' 'P52912' 'P40070' 'Q9PWC8' 'P60852' 'P54785' 'P07884' 'Q967R6'
 'P38996' 'Q04571' 'P25367' 'Q9Z4N4' 'P01275' 'P02846' 'P04002' 'P14922'
 'P09547' 'P02671' 'P06748' 'P0ABK7' 'P28307' 'Q03689' 'P23202' 'P05453'
 'Q9AD92' 'P40967' 'P01308' 'P02655' 'A1E959' 'P01034' 'Q15582' 'Q9UBU3'
 'P01144' 'J7GMN2' 'P00698' 'P15309' 'Q99972' 'P04637' 'Q1EN15' 'P01012'
 'P07320' 'P02489' 'P06396' 'P05067' 'P10997' 'P10636' 'P37840' 'P11686'
 'P61769' 'P61626' 'P16860' 'Q08431' 'P09681' 'P04156' 'P04279' 'P02766'
 'P02788' 'P02663' 'P0DJI8' 'P01258' 'P02647' 'P01236' 'P82042' 'Q9VSR3'
 'P11657' 'A0MVU0' 'P01145' 'Q13148' 'Q59L12' 'P0A734' 'P03275' 'P00441'
 'P04004' 'P59637' 'P01160' 'P61825' 'Q4ZHU1' 'Q71U36' 'P22303' 'P04080'
 'A8Z0V1' 'Q53643' 'P06850' 'P55090' 'Q969E3' 'P86706' 'Q01524' 'Q0HD51']


In [None]:
'''# Crear una cadena con los IDs separados por espacios
ids_string = ' '.join(unique_uniprot_ids_amypro)

# Especificar la ruta en tu Google Drive donde deseas guardar el archivo
file_path = '/content/drive/My Drive/uniprot_ids.txt'

# Guardar la cadena en un archivo .txt
with open(file_path, 'w') as file:
    file.write(ids_string)'''

"# Crear una cadena con los IDs separados por espacios\nids_string = ' '.join(unique_uniprot_ids_amypro)\n\n# Especificar la ruta en tu Google Drive donde deseas guardar el archivo\nfile_path = '/content/drive/My Drive/uniprot_ids.txt'\n\n# Guardar la cadena en un archivo .txt\nwith open(file_path, 'w') as file:\n    file.write(ids_string)"

In [None]:
# Path to the gzipped FASTA file on Google Drive
fasta_file_path = '/content/drive/MyDrive/PFC/Proyecto/idmapping_2024_09_04.fasta.gz'  # change the path if needed

secuences_amypro = []

# Read every record of the FASTA file, keeping only its sequence as a plain string.
# NOTE(review): record identifiers are discarded, so the list is in file order only;
# later cells pair it with unique_uniprot_ids_amypro by position — confirm that the
# FASTA records come in the same order as those IDs.
with gzip.open(fasta_file_path, 'rt') as file:  # 'rt' opens the gzip stream in text mode
    for record in SeqIO.parse(file, 'fasta'):
        secuences_amypro.append(str(record.seq))


In [None]:
# 'Q9PWC8' presumably has no record in the FASTA download (TODO confirm), so a
# placeholder is inserted at its position to keep the sequence list aligned with
# unique_uniprot_ids_amypro. The position is derived from the ID list instead of a
# magic index, and the guard makes the cell idempotent: re-running it does not
# insert a second placeholder.
if "obsolete" not in secuences_amypro:
    obsolete_position = list(unique_uniprot_ids_amypro).index("Q9PWC8")
    secuences_amypro.insert(obsolete_position, "obsolete")

In [None]:
unique_uniprot_ids_amypro

array(['Q08972', 'P52912', 'P40070', 'Q9PWC8', 'P60852', 'P54785',
       'P07884', 'Q967R6', 'P38996', 'Q04571', 'P25367', 'Q9Z4N4',
       'P01275', 'P02846', 'P04002', 'P14922', 'P09547', 'P02671',
       'P06748', 'P0ABK7', 'P28307', 'Q03689', 'P23202', 'P05453',
       'Q9AD92', 'P40967', 'P01308', 'P02655', 'A1E959', 'P01034',
       'Q15582', 'Q9UBU3', 'P01144', 'J7GMN2', 'P00698', 'P15309',
       'Q99972', 'P04637', 'Q1EN15', 'P01012', 'P07320', 'P02489',
       'P06396', 'P05067', 'P10997', 'P10636', 'P37840', 'P11686',
       'P61769', 'P61626', 'P16860', 'Q08431', 'P09681', 'P04156',
       'P04279', 'P02766', 'P02788', 'P02663', 'P0DJI8', 'P01258',
       'P02647', 'P01236', 'P82042', 'Q9VSR3', 'P11657', 'A0MVU0',
       'P01145', 'Q13148', 'Q59L12', 'P0A734', 'P03275', 'P00441',
       'P04004', 'P59637', 'P01160', 'P61825', 'Q4ZHU1', 'Q71U36',
       'P22303', 'P04080', 'A8Z0V1', 'Q53643', 'P06850', 'P55090',
       'Q969E3', 'P86706', 'Q01524', 'Q0HD51'], dtype=object)

In [None]:
# IDs and sequences are paired purely by position, so both collections must have
# the same length before they are combined.
if len(unique_uniprot_ids_amypro) != len(secuences_amypro):
    # Fail loudly: only printing would leave df_aux undefined (or stale from a
    # previous run), and the problem would surface later in an unrelated cell.
    raise ValueError("Error: Las longitudes de 'unique_uniprot_ids_amypro' y 'sequences' no coinciden.")

# Auxiliary lookup table: one row per UniProt accession with its sequence.
df_aux = pd.DataFrame({
    'uniprot_id': unique_uniprot_ids_amypro,
    'sequence': secuences_amypro
})


In [None]:
df_aux

Unnamed: 0,uniprot_id,sequence
0,Q08972,MPPKKFKDLNS...
1,P52912,MEDEMPKTLYV...
2,P40070,MLPLYLLTNAK...
3,Q9PWC8,obsolete
4,P60852,MAGGSATTWGY...
...,...,...
83,P55090,MRQRGRATLLV...
84,Q969E3,MLMPVHFLLLL...
85,P86706,TESYFVFSVGM
86,Q01524,MRTLTILTAVL...


In [None]:
def get_region_sequence(row, df_aux):
    """Return the sub-sequence of the region described by ``row``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'uniprot_id', 'start' and 'end'. The coordinates are
        treated as 1-based and inclusive.
    df_aux : pandas.DataFrame
        Lookup table with 'uniprot_id' and 'sequence' columns.

    Returns
    -------
    str or None
        ``sequence[start - 1:end]`` for the first matching entry of ``df_aux``.
        None is returned when the coordinates are not integers, the accession
        is unknown, the stored sequence is the 'obsolete' placeholder or not a
        string, or the coordinates fall outside ``1 <= start <= end <= len(sequence)``.
    """
    uniprot_id = row['uniprot_id']
    start = row['start']
    end = row['end']

    # NumPy integers (e.g. values read from an int64 column) are valid
    # coordinates too; a plain isinstance(x, int) check would reject them.
    if not isinstance(start, (int, np.integer)) or not isinstance(end, (int, np.integer)):
        return None

    # Full sequence registered for this accession
    sequence_entry = df_aux.loc[df_aux['uniprot_id'] == uniprot_id, 'sequence'].values
    if len(sequence_entry) == 0 or sequence_entry[0] == 'obsolete':
        return None

    sequence = sequence_entry[0]

    # Missing values (NaN) and other non-string entries cannot be sliced
    if not isinstance(sequence, str):
        return None

    # Coordinates are 1-based: start == 0 would turn into the slice [-1:end]
    # and silently wrap around to the end of the sequence.
    if start < 1 or end < start or end > len(sequence):
        return None

    # Convert the 1-based inclusive range into a Python slice
    return sequence[start - 1:end]

In [None]:
def get_protein_sequence(uniprot_id, df_aux):
    """Look up the sequence stored in ``df_aux`` for ``uniprot_id``.

    Returns the first matching value of the 'sequence' column, or None when
    the accession is absent or its entry is the 'obsolete' placeholder.
    """
    matches = df_aux.loc[df_aux['uniprot_id'] == uniprot_id, 'sequence'].values
    if len(matches) == 0:
        return None
    first_match = matches[0]
    return None if first_match == 'obsolete' else first_match

In [None]:
# Add the 'peptide' column to df_amypro: the region sequence extracted for each row
df_amypro['peptide'] = df_amypro.apply(lambda row: get_region_sequence(row, df_aux), axis=1)

# Add the 'protein_sequence' column to df_amypro: the full sequence looked up by uniprot_id
df_amypro['protein_sequence'] = df_amypro['uniprot_id'].apply(lambda uniprot_id: get_protein_sequence(uniprot_id, df_aux))

# Compute the length of the extracted region (None when no peptide was obtained)
# and flag whether it equals the 'length' column
df_amypro['calculated_length'] = df_amypro['peptide'].apply(lambda x: len(x) if x is not None else None)
df_amypro['length_matches'] = df_amypro['length'] == df_amypro['calculated_length']

In [None]:
df_amypro

Unnamed: 0,id_in_source,uniprot_id,category,start,end,length,source,peptide,protein_sequence,calculated_length,length_matches
0,#AP00054,Q08972,functional ...,1,153,153,amypro,MPPKKFKDLNS...,MPPKKFKDLNS...,153.0,True
1,#AP00057,P52912,functional ...,290,386,97,amypro,MINPVQQQNQI...,MEDEMPKTLYV...,97.0,True
2,#AP00056,P40070,functional ...,91,187,97,amypro,QQINSNNNSNS...,MLPLYLLTNAK...,97.0,True
3,#AP00051,Q9PWC8,functional ...,95,100,6,amypro,,,,False
4,#AP00051,Q9PWC8,functional ...,189,195,7,amypro,,,,False
...,...,...,...,...,...,...,...,...,...,...,...
127,#AP00064,Q969E3,functional ...,1,38,38,amypro,MLMPVHFLLLL...,MLMPVHFLLLL...,38.0,True
128,#AP00065,P01275,functional ...,1,37,37,amypro,MKSIYFVAGLF...,MKSIYFVAGLF...,37.0,True
129,#AP00109,P86706,functional ...,1,11,11,amypro,TESYFVFSVGM,TESYFVFSVGM,11.0,True
130,#AP00124,Q01524,functional ...,1,32,32,amypro,MRTLTILTAVL...,MRTLTILTAVL...,32.0,True


In [None]:
df_amypro2 = df_amypro[['id_in_source', 'uniprot_id', 'category', 'start', 'end', 'length', 'peptide','protein_sequence']]

df_amypro2

Unnamed: 0,id_in_source,uniprot_id,category,start,end,length,peptide,protein_sequence
0,#AP00054,Q08972,functional ...,1,153,153,MPPKKFKDLNS...,MPPKKFKDLNS...
1,#AP00057,P52912,functional ...,290,386,97,MINPVQQQNQI...,MEDEMPKTLYV...
2,#AP00056,P40070,functional ...,91,187,97,QQINSNNNSNS...,MLPLYLLTNAK...
3,#AP00051,Q9PWC8,functional ...,95,100,6,,
4,#AP00051,Q9PWC8,functional ...,189,195,7,,
...,...,...,...,...,...,...,...,...
127,#AP00064,Q969E3,functional ...,1,38,38,MLMPVHFLLLL...,MLMPVHFLLLL...
128,#AP00065,P01275,functional ...,1,37,37,MKSIYFVAGLF...,MKSIYFVAGLF...
129,#AP00109,P86706,functional ...,1,11,11,TESYFVFSVGM,TESYFVFSVGM
130,#AP00124,Q01524,functional ...,1,32,32,MRTLTILTAVL...,MRTLTILTAVL...


Proteínas que solo están en AMYPRO (no se comparten con CPAD)

In [None]:
# Rows of df_amypro2 whose uniprot_id does not appear in df_cpad.
# Despite its name, the variable holds full rows, not just the IDs.
uniprot_id_solo_amypro = df_amypro2[~df_amypro2['uniprot_id'].isin(df_cpad['uniprot_id'])]
uniprot_id_solo_amypro["uniprot_id"].unique()

array(['P52912', 'P40070', 'Q9PWC8', 'P54785', 'P07884', 'Q967R6',
       'P38996', 'Q04571', 'P25367', 'Q9Z4N4', 'P02846', 'P04002',
       'P09547', 'Q03689', 'A1E959', 'Q9UBU3', 'P01144', 'J7GMN2',
       'P15309', 'Q1EN15', 'P01012', 'P07320', 'P16860', 'P09681',
       'P04279', 'P02663', 'P0DJI8', 'P82042', 'Q9VSR3', 'P11657',
       'A0MVU0', 'P01145', 'Q59L12', 'P0A734', 'P03275', 'P04004',
       'P59637', 'P01160', 'P61825', 'Q4ZHU1', 'P04080', 'A8Z0V1',
       'Q53643', 'P06850', 'P55090', 'Q969E3', 'P86706', 'Q01524',
       'Q0HD51'], dtype=object)

Proteínas que se comparten entre CPAD y AMYPRO. De acá vamos a quedarnos con aquellos registros de AMYPRO que no se solapan con NINGUNA región presente en CPAD.

In [None]:
# Rows of df_amypro2 whose uniprot_id also appears in df_cpad.
# Despite its name, the variable holds full rows, not just the IDs.
uniprot_id_compartidos = df_amypro2[df_amypro2['uniprot_id'].isin(df_cpad['uniprot_id'])]
uniprot_id_compartidos["uniprot_id"].unique()

array(['Q08972', 'P60852', 'P01275', 'P14922', 'P02671', 'P06748',
       'P0ABK7', 'P28307', 'P23202', 'P05453', 'Q9AD92', 'P40967',
       'P01308', 'P02655', 'P01034', 'Q15582', 'P00698', 'Q99972',
       'P04637', 'P02489', 'P06396', 'P05067', 'P10997', 'P10636',
       'P37840', 'P11686', 'P61769', 'P61626', 'Q08431', 'P04156',
       'P02766', 'P02788', 'P01258', 'P02647', 'P01236', 'Q13148',
       'P00441', 'Q71U36', 'P22303'], dtype=object)

Tenemos el df con los uniprot_id que se comparten con CPAD

In [None]:
# Keep only the df_amypro2 rows whose uniprot_id is shared with CPAD.
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df_amypro2_compartidos = df_amypro2[df_amypro2['uniprot_id'].isin(uniprot_id_compartidos["uniprot_id"].unique())].copy()
df_amypro2_compartidos

Unnamed: 0,id_in_source,uniprot_id,category,start,end,length,peptide,protein_sequence
0,#AP00054,Q08972,functional ...,1,153,153,MPPKKFKDLNS...,MPPKKFKDLNS...
5,#AP00050,P60852,functional ...,251,256,6,SKEACQ,MAGGSATTWGY...
6,#AP00050,P60852,functional ...,345,351,7,IYENWLV,MAGGSATTWGY...
14,#AP00041,P01275,functional ...,1,10,10,MKSIYFVAGL,MKSIYFVAGLF...
17,#AP00044,P14922,functional ...,467,682,216,QQQHPAQQTPI...,MNPGGEQTIME...
...,...,...,...,...,...,...,...,...
110,#AP00084,P00441,pathogenic,147,153,7,CGVIGIA,MATKAVCVLKG...
116,#AP00103,Q71U36,pathogenic,353,370,18,VGINYQPPTVV...,MRECISIHVGQ...
117,#AP00103,Q71U36,pathogenic,395,401,7,FDLMYAK,MRECISIHVGQ...
118,#AP00102,P22303,pathogenic,555,568,14,LRAQACAFWNRFLP,MRPPQCLLHTP...


Le asignamos un id particular a cada registro de región APR dentro de este dataset

In [None]:
# Add an ascending, 1-based 'id' column. assign() builds a new DataFrame instead of
# writing into a slice of df_amypro2, which avoids SettingWithCopyWarning and keeps
# the cell safe to re-run.
df_amypro2_compartidos = df_amypro2_compartidos.assign(id=range(1, len(df_amypro2_compartidos) + 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amypro2_compartidos['id'] = range(1, len(df_amypro2_compartidos) + 1)


# Generamos una lista de registros a NO tener en cuenta ya que se solapan con al menos 1 registro presente en CPAD

In [None]:
# Values of the 'id' column for the AmyPro records that overlap at least one CPAD region
ids_to_discard = []

# Process each shared protein separately
for uniprot_id in df_amypro2_compartidos['uniprot_id'].unique():
    # Records of this protein in both DataFrames
    amypro_records = df_amypro2_compartidos[df_amypro2_compartidos['uniprot_id'] == uniprot_id]
    cpad_records = df_cpad[df_cpad['uniprot_id'] == uniprot_id]

    for _, amypro_row in amypro_records.iterrows():
        # Two closed intervals overlap when each one starts no later than the other
        # ends. The test runs against all CPAD rows of the protein at once, so each
        # AmyPro record is stored a single time however many CPAD regions it hits.
        overlaps_cpad = (
            (amypro_row['start'] <= cpad_records['end'])
            & (cpad_records['start'] <= amypro_row['end'])
        ).any()
        if overlaps_cpad:
            ids_to_discard.append(amypro_row['id'])

In [None]:
# Remove duplicate ids; sorting makes the resulting list deterministic across runs
ids_to_discard_unique = sorted(set(ids_to_discard))

In [None]:
len(ids_to_discard_unique)

40

In [None]:
# Keep only the shared-protein records whose 'id' was not flagged in ids_to_discard
overlaps_cpad_mask = df_amypro2_compartidos['id'].isin(ids_to_discard)
df_amypro2_no_overlap = df_amypro2_compartidos[~overlaps_cpad_mask]

Obtenemos el dataset final con los registros APR que no se solapan con CPAD

In [None]:
df_amypro2_no_overlap

Unnamed: 0,id_in_source,uniprot_id,category,start,end,length,peptide,protein_sequence,id
5,#AP00050,P60852,functional ...,251,256,6,SKEACQ,MAGGSATTWGY...,2
6,#AP00050,P60852,functional ...,345,351,7,IYENWLV,MAGGSATTWGY...,3
14,#AP00041,P01275,functional ...,1,10,10,MKSIYFVAGL,MKSIYFVAGLF...,4
17,#AP00044,P14922,functional ...,467,682,216,QQQHPAQQTPI...,MNPGGEQTIME...,5
19,#AP00048,P02671,pathogenic,148,160,13,KVQHIQLLQKNVR,MFSMRIVCLVL...,6
21,#AP00037,P0ABK7,functional ...,24,45,22,YDLANSEYNFA...,MKNKLLFMMLT...,8
25,#AP00037,P0ABK7,functional ...,90,111,22,LAYIDQAGSAN...,MKNKLLFMMLT...,12
27,#AP00036,P28307,functional ...,68,90,23,LTITQHGGGNG...,MKLLKVAAIAA...,14
30,#AP00036,P28307,functional ...,91,112,22,DLTQRGFGNSA...,MKLLKVAAIAA...,17
34,#AP00039,Q9AD92,functional ...,13,29,17,GGLVLAGAGMA...,MLKKVVAAAAA...,20


In [None]:
np.sum(df_amypro2_no_overlap["length"])

707

In [None]:
uniprot_id_solo_amypro["uniprot_id"].unique()

array(['P52912', 'P40070', 'Q9PWC8', 'P54785', 'P07884', 'Q967R6',
       'P38996', 'Q04571', 'P25367', 'Q9Z4N4', 'P02846', 'P04002',
       'P09547', 'Q03689', 'A1E959', 'Q9UBU3', 'P01144', 'J7GMN2',
       'P15309', 'Q1EN15', 'P01012', 'P07320', 'P16860', 'P09681',
       'P04279', 'P02663', 'P0DJI8', 'P82042', 'Q9VSR3', 'P11657',
       'A0MVU0', 'P01145', 'Q59L12', 'P0A734', 'P03275', 'P04004',
       'P59637', 'P01160', 'P61825', 'Q4ZHU1', 'P04080', 'A8Z0V1',
       'Q53643', 'P06850', 'P55090', 'Q969E3', 'P86706', 'Q01524',
       'Q0HD51'], dtype=object)

# Generamos embeddings y etiquetas

## Funciones

In [None]:
def generateAPRlabels(df, id_field, sequence_field, start_field, end_field, exclude_ids=("P25391",)):
  """Build the concatenated per-residue APR label vector for the proteins in ``df``.

  Proteins are processed in order of first appearance in ``df[id_field]``. For
  each one, a zero vector as long as the sequence found in its first row is
  created, and every region listed in its rows is set to 1.

  Parameters
  ----------
  df : pandas.DataFrame
      One row per region.
  id_field : str
      Column holding the protein identifier.
  sequence_field : str
      Column holding the full protein sequence (only its length is used).
  start_field, end_field : str
      Columns holding the region coordinates, treated as 1-based and inclusive.
  exclude_ids : collection of str, optional
      Identifiers to skip. Defaults to the single accession the original code
      excluded; pass ``()`` to keep every protein.

  Returns
  -------
  numpy.ndarray
      1-D float array with the labels of all proteins concatenated; an empty
      array when no protein is selected.
  """
  per_protein_labels = []

  for uniprot_id in df[id_field].unique():
    if uniprot_id in exclude_ids:
      continue

    # All regions annotated for this protein
    protein_rows = df[df[id_field] == uniprot_id]

    # Every row of a protein carries the same sequence, so the first one is used
    full_seq_len = len(protein_rows.iloc[0][sequence_field])
    labels = np.zeros(full_seq_len)

    for start_apr, end_apr in zip(protein_rows[start_field], protein_rows[end_field]):
      # 1-based inclusive -> 0-based slice. The lower bound is clamped so that a
      # start of 0 cannot wrap around to the end of the vector.
      first_index = max(int(start_apr) - 1, 0)
      labels[first_index:int(end_apr)] = 1

    per_protein_labels.append(labels)

  # np.concatenate raises on an empty list, e.g. for an empty DataFrame
  if not per_protein_labels:
    return np.zeros(0)

  return np.concatenate(per_protein_labels)

In [None]:
def generateAAVector(df, id_field, sequence_field):
  """Concatenate the residues of every protein in ``df`` into one array.

  Proteins are visited in order of first appearance in ``df[id_field]``; the
  sequence is taken from the first row of each protein and the identifier
  "P25391" is skipped. Summary information is printed along the way.

  Returns a NumPy array with one single-character element per residue.
  """
  residues = []
  residue_count = 0

  for protein_id in df[id_field].unique():
    if protein_id == "P25391":
      continue

    # The first row of the protein provides its full sequence
    full_seq = df[df[id_field] == protein_id].iloc[0][sequence_field]
    residue_count += len(full_seq)
    residues.extend(full_seq)

  # Report the accumulated number of residues
  print("Total de aminoácidos en todas las secuencias de proteínas:", residue_count)

  aa_vector = np.array(residues)

  print("Longitud total de la secuencia concatenada:", len(aa_vector))
  print("Vector de aminoácidos:", aa_vector)

  return aa_vector

In [None]:
def generateEmbeddingsMatrix(df, id_field = "uniprot_id", embeddings_field = "embedding"):
  """Stack the embeddings of every protein in ``df`` into one array.

  Proteins are visited in order of first appearance in ``df[id_field]``; the
  embedding is read from the first row of each protein and the identifier
  "P25391" is skipped. Each element obtained by iterating over a protein's
  embedding becomes one entry of the result.
  """
  stacked_rows = []

  for protein_id in df[id_field].unique():
    if protein_id == "P25391":
      continue

    # Embedding stored in the first row of this protein
    protein_embedding = df[df[id_field] == protein_id].iloc[0][embeddings_field]
    stacked_rows.extend(protein_embedding)

  return np.array(stacked_rows)

In [None]:
def generateInputs(df, id_field = "uniprot_id", sequence_field = "protein_sequence", start_field = "start", end_field = "end", embeddings_field = "embedding"):
  """Build the embeddings, APR labels and residue vector for ``df``.

  Delegates to generateAPRlabels, generateAAVector and generateEmbeddingsMatrix
  with the given column names.

  Returns
  -------
  tuple
      ``(embeddings, APR_labels, aa_vector)``.

  Warns
  -----
  RuntimeWarning
      When the three outputs do not have the same length. Labels and residues
      are sized from ``sequence_field`` while embeddings come from
      ``embeddings_field``, so a mismatch between those columns would otherwise
      go unnoticed.
  """
  import warnings  # local import: only needed for the consistency check below

  APR_labels = generateAPRlabels(df, id_field, sequence_field, start_field, end_field)
  aa_vector = generateAAVector(df, id_field, sequence_field)
  embeddings = generateEmbeddingsMatrix(df, id_field, embeddings_field)

  # The outputs are meant to be used side by side, one entry per residue
  if not (len(embeddings) == len(APR_labels) == len(aa_vector)):
    warnings.warn(
      f"generateInputs: output lengths differ (embeddings={len(embeddings)}, "
      f"APR_labels={len(APR_labels)}, aa_vector={len(aa_vector)})",
      RuntimeWarning,
      stacklevel=2,
    )

  return embeddings, APR_labels, aa_vector

## Chequeamos los embeddings

In [None]:
alvaro_path = '/content/drive/MyDrive/aggregation_itba/data/En uso/alvaro_proteins.pickle'

# Load the data stored in the pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
with open(alvaro_path, 'rb') as f:
    embeddings_alvaro_data = pickle.load(f)


# Convert the loaded object into a pandas DataFrame
df_desorden = pd.DataFrame(embeddings_alvaro_data)

In [None]:
df_desorden

Unnamed: 0,entry_id,uniprot_id,class_name,region_sequence,uniprot_start,uniprot_end,full_seq,region_embedding,full_seq_embedding
0,AP00001,P01236,pathogenic,LPICPGGAARC...,29,227,MNIKGSPWKGS...,[[-0.062324...,[[0.0736898...
1,AP00002,P01258,pathogenic,CGNLSTCMLGT...,85,116,MGFQKFSPFLA...,[[0.0792614...,[[0.1142445...
2,AP00003,P02647,pathogenic,DEPPQSPWDRV...,25,267,MKAAVLTLAVL...,[[0.1127525...,[[0.1154272...
3,AP00004,P02663,pathogenic,KNTMEHVSSSE...,16,222,MKFFIFTCLLA...,[[-0.090187...,[[0.0200585...
4,AP00005,P0DJI8,pathogenic,RSFFSFLGEAF...,19,122,MKLLTGLVFCS...,[[-0.153230...,[[-0.104943...
...,...,...,...,...,...,...,...,...,...
64,AP00116,P52750,functional ...,LPASAAKNAKL...,19,135,MRFIVSLLAFT...,[[-0.136994...,[[-0.217071...
65,AP00117,P32588,functional ...,SENNEEQHQQQ...,2,453,MSENNEEQHQQ...,[[0.0876804...,[[0.2642935...
66,AP00119,C4IN70,functional ...,GPAEKWKPTPA...,25,250,MKPTMALKPLV...,[[-0.006437...,[[0.2515512...
67,AP00120,C4IN69,functional ...,DSNNQALIDNA...,19,182,MTHSWLLLTVL...,[[-0.097117...,[[0.0920332...


In [None]:
df_desorden[df_desorden['full_seq_embedding'].isna()]['uniprot_id']

Unnamed: 0,uniprot_id
37,Q9PWC8


In [None]:
# Left-join the AmyPro-only records with 'full_seq' and 'full_seq_embedding' from
# df_desorden on uniprot_id; records without a match get NaN in those two columns.
# NOTE(review): if df_desorden holds several rows for one uniprot_id, the join
# repeats the AmyPro record — confirm uniprot_id is unique in df_desorden.
df_solo_amypro_embeddings = pd.merge(uniprot_id_solo_amypro, df_desorden[['uniprot_id', 'full_seq', 'full_seq_embedding']], on='uniprot_id', how='left')

In [None]:
df_solo_amypro_embeddings

Unnamed: 0,id_in_source,uniprot_id,category,start,end,length,peptide,protein_sequence,full_seq,full_seq_embedding
0,#AP00057,P52912,functional ...,290,386,97,MINPVQQQNQI...,MEDEMPKTLYV...,,
1,#AP00056,P40070,functional ...,91,187,97,QQINSNNNSNS...,MLPLYLLTNAK...,,
2,#AP00051,Q9PWC8,functional ...,95,100,6,,,,
3,#AP00051,Q9PWC8,functional ...,189,195,7,,,,
4,#AP00053,P54785,functional ...,7,157,151,LQQQQQQRQQH...,MNADHHLQQQQ...,,
5,#AP00052,P07884,functional ...,199,207,9,FDTLFLWLY,MLKGPLKGCLN...,,
6,#AP00059,Q967R6,functional ...,1,128,128,MSQSPQTVDQA...,MSQSPQTVDQA...,,
7,#AP00058,P38996,functional ...,669,802,134,SQTPMDQQQLL...,MSDENHNSDVQ...,,
8,#AP00042,Q04571,functional ...,70,76,7,GCVVGVI,MQFTSVFTILA...,MQFTSVFTILA...,[[-0.009406...
9,#AP00043,P25367,functional ...,153,405,253,QGQGQGQGQGQ...,MDTDKLISEAE...,,


In [None]:
# Keep only the records for which a full-sequence embedding is available
has_embedding = df_solo_amypro_embeddings['full_seq_embedding'].notna()
df_amypro_final = df_solo_amypro_embeddings[has_embedding]

In [None]:
df_amypro_final

Unnamed: 0,id_in_source,uniprot_id,category,start,end,length,peptide,protein_sequence,full_seq,full_seq_embedding
8,#AP00042,Q04571,functional ...,70,76,7,GCVVGVI,MQFTSVFTILA...,MQFTSVFTILA...,[[-0.009406...
10,#AP00040,Q9Z4N4,functional ...,54,63,10,GALQTVGQGL,MREISQKDLNL...,MREISQKDLNL...,[[0.1346882...
12,#AP00047,P04002,functional ...,1,37,37,MALSLFTVGQL...,MALSLFTVGQL...,MALSLFTVGQL...,[[0.1911643...
14,#AP00034,Q03689,functional ...,218,289,72,KIDAIVGRNSA...,MSEPFGIVAGA...,MSEPFGIVAGA...,[[0.1067253...
15,#AP00021,A1E959,pathogenic,112,157,46,QLQTPPQTQPG...,MKIIILLGFLG...,MKIIILLGFLG...,[[-0.042085...
16,#AP00068,Q9UBU3,functional ...,1,23,23,MPSPGTVCSLL...,MPSPGTVCSLL...,MPSPGTVCSLL...,[[0.0756687...
18,#AP00099,J7GMN2,functional ...,8,15,8,ILNFFIFV,RIIKTLSILNF...,RIIKTLSILNF...,[[-0.067846...
19,#AP00091,P15309,pathogenic,53,88,36,FPTDPIKESSW...,MRAAPLLLARA...,MRAAPLLLARA...,[[0.2106348...
20,#AP00091,P15309,pathogenic,229,254,26,EDTMTKLRELS...,MRAAPLLLARA...,MRAAPLLLARA...,[[0.2106348...
21,#AP00092,Q1EN15,functional ...,1,24,24,MAFLKKSLFLV...,MAFLKKSLFLV...,MAFLKKSLFLV...,[[0.0290255...


In [None]:
embeddings, APR_labels, aa_vector = generateInputs(df_amypro_final, id_field = "uniprot_id", sequence_field = "protein_sequence", start_field = "start", end_field = "end", embeddings_field = "full_seq_embedding")

Total de aminoácidos en todas las secuencias de proteínas: 10851
Longitud total de la secuencia concatenada: 10851
Vector de aminoácidos: ['M' 'Q' 'F' ... 'C' 'C' 'L']


In [None]:
embeddings

array([[-0.00940635, -0.09101769,  0.43695492, ..., -0.17875056,
         0.13352579,  0.01750508],
       [-0.07351127,  0.13736919,  0.28034052, ...,  0.09855694,
         0.15798049,  0.1308269 ],
       [-0.2692629 ,  0.07626822,  0.3227064 , ..., -0.08330049,
         0.39634845, -0.03834004],
       ...,
       [-0.07631116,  0.06472504, -0.14422336, ...,  0.08338092,
         0.08645733, -0.28890306],
       [ 0.3678454 ,  0.0309215 ,  0.08960243, ...,  0.01821579,
        -0.06090633, -0.3824283 ],
       [ 0.16421203, -0.07185201,  0.19708742, ...,  0.06268523,
        -0.27407828, -0.31540796]], dtype=float32)

In [None]:
np.sum(APR_labels)

840.0

In [None]:
# Save the embeddings array and the APR label vector as .npy files on Drive
np.save('/content/drive/MyDrive/aggregation_itba/GitHub/embeddings_amypro.npy', embeddings)
np.save('/content/drive/MyDrive/aggregation_itba/GitHub/APR_labels_amypro.npy', APR_labels)

In [None]:
# Read the two saved .npy files back from Drive
data_embeddings = np.load('/content/drive/MyDrive/aggregation_itba/GitHub/embeddings_amypro.npy')
data_labels = np.load('/content/drive/MyDrive/aggregation_itba/GitHub/APR_labels_amypro.npy')