### Numerical representation strategies demonstration

In [1]:
import pandas as pd
import sys
import os
sys.path.insert(0, "../")
from src.numerical_representation.physicochemical_properties import PhysicochemicalEncoder
from src.numerical_representation.embedding_representations import BioEmbeddings
from src.numerical_representation.fft_encoder import FFTTransform

  from .autonotebook import tqdm as notebook_tqdm


### Loading dataset

In [2]:
# Load the dataset of sequences and their monomer-state label, then preview the first rows.
df_data = pd.read_csv(filepath_or_buffer="../results/dataset_fp.csv")
df_data.head(n=5)

Unnamed: 0,sequence,monomer_state
0,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
1,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
2,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
3,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
4,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0


### Numerical representation strategies explored

#### Physicochemical properties

In [3]:
# Root directory under which the encoding cells below write their results.
path_input = "../results/encoders"

# AAindex properties kept from the encoder table:
# ANDN920101 -> alpha helix
# ROBB760113 -> loop
# CRAJ730102 -> beta sheet
# ARGP820101 -> hydrophobicity
selected_columns = ["residue", "ANDN920101", "ROBB760113", "ARGP820101", "CRAJ730102"]

# Keep only the selected columns and index the table by residue; drop=False
# leaves "residue" available as a regular column as well.
dataset_encoder = (
    pd.read_csv("../input_data_for_coding/aaindex_encoders.csv")
    .loc[:, selected_columns]
    .set_index("residue", drop=False)
)
dataset_encoder.head(5)

Unnamed: 0_level_0,residue,ANDN920101,ROBB760113,ARGP820101,CRAJ730102
residue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,A,4.35,-5.1,0.61,1.0
L,L,4.17,-5.4,1.53,1.53
R,R,4.38,2.6,0.6,0.74
K,K,4.36,1.0,1.15,1.18
N,N,4.75,4.7,0.06,0.75


In [5]:
# AAindex property identifiers to encode; each one gets its own output folder.
# (The name `propertys` is kept as-is because the FFT cell below iterates over it.)
propertys = ["ANDN920101", "ROBB760113", "ARGP820101", "CRAJ730102"]

for group in propertys:
    output_dir = f"{path_input}/physicochemical_properties/{group}"

    # Create the folder with the standard library instead of shelling out to
    # `mkdir -p`: this works on every platform and raises on failure rather than
    # silently ignoring a non-zero exit status. The log line keeps the text of the
    # equivalent shell command so the cell output is unchanged.
    print(f"mkdir -p {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    print("Start codifications")
    name_export = f"{output_dir}/coded_dataset.csv"

    # Encode the "sequence" column with the current property; "monomer_state" is
    # passed as a column to ignore.
    physicochemical_encoder = PhysicochemicalEncoder(
        dataset=df_data,
        property_encoder=group,
        dataset_encoder=dataset_encoder,
        columns_to_ignore=["monomer_state"],
        name_column_seq="sequence",
    )

    # run_process() is called before reading df_data_encoded, which is then
    # exported without the index.
    physicochemical_encoder.run_process()
    physicochemical_encoder.df_data_encoded.to_csv(name_export, index=False)

mkdir -p ../results/encoders/physicochemical_properties/ANDN920101
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/physicochemical_properties/ROBB760113
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/physicochemical_properties/ARGP820101
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/physicochemical_properties/CRAJ730102
Start codifications
Encoding and Processing results
Creating dataset
Export dataset


#### Fast Fourier transform (FFT)

In [6]:
# Columns that are not part of the encoded signal and must be skipped by the FFT.
columns_to_ignore = ["monomer_state"]

for group in propertys:
    print("Reading datasets")
    # Input: the physicochemical encoding exported by the previous section.
    df_coded = pd.read_csv(f"{path_input}/physicochemical_properties/{group}/coded_dataset.csv")

    output_dir = f"{path_input}/FFT/{group}"

    # Create the folder with the standard library instead of shelling out to
    # `mkdir -p`: this works on every platform and raises on failure rather than
    # silently ignoring a non-zero exit status. The log line keeps the text of the
    # equivalent shell command so the cell output is unchanged.
    print(f"mkdir -p {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    print("Start codifications")

    name_export = f"{output_dir}/coded_dataset.csv"

    print("Applying FFT")
    fft_transform = FFTTransform(
        dataset=df_coded,
        # Number of signal columns: all columns minus the ignored ones. Derived from
        # the list so the two stay in sync (same value as the previous "- 1").
        size_data=len(df_coded.columns) - len(columns_to_ignore),
        columns_to_ignore=columns_to_ignore,
    )

    # encoding_dataset() returns the object that is exported, without the index.
    response_coded = fft_transform.encoding_dataset()
    response_coded.to_csv(name_export, index=False)

Reading datasets
mkdir -p ../results/encoders/FFT/ANDN920101
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/FFT/ROBB760113
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/FFT/ARGP820101
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/FFT/CRAJ730102
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset


#### Embeddings through the bio-embeddings tool

In [14]:
# One encoder instance shared by every embedding below: sequences come from the
# "sequence" column and "monomer_state" is passed as the response column.
# NOTE(review): device is fixed to "cuda"; presumably this needs a CUDA-capable
# GPU - confirm before running on a CPU-only machine.
bioembedding_instance = BioEmbeddings(
    dataset=df_data,
    seq_column="sequence",
    is_reduced=True,
    device="cuda",
    column_response="monomer_state",
    path_export="../results/encoders/",
)

# Run the embeddings one after another, in the original order
# (Bepler, one-hot, ESM-1b, ESM-E, going by the method names), all with the
# same export name.
for method_name in ("apply_bepler", "apply_onehot", "apply_esm1b", "apply_esme"):
    getattr(bioembedding_instance, method_name)(name_export="coded_dataset")