### Numerical representation strategies demonstration

In [1]:
import pandas as pd
import sys
import os
sys.path.insert(0, "../")
from src.numerical_representation.physicochemical_properties import PhysicochemicalEncoder
from src.numerical_representation.embedding_representations import BioEmbeddings
from src.numerical_representation.one_hot_encoding import OneHotEncoder
from src.numerical_representation.fft_encoder import FFTTransform

  from .autonotebook import tqdm as notebook_tqdm


### Loading dataset

In [10]:
# Load the dataset used throughout the notebook: one protein sequence per row
# ("sequence") together with its label column ("monomer_state").
dataset_fp_path = "../results/dataset_fp.csv"
df_data = pd.read_csv(filepath_or_buffer=dataset_fp_path)

# Preview the first rows as a sanity check.
df_data.head(n=5)

Unnamed: 0,sequence,monomer_state
0,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
1,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
2,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
3,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0
4,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0


### Numerical representation strategies explored

#### Physicochemical properties

In [12]:
# Root directory for the encoded datasets. Despite its name, `path_input` is
# where the cells below create their per-group folders and write their CSVs.
path_input = "../results/encoders"

# Per-residue property table (columns Group_0 ... Group_7). It is indexed by
# residue so rows can be looked up by amino-acid letter; drop=False keeps the
# "residue" column itself in the frame as well.
dataset_encoder = pd.read_csv(
    "../input_data_for_coding/cluster_encoders.csv"
).set_index("residue", drop=False)

dataset_encoder.head()

Unnamed: 0_level_0,residue,Group_0,Group_1,Group_2,Group_3,Group_4,Group_5,Group_6,Group_7
residue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,A,290.40675,71.850787,6.250299,44.65141,-107.792042,15.33599,56.16028,92.925289
R,R,172.577375,-6.96389,84.091653,200.152218,51.157141,172.36012,1.448105,-37.39311
N,N,-38.377385,-90.145475,-21.731374,-191.180531,73.940581,-259.135737,-54.69043,-77.746565
D,D,159.436015,-56.585499,-28.963699,-232.261465,55.369736,-216.012067,-29.383132,-7.421269
C,C,-4.241925,15.678516,-34.886819,-156.2126,-54.192823,-242.000209,10.074813,40.041394


In [13]:
# Encode the sequences once per physicochemical property group and export each
# encoded dataset to `<path_input>/<group>/coded_dataset.csv`.
for group in range(8):  # dataset_encoder has the columns Group_0 ... Group_7
    group_to_process = f"Group_{group}"

    # os.makedirs(exist_ok=True) is the portable equivalent of `mkdir -p`:
    # no shell is spawned, it works on every platform, and a failure raises
    # an exception instead of being silently ignored like os.system's status.
    output_dir = f"{path_input}/{group_to_process}"
    print(f"Creating directory {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    print("Start codifications")

    name_export = f"{output_dir}/coded_dataset.csv"

    physicochemical_encoder = PhysicochemicalEncoder(
        dataset=df_data,
        property_encoder=group_to_process,
        dataset_encoder=dataset_encoder,
        columns_to_ignore=["monomer_state"],  # label column, passed as "to ignore"
        name_column_seq="sequence",
    )

    # run_process() populates `df_data_encoded`, which is then written to disk.
    physicochemical_encoder.run_process()
    physicochemical_encoder.df_data_encoded.to_csv(name_export, index=False)

mkdir -p ../results/encoders/Group_0
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/Group_1
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/Group_2
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/Group_3
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/Group_4
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/Group_5
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/Group_6
Start codifications
Encoding and Processing results
Creating dataset
Export dataset
mkdir -p ../results/encoders/Group_7
Start codifications
Encoding and Processing results
Creating dataset
Export dataset


#### FFT transform

In [15]:
# Apply the FFT transform to every physicochemical encoding produced by the
# previous section and export the result to
# `<path_input>/<group>_FFT/coded_dataset.csv`.
for group in range(8):  # one encoded dataset per Group_0 ... Group_7
    group_to_process = f"Group_{group}"

    print("Reading datasets")
    df_coded = pd.read_csv(f"{path_input}/{group_to_process}/coded_dataset.csv")

    # os.makedirs(exist_ok=True) is the portable equivalent of `mkdir -p`:
    # no shell is spawned, it works on every platform, and a failure raises
    # an exception instead of being silently ignored like os.system's status.
    output_dir = f"{path_input}/{group_to_process}_FFT"
    print(f"Creating directory {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    print("Start codifications")

    name_export = f"{output_dir}/coded_dataset.csv"

    # Columns handed to FFTTransform as "to ignore" (the label column).
    columns_to_ignore = ["monomer_state"]

    print("Applying FFT")
    fft_transform = FFTTransform(
        dataset=df_coded,
        # Column count minus the ignored columns; derived from the list so the
        # two cannot drift apart (same value as the former hard-coded `- 1`).
        # NOTE(review): presumably the number of feature columns to transform;
        # confirm against FFTTransform.
        size_data=len(df_coded.columns) - len(columns_to_ignore),
        columns_to_ignore=columns_to_ignore,
    )

    response_coded = fft_transform.encoding_dataset()
    response_coded.to_csv(name_export, index=False)

Reading datasets
mkdir -p ../results/encoders/Group_0_FFT
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/Group_1_FFT
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/Group_2_FFT
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/Group_3_FFT
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/Group_4_FFT
Start codifications
Applying FFT
Removing columns data
Get near pow 2 value
Apply zero padding
Creating dataset
Export dataset
Reading datasets
mkdir -p ../results/encoders/Group_5_FFT
Start codificatio

#### Embeddings through the bio-embeddings tool

In [17]:
# Build one BioEmbeddings helper for the whole dataset; every representation
# below is produced from the same instance and exported under `path_export`.
# NOTE(review): device is fixed to "cuda" - presumably this needs a GPU; confirm.
bioembedding_instance = BioEmbeddings(
    dataset=df_data,
    seq_column="sequence",
    is_reduced=True,
    device="cuda",
    column_response="monomer_state",
    path_export="../results/encoders/",
)

# Run each strategy in turn with the same export name. Methods are looked up
# one at a time, right before they are called, so the execution order is
# exactly: bepler, onehot (One Hot), esm1b, esme.
for embedding_method in ("apply_bepler", "apply_onehot", "apply_esm1b", "apply_esme"):
    getattr(bioembedding_instance, embedding_method)(name_export="dataset_fp")