In [1]:
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
def renombrar_columnas(df):
    """Return a copy of ``df`` with the short annotation codes renamed.

    Columns named ``acc``, ``diso``, ``ss3``, ``ss8``, ``tm2`` and ``tm8`` are
    replaced by their descriptive labels; any other column is left as is.
    The input frame is not modified.
    """
    descriptive_names = dict(
        acc="solvent accessibility",
        diso="order/disorder prediction",
        ss3="secondary structure element (ss3)",
        ss8="secondary structure element (ss8)",
        tm2="transMembrane topology (tm2)",
        tm8="transMembrane topology (tm8)",
    )
    renamed = df.rename(columns=descriptive_names)
    return renamed

In [3]:
def calcular_proporciones(df, columns, unique_label, label_col="monomer_state"):
    """Compute, per label, the share of each character in sequence-like columns.

    For every (column, label) pair, the non-missing values stored in
    ``column`` for the rows whose ``label_col`` equals the label are pooled,
    and each distinct character's count is divided by the total number of
    pooled characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label_col`` and every name listed in ``columns``.
    columns : iterable of str
        Columns whose values are iterated element by element (e.g. strings).
        Missing values are skipped.
    unique_label : iterable
        Labels to report. Rows are selected with ``==``, so a NaN label
        matches no rows and therefore produces no output rows.
    label_col : str, default "monomer_state"
        Column of ``df`` that holds the label used to group the rows.

    Returns
    -------
    pandas.DataFrame
        One row per (column, label, character) with the columns ``label``,
        ``structure_type``, ``column`` and ``proportion``. These columns are
        present even when no rows are produced.
    """
    output_columns = ["label", "structure_type", "column", "proportion"]

    # The row mask depends only on the label, so build it once per label
    # instead of once per (column, label) pair. Materialising the list also
    # lets callers pass a one-shot iterable for ``unique_label``.
    label_masks = [(response, df[label_col] == response) for response in unique_label]

    records = []
    for col in columns:
        for response, mask in label_masks:
            # Counter keeps first-seen order, so rows come out in the order
            # the characters first appear in the pooled sequences.
            structure_counts = Counter()
            for seq in df.loc[mask, col].dropna():
                structure_counts.update(seq)

            # Zero only when there is nothing to report, in which case the
            # loop below does not run and no division happens.
            total_chars = sum(structure_counts.values())

            for structure_type, count in structure_counts.items():
                records.append({
                    "label": response,
                    "structure_type": structure_type,
                    "column": col,
                    "proportion": count / total_chars
                })

    return pd.DataFrame(records, columns=output_columns)


In [6]:
# Read the annotation table and swap its short column codes for descriptive names.
df_ss = renombrar_columnas(
    pd.read_csv("../results/characterizing_dataset/secondary_structure_dataset_df.csv")
)
df_data = pd.read_csv("../results/dataset_fp.csv")

# Join both tables on the sequence text (`seq` on the left, `sequence` on the
# right); with the default join type only sequences present in both are kept.
df_merge = df_ss.merge(df_data, left_on="seq", right_on="sequence")

# Annotation columns whose per-character composition is summarised per label.
columns = [
    "secondary structure element (ss3)",
    "secondary structure element (ss8)",
    "solvent accessibility",
    "order/disorder prediction",
    "transMembrane topology (tm2)",
    "transMembrane topology (tm8)",
]
unique_label = df_merge["monomer_state"].unique()
proportion_data = calcular_proporciones(df_merge, columns, unique_label)

In [7]:
# Write the proportions table to CSV; index=False leaves the row index out of the file.
proportion_data.to_csv("../results/characterizing_dataset/percentage_ss.csv", index=False)