In [41]:
import pandas as pd

In [42]:
# Load the raw dataset from the notebook's input directory.
df = pd.read_csv(filepath_or_buffer="../input/osfp-full-data-set.csv")

In [43]:
# Map the original column headers to the shorter names used in the rest of the notebook.
column_aliases = {
    "AminoAcidSequence": "sequence",
    "OligomericState": "monomer_state",
}
df = df.rename(columns=column_aliases)

#### Encode the response as binary (monomer = 1, dimer/tetramer = 0)

In [44]:
# Encode the oligomeric state as a binary label: 1 = monomer, 0 = dimer/tetramer.
# Both masks are computed from a text snapshot of the column *before* anything is
# written back, so the result never depends on `.str` skipping integers that an
# earlier assignment already placed in a mixed-type column.
state_raw = df["monomer_state"]
state_text = state_raw.astype(str)  # missing values end up matching no pattern
is_oligomer = state_text.str.contains("Dimer|Tetramer", case=False, na=False)
# Dimer/tetramer takes precedence: a label mentioning both kinds is encoded as 0.
is_monomer = state_text.str.contains("Monomer", case=False, na=False) & ~is_oligomer

# Rows matching neither pattern keep their original value so they stay visible
# in the value counts instead of being silently coerced.
state_encoded = state_raw.astype(object).mask(is_oligomer, 0).mask(is_monomer, 1)
if (is_oligomer | is_monomer).all():
    # Every row received a label, so the column can be a proper integer column.
    state_encoded = state_encoded.astype("int64")
df["monomer_state"] = state_encoded

#### Verify that the residues are canonical

In [45]:
def is_aa_sequence(sequence):
    """Return True if `sequence` is made up only of the 20 canonical amino acids.

    Leading/trailing whitespace is ignored and the comparison is
    case-insensitive. Whitespace *inside* the sequence is not removed and
    therefore counts as a non-canonical character.

    Parameters
    ----------
    sequence : str
        One-letter amino-acid sequence. Any non-string value (for example the
        NaN pandas uses for a missing cell, or None) is treated as "not a
        sequence" rather than raising.

    Returns
    -------
    bool
        True when the sequence is non-empty and every residue is one of
        ``ACDEFGHIKLMNPQRSTVWY``; False otherwise.
    """
    # Missing values arrive as float NaN / None when the function is applied
    # to a DataFrame column; they have no .strip(), so reject them up front.
    if not isinstance(sequence, str):
        return False

    residues = set(sequence.strip().upper())
    # An empty (or whitespace-only) string is not a valid sequence.
    if not residues:
        return False

    # Canonical iff the set of distinct residues is a subset of the alphabet.
    return residues <= set("ACDEFGHIKLMNPQRSTVWY")

In [46]:
# Peek at the first raw sequence (row label 0) to inspect its formatting.
df.loc[0, "sequence"]

'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFAYGVQCFSRYPDHMKQ\r\nHDFFKSAMPEGYVQERTIFYKDDGNYKSRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKMEYNYNSHNVYIMADKQKNG\r\nIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMILLEFVTAAGITHGMDELYK'

In [47]:
# Remove the carriage-return / line-feed characters embedded in the sequences.
line_break_pattern = r"[\r\n]"
df["sequence"] = df["sequence"].str.replace(line_break_pattern, "", regex=True)

In [48]:
# Flag each sequence according to whether it passes the canonical-residue check.
df["is_canon"] = df["sequence"].map(is_aa_sequence)

In [49]:
# Tally how many sequences passed / failed the canonical-residue check.
canon_flags = df["is_canon"]
canon_flags.value_counts()

True    409
Name: is_canon, dtype: int64

#### Class balance of the response

In [50]:
# Count the rows in each response class to check how balanced the labels are.
response_labels = df["monomer_state"]
response_labels.value_counts()

1    207
0    202
Name: monomer_state, dtype: int64

In [51]:
# Keep only the two columns that are exported; every other column is dropped.
export_columns = ["sequence", "monomer_state"]
df = df.loc[:, export_columns]

#### Export dataset

In [52]:
# Write the cleaned dataset to disk without the row index.
df.to_csv(path_or_buf="../results/dataset_fp.csv", index=False)