Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw assay export. header=[0, 4] builds a two-level column header from
# file rows 0 and 4; the rows in between are skipped by pandas.
# skipinitialspace=True ignores spaces that directly follow a delimiter.
cyto_assay = pd.read_csv(
    'data/AID_1851_datatable_all.csv',
    header=[0, 4],
    skipinitialspace=True,
)

In [3]:
# Keep only the top level of the two-level column header.
# get_level_values(0) yields the same flat Index as droplevel(1) on a two-level
# header, but is also valid on an already-flat Index, so re-running this cell
# is a no-op instead of raising.
cyto_assay.columns = cyto_assay.columns.get_level_values(0)

In [4]:
# Encode the True/False 'Inhibition Observed' flag as 1/0 integers.
# Values absent from the mapping become NaN, which makes astype(int) raise.
cyto_assay['Inhibition Observed'] = (
    cyto_assay['Inhibition Observed']
    .map({True:1, False:0})
    .astype(int)
)

In [5]:
# Sanity check: (rows, columns) of the full assay table after loading.
cyto_assay.shape

(85715, 44)

In [6]:
# Rows of the assay table whose 'Panel Name' is 'p450-cyp2c19'; print (rows, columns).
cyp2c19 = cyto_assay.loc[cyto_assay['Panel Name'].eq('p450-cyp2c19')]
print(cyp2c19.shape)
# cyp2c19.to_csv()

(17143, 44)


In [7]:
# Rows of the assay table whose 'Panel Name' is 'p450-cyp2c9'; print (rows, columns).
cyp2c9 = cyto_assay.loc[cyto_assay['Panel Name'].eq('p450-cyp2c9')]
print(cyp2c9.shape)

(17143, 44)


In [8]:
# Rows of the assay table whose 'Panel Name' is 'p450-cyp1a2'; print (rows, columns).
cyp1a2 = cyto_assay.loc[cyto_assay['Panel Name'].eq('p450-cyp1a2')]
print(cyp1a2.shape)

(17143, 44)


In [9]:
# Rows of the assay table whose 'Panel Name' is 'p450-cyp2d6'; print (rows, columns).
cyp2d6 = cyto_assay.loc[cyto_assay['Panel Name'].eq('p450-cyp2d6')]
print(cyp2d6.shape)

(17143, 44)


In [10]:
# Rows of the assay table whose 'Panel Name' is 'p450-cyp3a4'; print (rows, columns).
cyp3a4 = cyto_assay.loc[cyto_assay['Panel Name'].eq('p450-cyp3a4')]
print(cyp3a4.shape)

(17143, 44)


In [11]:
# Display the p450-cyp3a4 subset for a visual check (the notebook repr is abbreviated).
cyp3a4

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Inhibition Observed,Approved Drug,Collection,...,Activity at 11.43 uM,Activity at 25.56 uM,Activity at 57.14 uM,Activity at 0.00164 uM,Activity at 0.00366 uM,Activity at 0.00818 uM,Compound QC,Panel ID,Panel Name,Panel Target
2,3,842238,6602638.0,Inactive,,,,1,Biodiverse,,...,0.4027,,-2.2935,,-1.3363,,QC'd by DPISMR,3,p450-cyp3a4,NP_059488.2
7,8,842250,644510.0,Active,,,,1,Biodiverse,,...,-82.0127,,-97.4718,,-21.1766,,QC'd by DPISMR,3,p450-cyp3a4,NP_059488.2
12,13,842319,1960010.0,Inconclusive,,,,1,Biodiverse,,...,-52.8656,,-52.7402,,2.5996,,QC'd by DPISMR,3,p450-cyp3a4,NP_059488.2
17,18,842408,644675.0,Active,,,,1,Biodiverse,,...,-47.8371,,-94.7622,,5.1962,,QC'd by DPISMR,3,p450-cyp3a4,NP_059488.2
22,23,842584,644851.0,Inconclusive,,,,1,Biodiverse,,...,-15.4369,,-42.1057,,-5.2084,,QC'd by DPISMR,3,p450-cyp3a4,NP_059488.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85692,85693,26751437,16758815.0,Inactive,,,,1,Exploratory,,...,-4.0397,,,,-2.2843,,QC'd by BUCMLD,3,p450-cyp3a4,NP_059488.2
85697,85698,26751438,16758816.0,Inactive,,,,0,Exploratory,,...,-21.2044,,,,-5.3018,,QC'd by BUCMLD,3,p450-cyp3a4,NP_059488.2
85702,85703,26751439,16758817.0,Inactive,,,,1,Exploratory,,...,-21.3967,,,,-2.2867,,QC'd by BUCMLD,3,p450-cyp3a4,NP_059488.2
85707,85708,26751440,16758818.0,Inactive,,,,0,Exploratory,,...,-18.1157,,,,-11.3235,,QC'd by BUCMLD,3,p450-cyp3a4,NP_059488.2


In [12]:
# Inspect the PUBCHEM_SID column of the full (all-panel) table.
cyto_assay['PUBCHEM_SID']

0          842238
1          842238
2          842238
3          842238
4          842238
           ...   
85710    26751441
85711    26751441
85712    26751441
85713    26751441
85714    26751441
Name: PUBCHEM_SID, Length: 85715, dtype: int64

In [13]:
# NOTE(review): despite its name, this variable holds the PUBCHEM_SID column, not PUBCHEM_CID.
pubchemcid = cyto_assay['PUBCHEM_SID']

In [14]:
# Write the SID column on its own (no index column) to data/pubchemSID.csv.
# NOTE(review): the column is not de-duplicated first, so every row of cyto_assay
# contributes an SID and repeated SIDs are written repeatedly; confirm this is intended.
pubchemcid.to_csv('data/pubchemSID.csv', index=False)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# From https://github.com/comp-pharm/ml-psychs/blob/main/src/data/make_dataset.py
def make_data(
    datatable_path="data/AID_1851_datatable_all.csv",
    smiles_path="data/SIDtoSMILES.txt",
    output_path="data/SMILES_to_Activity.csv",
    panel_name=None,
):
    """Build the SMILES -> activity-outcome table and write it to CSV.

    The assay datatable is joined to the SID-to-SMILES file on ``PUBCHEM_SID``,
    rows with any missing value are dropped, and the result is written with the
    columns ``SMILES`` and ``PUBCHEM_ACTIVITY_OUTCOME``.

    Args:
        datatable_path: CSV export of the assay datatable. File rows 1-4 are
            skipped so that row 0 is the only header.
        smiles_path: Tab-separated, header-less file of ``PUBCHEM_SID`` and
            ``SMILES`` columns.
        output_path: Destination CSV (written without an index column).
        panel_name: If given, only datatable rows whose ``Panel Name`` equals
            this value are used. The default ``None`` keeps every row, which is
            the original behaviour. The notebook's datatable repeats each SID
            once per panel, so leaving this unset can emit the same SMILES
            several times.

    Returns:
        None. The result is only written to ``output_path``.

    NOTE(review): only "Inactive" and "Active" are recoded to 0/1. Any other
    outcome label (the datatable also contains "Inconclusive") is written
    through unchanged; confirm whether those rows should be dropped.
    """
    # Load the SID to Activity data
    datatable = pd.read_csv(datatable_path, skiprows=[1, 2, 3, 4])
    if panel_name is not None:
        datatable = datatable[datatable["Panel Name"] == panel_name]
    sid_to_outcome = datatable[["PUBCHEM_SID", "PUBCHEM_ACTIVITY_OUTCOME"]].replace(
        {"PUBCHEM_ACTIVITY_OUTCOME": {"Inactive": 0, "Active": 1}}
    )

    # Load the SID to SMILES data
    smiles = pd.read_csv(smiles_path, sep="\t", names=["PUBCHEM_SID", "SMILES"])

    # Join on SID, drop incomplete rows (e.g. an SID with no SMILES string), and
    # keep only the two output columns in their final order (the SID is no longer needed).
    data = (
        pd.merge(sid_to_outcome, smiles, how="inner", on="PUBCHEM_SID")
        .dropna()
        .loc[:, ["SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]]
    )

    data.to_csv(output_path, index=False)

In [20]:
def split_data(random_state=42):
    """Split the prepared SMILES/activity table into train, validation and test CSVs.

    Reads ``data/SMILES_to_Activity.csv`` and makes two splits, each stratified
    on ``PUBCHEM_ACTIVITY_OUTCOME``: 20% of the rows are held out first, then
    that hold-out is halved into validation and test sets (roughly 80/10/10
    overall). The three parts are written to ``data/train_SMILES_to_Activity.csv``,
    ``data/val_SMILES_to_Activity.csv`` and ``data/test_SMILES_to_Activity.csv``.

    Args:
        random_state: Seed passed to both ``train_test_split`` calls so the
            partition is the same on every run. Pass ``None`` for the previous
            unseeded behaviour.

    Returns:
        None. The splits are only written to disk.
    """
    # Load the prepared data
    data = pd.read_csv("data/SMILES_to_Activity.csv")
    X_train, X_testval, y_train, y_testval = train_test_split(
        data["SMILES"],
        data["PUBCHEM_ACTIVITY_OUTCOME"],
        stratify=data["PUBCHEM_ACTIVITY_OUTCOME"],
        test_size=0.2,
        random_state=random_state,
    )
    # Halve the 20% hold-out into validation and test, again stratified on the label.
    X_val, X_test, y_val, y_test = train_test_split(
        X_testval,
        y_testval,
        stratify=y_testval,
        test_size=0.5,
        random_state=random_state,
    )

    train_data = pd.DataFrame({"SMILES": X_train, "PUBCHEM_ACTIVITY_OUTCOME": y_train})
    val_data = pd.DataFrame({"SMILES": X_val, "PUBCHEM_ACTIVITY_OUTCOME": y_val})
    test_data = pd.DataFrame({"SMILES": X_test, "PUBCHEM_ACTIVITY_OUTCOME": y_test})

    train_data.to_csv("data/train_SMILES_to_Activity.csv", index=False)
    val_data.to_csv("data/val_SMILES_to_Activity.csv", index=False)
    test_data.to_csv("data/test_SMILES_to_Activity.csv", index=False)


# Script-style entry point, presumably carried over from the make_dataset.py this was adapted from.
# NOTE(review): when this cell runs in a notebook kernel, __name__ is "__main__",
# so both functions run here as well as in the later cells that call them directly.
if __name__ == "__main__":
    make_data()
    split_data()

In [22]:
# Build data/SMILES_to_Activity.csv from the assay datatable and the SID-to-SMILES file.
make_data()

In [23]:
# Write the train/val/test CSVs derived from data/SMILES_to_Activity.csv.
split_data()

In [None]:
# Notes from Ian
# Once you have SMILES mapped to the true/false classification, you're almost there.
# From the SMILES you generate features. Use RDKit for that: either fingerprinting or something else.
# Then feed the features and classification labels into a model of your choice and start testing accuracy!