# Dataset

> A collection of datasets

In [1]:
#| default_exp dataset

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
from fastbook import *

In [4]:
#| export
class Data:
    """
    A class for fetching various datasets.
    """

    ANTIBIOTICS_URL = "https://github.com/sky1ove/tools/raw/main/dataset/antibiotics_2335.csv"
    G12D_URL = "https://github.com/sky1ove/tools/raw/main/dataset/KRASi_g12d.csv"
    G12D_IC50_URL = "https://github.com/sky1ove/tools/raw/main/dataset/dedup_IC50.csv"
    KSEQ_URL = "https://github.com/sky1ove/tools/raw/main/dataset/kras_seq.csv"

    def __init__(self):
        pass
    
    @staticmethod
    def get_antibiotics():
        """
        Fetches the deduplicated dataset from the cell paper: 
        A Deep Learning Approach to Antibiotic Discovery.
        """
        df = pd.read_csv(Data.ANTIBIOTICS_URL)
        return df
    
    @staticmethod
    def get_g12d():
        """
        Fetches the G12D dataset from the paper and patents.
        """
        df = pd.read_csv(Data.G12D_URL)
        return df
    
    @staticmethod
    def get_g12d_IC50():
        """
        Fetches the deduplicated IC50 G12D dataset from the paper and patents.
        """
        df = pd.read_csv(Data.G12D_IC50_URL)
        return df
    
    @staticmethod
    def get_kseq():
        """
        Fetches the sequence of KRAS and its mutations G12D and G12C.
        """
        df = pd.read_csv(Data.KSEQ_URL)
        return df

In [5]:
show_doc(Data.get_antibiotics)

---

[source](https://github.com/sky1ove/tools/blob/main/tools/dataset.py#L23){target="_blank" style="float:right; font-size:smaller"}

### Data.get_antibiotics

>      Data.get_antibiotics ()

Fetches the deduplicated dataset from the cell paper: 
A Deep Learning Approach to Antibiotic Discovery.

In [6]:
Data.get_antibiotics()

Unnamed: 0,name,SMILES,inhibition,activity
0,CEFPIRAMIDE,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc4nnnn4C)CSC23)c2ccc(O)cc2)cn1,0.041572,1
1,GEMIFLOXACIN MESYLATE,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC1CN.CS(=O)(=O)O,0.041876,1
2,POLYMYXIN B SULFATE,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O)NC1CCNC(=O)C(C(C)O)NC(=O)C(CCN)NC(=O)C(CCN)NC(=O)C(CC(C)C)NC(=O)C(Cc2ccccc2)NC(=O)C(CCN)NC1=O)C(C)O.O=S(=O)(O)O,0.041916,1
3,PRAXADINE HYDROCHLORIDE,Cl.N=C(N)n1cccn1,0.041964,1
4,CHLORHEXIDINE DIHYDROCHLORIDE,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(=N)Nc1ccc(Cl)cc1,0.042295,1
...,...,...,...,...
2330,COLFORSIN,C=CC1(C)CC(=O)C2(O)C(C)(O1)C(OC(C)=O)C(O)C1C(C)(C)CCC(O)C12C,1.235350,0
2331,ANAZOLENE SODIUM,O=S(=O)([O-])c1cc(O)c2c(N=Nc3ccc(Nc4ccccc4)c4c(S(=O)(=O)[O-])cccc34)cc(S(=O)(=O)[O-])cc2c1.[Na+].[Na+].[Na+],1.251650,0
2332,DAPSONE,Nc1ccc(S(=O)(=O)c2ccc(N)cc2)cc1,1.273150,0
2333,EVANS BLUE,Cc1cc(-c2ccc(N=Nc3ccc4c(S(=O)(=O)[O-])cc(S(=O)(=O)[O-])c(N)c4c3O)c(C)c2)ccc1N=Nc1ccc2c(S(=O)(=O)[O-])cc(S(=O)(=O)[O-])c(N)c2c1O.[Na+].[Na+].[Na+].[Na+],2.263200,0


In [7]:
show_doc(Data.get_g12d)

---

[source](https://github.com/sky1ove/tools/blob/main/tools/dataset.py#L32){target="_blank" style="float:right; font-size:smaller"}

### Data.get_g12d

>      Data.get_g12d ()

Fetches the G12D dataset from the paper and patents.

In [8]:
Data.get_g12d()

Unnamed: 0,ID,SMILES,group,with_3F,racemic_trans,mixture_isomer,trans,Kd,IC50,erk_IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)-c1cc(O)cc2ccccc12,US,0,0,0,0,97.7,124.7,3159.1
1,US_2,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,US,1,0,0,0,2.4,2.7,721.4
2,US_3,Cn1ccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,US,0,0,0,0,8.3,9.5,10283.1
3,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,US,0,0,0,0,155.7,496.2,8530.0
4,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,US,0,0,0,0,294.8,722.9,8193.8
...,...,...,...,...,...,...,...,...,...,...
717,paper_34,FC1=C(C2=C(C=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,paper,0,0,0,0,,27.0,370.0
718,paper_35,FC1=C(C2=C(C(F)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,paper,0,0,0,0,,7.0,97.0
719,paper_36,FC1=C(C2=C(C(C#C)=C(F)C=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,paper,0,0,0,0,,2.0,24.0
720,paper_37,FC1=C(C2=C(C(Cl)=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,paper,0,0,0,0,,2.0,63.0


In [12]:
show_doc(Data.get_g12d_IC50)

---

### Data.get_g12d_IC50

>      Data.get_g12d_IC50 ()

Fetches the deduplicated IC50 G12D dataset from the paper and patents.

In [13]:
Data.get_g12d_IC50()

Unnamed: 0,ID,SMILES,IC50
0,US_1,CN1CCC[C@H]1COc1nc(N2CC3CCC(C2)N3)c2cnc(cc2n1)-c1cc(O)cc2ccccc12,124.70
1,US_4,Oc1cc(-c2ncc3c(nc(OCCc4ccccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,496.20
2,US_5,Cn1nccc1COc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,722.90
3,US_6,Cc1cccnc1CCOc1nc(N2CC3CCC(C2)N3)c2cnc(c(F)c2n1)-c1cc(O)cc2ccccc12,434.10
4,US_7,Oc1cc(-c2ncc3c(nc(OCCc4ncccn4)nc3c2F)N2CC3CCC(C2)N3)c2ccccc2c1,1867.30
...,...,...,...
646,paper_21,FC1=C(C2=C(C=CC=C3)C3=CC(O)=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,1.25
647,paper_33,FC1=C(C2=C(C(C#C)=CC=C3)C3=CC=C2)N=CC4=C1N=C(OCC56N(CCC6)CCC5)N=C4N7C[C@@H](CC8)N[C@@H]8C7,2.05
648,646_117,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@]23CCCN3C[C@@H](C2)F)CN(CC1)C1=CC(=CC2=CC=CC(=C12)Cl)O,0.60
649,US_468,[C@H]12CN(C[C@H](CC1)N2)C=2C1=C(N=C(N2)OC[C@@]23CCCN3C[C@H](C2)F)C(=C(N=C1)C1=CC(=CC2=CC=CC(=C12)CC)O)F,1.40


In [9]:
show_doc(Data.get_kseq)

---

[source](https://github.com/sky1ove/tools/blob/main/tools/dataset.py#L40){target="_blank" style="float:right; font-size:smaller"}

### Data.get_kseq

>      Data.get_kseq ()

Fetches the sequence of KRAS and its mutations G12D and G12C.

In [10]:
Data.get_kseq()

Unnamed: 0,ID,WT_sequence,g12d_seq,g12c_seq
0,kras_human,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM,MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM,MTEYKLVVVGACGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM
1,kras_human_isoform2b,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKKSKTKCVIM,MTEYKLVVVGADGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKKSKTKCVIM,MTEYKLVVVGACGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKKSKTKCVIM


In [11]:
#| hide
import nbdev; nbdev.nbdev_export()