In [1]:
import json
import pandas as pd
on_hold_targets = json.load(open("on-hold-targets.json"))
classified_entities = json.load(open("classified-entities.json"))
filtered_targets_1 = json.load(open("filtered-targets-1.json"))
analyzed_searches = json.load(open("analyzed-searches.json"))
classified_targets = json.load(open("classified-targets.json"))

all_targets = {}
for target0 in on_hold_targets["grouped"]["pdb_id"]["groups"]:        
    chains = target0["doclist"]["docs"]
    target = chains[0]  # most target info is stored with the chain
    pdb_code = target["pdb_id"]
    date = target.get("deposition_date")    
    if date is not None:
        date = date.split("T")[0]
        # TODO: expiry date if not HPUB (I have no example to work with)
    curr_target = {
        "pdb_code": pdb_code,
        "title": target.get("title"),
        "authors": target.get("entry_author_list"),
        "date": date,
        "status": target["status"]
    }
    # Unfortunately, Uniprot is not stored in the result...
    all_targets[pdb_code] = curr_target

prot_peptide_targets = {}
prot_peptide_chains = []

targets = {}
target_chains = []
target_templates = []

peptide_rejections = ("Target contains large peptide", "Small peptide")
for pdb_code, target in  filtered_targets_1.items():    
    assert pdb_code in all_targets, pdb_code
    rejection = target.get("rejection")
    prot_peptide = False
    if rejection is None:
        t = all_targets[pdb_code]
        t["classification"] = classified_targets[pdb_code]["classification"]
        t["rejection"] = classified_targets[pdb_code].get("rejection")
        targets[pdb_code] = t        
    else:
        all_targets["rejection"] = rejection
        if rejection in peptide_rejections:
            prot_peptide_targets[pdb_code] = all_targets[pdb_code]
            prot_peptide = True
    if rejection is not None and not prot_peptide:
        continue
    for entity in target:
        if not entity.startswith(pdb_code + "_"):
            continue
        chain = target[entity]
        templates, coverage = [], "?"
        if not prot_peptide and chain["classification"] == "protein":
            templates = analyzed_searches[pdb_code][entity].get("matches", [])
            coverage = classified_targets[pdb_code]["coverage"][entity]
        classification = chain["classification"]
        curr_chain = {
            "pdb_code": pdb_code,
            "entity": entity,
            "classification": classification,
            "sequence": chain["sequence"],            
            "uniprot": None,   # unfortunately, not stored...,
            "coverage": coverage,
            "templates": len(templates),
        }
        if prot_peptide:
            curr_chain.pop("templates")
            curr_chain.pop("coverage")
            prot_peptide_chains.append(curr_chain)
        else:
            target_chains.append(curr_chain)
            for template in templates:
                tmpl = json.loads(template)
                tmpl["target"] = pdb_code
                tmpl["target_entity"] = entity
                target_templates.append(tmpl)



# Protein-protein targets

In [2]:
targets = pd.DataFrame(targets.values())
targets.to_csv("targets.csv")
targets

Unnamed: 0,authors,classification,date,pdb_code,rejection,status,title
0,,rejected,2018-09-14,6hna,Unbound coverage is bad for one or more entities,HPUB,
1,"[Sato-Tomita, A, Park, S-Y, Shibayama, N]",rejected,2018-10-16,6iks,"Trivial templates: ['1bz1', '1ch4', '5ker']",HPUB,Carbonmonoxy human hemoglobin A at 140 K: Lase...
2,"[Sato-Tomita, A, Park, S-Y, Shibayama, N]",rejected,2018-10-16,6ikv,"Trivial templates: ['1bz1', '1ch4', '5ker']",HPUB,Carbonmonoxy human hemoglobin A at 140 K: Lase...
3,"[Sato-Tomita, A, Park, S-Y, Shibayama, N]",rejected,2018-10-17,6il5,"Trivial templates: ['1bz1', '1ch4', '5ker']",HPUB,Carbonmonoxy human hemoglobin A at 140 K: Lase...
4,"[Lin, Z, Yang, Z, Ji, Z, Zhang, M]",rejected,2019-02-27,6jjw,Unbound coverage is bad for one or more entities,HPUB,Crystal Structure of KIBRA and PTPN14 complex
5,"[Yokoyama, H, Mizutani, R, Noguchi, S, Hayashi...",rejected,2019-07-02,6kdh,Trivial template: 3ld8,HPUB,Antibody 64M-5 Fab including isoAsp in ligand-...
6,"[Yokoyama, H, Mizutani, R, Noguchi, S, Hayashi...",rejected,2019-07-02,6kdi,Trivial template: 3ld8,HPUB,Antibody 64M-5 Fab including isoAsp in complex...
7,"[Jialiang, W, Zhijun, W]",rejected,2019-07-03,6ke7,Unbound coverage is bad for one or more entities,HPUB,LovBC
8,"[Phillips, RS]",rejected,2018-09-27,6mls,Trivial template: 2tpl,HPUB,Citrobacter freundii tyrosine phenol-lyase com...
9,"[Phillips, RS]",rejected,2018-10-04,6mo3,Trivial template: 2tpl,HPUB,Citrobacter freundii tyrosine phenol-lyase com...


# Protein-protein target chains

In [3]:
target_chains = pd.DataFrame.from_dict(target_chains)
target_chains.to_csv("target_chains.csv")
target_chains

Unnamed: 0,classification,coverage,entity,pdb_code,sequence,templates,uniprot
0,protein,contiguous,6hna_1,6hna,MAHHHHHHVDDDDKDLFNKNKKLDADLLKTLDNLLKTLDNNQKQAL...,27,
1,protein,bad,6hna_2,6hna,AGIQNDSTGKCGPPPPIDNGDITSFPLSVYAPASSVEYQCQNLYQL...,25,
2,protein,complete,6iks_1,6iks,VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF...,292,
3,protein,complete,6iks_2,6iks,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,304,
4,protein,complete,6ikv_1,6ikv,VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF...,292,
5,protein,complete,6ikv_2,6ikv,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,304,
6,protein,complete,6il5_1,6il5,VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF...,292,
7,protein,complete,6il5_2,6il5,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,304,
8,protein,complete,6jjw_1,6jjw,GPGSEFELPLPEGWEEARDFDGKVYYIDHRNRTTSWIDPRDRYTKP...,57,
9,protein,bad,6jjw_2,6jjw,GPGSSHRHSAIIVPSYRPTPDYETVMRQMKRG,0,


# Templates for protein-protein target chains

In [4]:
target_templates = pd.DataFrame(target_templates)
target_templates.to_csv("target_templates.csv")
target_templates

Unnamed: 0,pdb,pdb_chain,query_region,seqid,target,target_entity,target_region
0,3zm7,3zm7_A,"[1, 25]",68.000000,6hna,6hna_1,"[1, 25]"
1,3qz0,3qz0_A,"[1, 37]",56.756757,6hna,6hna_1,"[1, 39]"
2,3h87,3h87_A,"[1, 31]",54.838710,6hna,6hna_1,"[1, 31]"
3,5vix,5vix_A,"[1, 70]",34.285714,6hna,6hna_1,"[1, 75]"
4,3ayr,3ayr_A,"[1, 53]",46.153846,6hna,6hna_1,"[1, 52]"
5,3ays,3ays_A,"[1, 53]",46.153846,6hna,6hna_1,"[1, 52]"
6,4cpg,4cpg_A,"[1, 33]",61.290323,6hna,6hna_1,"[1, 31]"
7,4lcd,4lcd_E,"[1, 70]",32.857143,6hna,6hna_1,"[1, 75]"
8,4lg2,4lg2_A,"[1, 21]",76.190476,6hna,6hna_1,"[1, 22]"
9,5h9c,5h9c_A,"[1, 28]",67.857143,6hna,6hna_1,"[1, 31]"


In [5]:
import qgrid
target_templates2 = qgrid.show_grid(target_templates)
target_templates2

# Protein-peptide targets

In [6]:
prot_peptide_targets = pd.DataFrame(prot_peptide_targets.values())
prot_peptide_targets.to_csv("prot_peptide_targets.csv")
prot_peptide_targets

Unnamed: 0,authors,date,pdb_code,status,title
0,"[Zhang, H, Wang, Z]",2018-11-23,6itm,HPUB,Crystal structure of FXR in complex with agoni...
1,"[Lin, Z, Yang, Z, Ji, Z, Zhang, M]",2019-02-27,6jjx,HPUB,Crystal Structure of KIBRA and Angiomotin complex
2,"[Lin, Z, Yang, Z, Ji, Z, Zhang, M]",2019-02-27,6jjy,HPUB,Crystal Structure of KIBRA and beta-Dystroglycan
3,"[Sandner, A, Heine, A, Klebe, G]",2019-05-13,6rot,HPUB,Thrombin in complex with MI2105
4,,2019-08-14,6sk7,HPUB,
5,"[Madej, M, Ranson, NA, White, JBR]",2019-08-19,6sli,HPUB,Structure of the RagAB peptide transporter
6,"[Madej, M, Ranson, NA, White, JBR]",2019-08-20,6slj,HPUB,Structure of the RagAB peptide transporter
7,"[Madej, M, Ranson, NA, White, JBR]",2019-08-20,6sln,HPUB,Structure of the RagAB peptide transporter
8,"[Ultsch, MH, Kirchhofer, D]",2019-08-19,6u2f,HPUB,Complex of PCSK9-fab 7G7 bound to compound 16


In [7]:
prot_peptide_chains = pd.DataFrame.from_dict(prot_peptide_chains)
prot_peptide_chains.to_csv("prot_peptide_chains.csv")
prot_peptide_chains

Unnamed: 0,classification,entity,pdb_code,sequence,uniprot
0,protein,6itm_1,6itm,MGHHHHHHGSTELTPDQQTLLHFIMDSYNKQRMPQEITNKILKEAF...,
1,large peptide,6itm_2,6itm,KDHQLLRYLLDKDE,
2,protein,6jjx_1,6jjx,GPGSEFELPLPEGWEEARDFDGKVYYIDHRNRTTSWIDPRDRYTKP...,
3,large peptide,6jjx_2,6jjx,GPGSGRTEGQLMRYQHPPEYGAARPA,
4,protein,6jjy_1,6jjy,GPGSEFELPLPEGWEEARDFDGKVYYIDHRNRTTSWIDPRDRYTKP...,
5,large peptide,6jjy_2,6jjy,GPGSRPKNMTPYRSPPPYVPP,
6,protein,6rot_1,6rot,TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR,
7,protein,6rot_2,6rot,IVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLL...,
8,large peptide,6rot_3,6rot,GDFEEIPEEXLQ,
9,protein,6sk7_1,6sk7,NPVENYIDSVLNEVLVVPNIQPSTSVSSHAAPALDAAETGHTSSVQ...,
