In [1]:
import json
import pandas as pd
on_hold_targets = json.load(open("on-hold-targets.json"))
classified_entities = json.load(open("classified-entities.json"))
filtered_targets_1 = json.load(open("filtered-targets-1.json"))
analyzed_searches = json.load(open("analyzed-searches.json"))
classified_targets = json.load(open("classified-targets.json"))

all_targets = {}
for target0 in on_hold_targets["grouped"]["pdb_id"]["groups"]:        
    chains = target0["doclist"]["docs"]
    target = chains[0]  # most target info is stored with the chain
    pdb_code = target["pdb_id"]
    date = target.get("deposition_date")    
    if date is not None:
        date = date.split("T")[0]
        # TODO: expiry date if not HPUB (I have no example to work with)
    curr_target = {
        "pdb_code": pdb_code,
        "title": target.get("title"),
        "authors": target.get("entry_author_list"),
        "date": date,
        "status": target["status"]
    }
    # Unfortunately, Uniprot is not stored in the result...
    all_targets[pdb_code] = curr_target

prot_peptide_targets = {}
prot_peptide_chains = []

targets = {}
target_chains = []
target_templates = []

peptide_rejections = ("Target contains large peptide", "Small peptide")
for pdb_code, target in  filtered_targets_1.items():    
    assert pdb_code in all_targets, pdb_code
    rejection = target.get("rejection")
    prot_peptide = False
    if rejection is None:
        t = all_targets[pdb_code]
        t["classification"] = classified_targets[pdb_code]["classification"]
        t["rejection"] = classified_targets[pdb_code].get("rejection")
        targets[pdb_code] = t        
    else:
        all_targets["rejection"] = rejection
        if rejection in peptide_rejections:
            prot_peptide_targets[pdb_code] = all_targets[pdb_code]
            prot_peptide = True
    if rejection is not None and not prot_peptide:
        continue
    for entity in target:
        if not entity.startswith(pdb_code + "_"):
            continue
        chain = target[entity]
        templates, coverage = [], "?"
        if not prot_peptide and chain["classification"] == "protein":
            templates = analyzed_searches[pdb_code][entity].get("matches", [])
            coverage = classified_targets[pdb_code]["coverage"][entity]
        classification = chain["classification"]
        curr_chain = {
            "pdb_code": pdb_code,
            "entity": entity,
            "classification": classification,
            "sequence": chain["sequence"],            
            "uniprot": None,   # unfortunately, not stored...,
            "coverage": coverage,
            "templates": len(templates),
        }
        if prot_peptide:
            curr_chain.pop("templates")
            curr_chain.pop("coverage")
            prot_peptide_chains.append(curr_chain)
        else:
            target_chains.append(curr_chain)
            for template in templates:
                tmpl = json.loads(template)
                tmpl["target"] = pdb_code
                tmpl["target_entity"] = entity
                target_templates.append(tmpl)



KeyError: '6kf1'

# Protein-protein targets

In [None]:
targets = pd.DataFrame(targets.values())
targets.to_csv("targets.csv")
targets

# Protein-protein target chains

In [2]:
target_chains = pd.DataFrame.from_dict(target_chains)
target_chains.to_csv("target_chains.csv")
target_chains

Unnamed: 0,classification,coverage,entity,pdb_code,sequence,templates,uniprot
0,protein,contiguous,6hna_1,6hna,MAHHHHHHVDDDDKDLFNKNKKLDADLLKTLDNLLKTLDNNQKQAL...,27,
1,protein,bad,6hna_2,6hna,AGIQNDSTGKCGPPPPIDNGDITSFPLSVYAPASSVEYQCQNLYQL...,25,
2,protein,complete,6iks_1,6iks,VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF...,292,
3,protein,complete,6iks_2,6iks,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,304,
4,protein,complete,6ikv_1,6ikv,VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF...,292,
5,protein,complete,6ikv_2,6ikv,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,304,
6,protein,complete,6il5_1,6il5,VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF...,292,
7,protein,complete,6il5_2,6il5,VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFG...,304,
8,protein,complete,6kdh_1,6kdh,DVLMTQTPLSLPVSLGDQASISCRSSQNIVHSXGYTYLEWYLQKPG...,2678,
9,protein,complete,6kdh_2,6kdh,EVQLQQSGTVLARPGASVKMSCKASGYTFTNYWMHWIKQRPGQGLE...,2969,


# Templates for protein-protein target chains

In [3]:
target_templates = pd.DataFrame(target_templates)
target_templates.to_csv("target_templates.csv")
target_templates

Unnamed: 0,pdb,pdb_chain,query_region,seqid,target,target_entity,target_region
0,3zm7,3zm7_A,"[1, 25]",68.000000,6hna,6hna_1,"[1, 25]"
1,3qz0,3qz0_A,"[1, 37]",56.756757,6hna,6hna_1,"[1, 39]"
2,3h87,3h87_A,"[1, 31]",54.838710,6hna,6hna_1,"[1, 31]"
3,5vix,5vix_A,"[1, 70]",34.285714,6hna,6hna_1,"[1, 75]"
4,3ayr,3ayr_A,"[1, 53]",46.153846,6hna,6hna_1,"[1, 52]"
5,3ays,3ays_A,"[1, 53]",46.153846,6hna,6hna_1,"[1, 52]"
6,4cpg,4cpg_A,"[1, 33]",61.290323,6hna,6hna_1,"[1, 31]"
7,4lcd,4lcd_E,"[1, 70]",32.857143,6hna,6hna_1,"[1, 75]"
8,4lg2,4lg2_A,"[1, 21]",76.190476,6hna,6hna_1,"[1, 22]"
9,5h9c,5h9c_A,"[1, 28]",67.857143,6hna,6hna_1,"[1, 31]"


In [4]:
import qgrid
target_templates2 = qgrid.show_grid(target_templates)
target_templates2

# Protein-peptide targets

In [5]:
prot_peptide_targets = pd.DataFrame(prot_peptide_targets.values())
prot_peptide_targets.to_csv("prot_peptide_targets.csv")
prot_peptide_targets

Unnamed: 0,authors,date,pdb_code,status,title
0,"[Zhang, H, Wang, Z]",2018-11-23,6itm,HPUB,Crystal structure of FXR in complex with agoni...


In [6]:
prot_peptide_chains = pd.DataFrame.from_dict(prot_peptide_chains)
prot_peptide_chains.to_csv("prot_peptide_chains.csv")
prot_peptide_chains

Unnamed: 0,classification,entity,pdb_code,sequence,uniprot
0,protein,6itm_1,6itm,MGHHHHHHGSTELTPDQQTLLHFIMDSYNKQRMPQEITNKILKEAF...,
1,large peptide,6itm_2,6itm,KDHQLLRYLLDKDE,
