# Convert LIRICAL's 384 Phenopackets to GA4GH Version 2 Format

In [21]:
import phenopackets as PPKt
import json
import os

### Get all 384 json files

In [2]:
from os import listdir
from os.path import isfile, join
v1dir = "v1phenopackets"
jsonfiles = [f for f in listdir(v1dir) if isfile(join(v1dir, f)) and f.endswith("json")]
print(f"Extracted total of {len(jsonfiles)} V1 Phenopacket V1 files")

Extracted total of 384 V1 Phenopacket V1 files


### Create JSON objects from JSON files

In [15]:
def get_json_object(fname, dirname):
    fpath = join(dirname, fname)
    if not isfile(fpath):
        raise FileNotFoundError(f"Could not find {fpath}")
    with open(fpath, "r") as f:
        json_string = f.read()
        json_dict = json.loads(json_string)
        return json_dict

In [25]:
phenopacketfile = "input/Naz_Villalba-2016-NLRP3-proband.json"
if not os.path.isfile(phenopacketfile):
    raise FileNotFoundError(f"Cound not find {phenopacketfile}")

In [31]:
json1 = jsonfiles[0]
json_dict = get_json_object(json1, v1dir)
print(json_dict)

{'id': 'PMID:27435956-Naz_Villalba-2016-NLRP3-proband', 'subject': {'id': 'proband', 'ageAtCollection': {'age': 'P5Y'}, 'sex': 'FEMALE', 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}}, 'phenotypicFeatures': [{'type': {'id': 'HP:0011227', 'label': 'Elevated C-reactive protein level'}, 'evidence': [{'evidenceCode': {'id': 'ECO:0000033', 'label': 'author statement supported by traceable reference'}, 'reference': {'id': 'PMID:27435956', 'description': 'Muckle-Wells Syndrome: A Case Report with an NLRP3 T348M Mutation'}}]}, {'type': {'id': 'HP:0000509', 'label': 'Conjunctivitis'}, 'negated': True, 'evidence': [{'evidenceCode': {'id': 'ECO:0000033', 'label': 'author statement supported by traceable reference'}, 'reference': {'id': 'PMID:27435956', 'description': 'Muckle-Wells Syndrome: A Case Report with an NLRP3 T348M Mutation'}}]}, {'type': {'id': 'HP:0002516', 'label': 'Increased intracranial pressure'}, 'evidence': [{'evidenceCode': {'id': 'ECO:0000033', 'label': 'author 

In [32]:
for k, v in json_dict.items():
    print(k, type(v))

id <class 'str'>
subject <class 'dict'>
phenotypicFeatures <class 'list'>
genes <class 'list'>
variants <class 'list'>
diseases <class 'list'>
metaData <class 'dict'>


In [33]:
def get_id(json_dict):
    if 'id' not in json_dict:
        raise ValueError("Could not extract id field")
    else:
        return json_dict.get('id')

In [42]:
def get_subject(json_dict):
    if 'subject' not in json_dict:
        raise ValueError("Could not extract id field")
    subject = json_dict.get('subject')
    proband_id = subject.get('id', None)
    ageAtCollection = subject.get('ageAtCollection', None)
    if ageAtCollection is not None:
        iso_age = ageAtCollection.get('age', None)
    else:
        iso_age = None
    sex = subject.get('sex', 'UNKNOWN_SEX')
    return proband_id, iso_age, sex
    

 'evidence': [{'evidenceCode': {'id': 'ECO:0000033', 'label': 'author statement supported by traceable reference'}, 'reference': {'id': 'PMID:27435956', 'description': 'Muckle-Wells Syndrome: A Case Report with an NLRP3 T348M Mutation'}}]}, {'type': {'id': 'HP:0002315', 'label': 'Headache'},

In [85]:
def get_phenotypic_features_list(json_dict):
    if 'phenotypicFeatures' not in json_dict:
        raise ValueError("Could not extract phenotypicFeatures field")
    pfeature_list = []
    for pfeat in json_dict.get('phenotypicFeatures'):
        pfeature = PPKt.PhenotypicFeature()
        ontology_term = pfeat.get('type')
        ontology_id = ontology_term.get('id')
        ontology_label = ontology_term.get('label')
        hpo_term = PPKt.OntologyClass()
        hpo_term.id = ontology_id
        hpo_term.label = ontology_label
        pfeature.type.CopyFrom(hpo_term)
        if 'excluded' in pfeat:
            pfeature.excluded = pfeat.excluded
        evidence_list = pfeat.get('evidence')
        # The 384 v1 phenopackets always have exactly one evidence element
        # do not check here, things will crash if we were wrong
        evidence = evidence_list[0]
        evidence_code = evidence.get('evidenceCode')
        evidence_term = PPKt.OntologyClass()
        evidence_term.id = evidence_code.get('id')
        evidence_term.label = evidence_code.get('label')
        evi = PPKt.Evidence()
        evi.evidence_code.CopyFrom(evidence_term)
        reference_json = evidence.get('reference')
        if reference_json is not None:
            ref = PPKt.ExternalReference()
            ref.id = reference_json.get('id')
            ref.description = reference_json.get('description')
            evi.reference.CopyFrom(ref)
        pfeature.evidence.append(evi)
        pfeature_list.append(pfeature)
    return pfeature_list

<h2>Disease</h2>
<pre>'diseases': [{'term': {'id': 'OMIM:191900', 'label': 'MUCKLE-WELLS SYNDROME; MWS'}}]</pre>

In [132]:
def get_disease(json_dict):
    if not 'diseases' in json_dict:
        raise ValueError("Could not extract disease field")
    disease_list = json_dict.get('diseases')
    if len(disease_list) != 1:
        raise ValueError(f"Got bad number of diseases, {len(disease_list)}")
    disease_object = disease_list[0]
    return disease_object.get('term')

In [133]:
def get_gene(json_dict):
    """
    We expect one and only one gene here
    """
    if not 'genes' in json_dict:
        raise ValueError("Could not extract genes field")
    gene_list = json_dict.get('genes')
    if len(gene_list) != 1:
        raise ValueError(f"Got bad number of genes, {len(gene_list)}")
    return gene_list[0]

<h2>Variants in v1 phenopacket</h2>
<pre>
 'variants': [{'vcfAllele': {'genomeAssembly': 'GRCh37', 'chr': '1', 'pos': 247587794, 'ref': 'C', 'alt': 'T'}, 
    'zygosity': {'id': 'GENO:0000135', 'label': 'heterozygous'}}]
 </pre>

In [134]:
def get_one_ga4gh_interpretation(vcf_allele, zygosity, gene, disease, var_id):
    vdescriptor = PPKt.VariationDescriptor()
    vdescriptor.id = var_id
    vdescriptor.gene_context.value_id = gene.get('id')
    vdescriptor.gene_context.symbol = gene.get('symbol')
    vdescriptor.molecule_context =  PPKt.MoleculeContext.genomic
    vdescriptor.allelic_state.id = zygosity.get('id')
    vdescriptor.allelic_state.label = zygosity.get('label')
    vinterpretation = PPKt.VariantInterpretation() 
    vcf_record = PPKt.VcfRecord()
    vcf_record.genome_assembly =  vcf_allele.get('genomeAssembly')
    vcf_record.chrom = vcf_allele.get('chr')
    vcf_record.pos = vcf_allele.get('pos')
    vcf_record.ref = vcf_allele.get('ref')
    vcf_record.alt = vcf_allele.get('alt')
    vdescriptor.vcf_record.CopyFrom(vcf_record)
    vinterpretation.variation_descriptor.CopyFrom(vdescriptor)
    return vinterpretation

In [135]:
def get_interpretation(json_dict, gene, disease, individual_id):
    if not 'variants' in json_dict:
        raise ValueError("Could not find variants")
    ga4gh_var_list = []
    i = 0
    for v in json_dict.get('variants'):
        if 'vcfAllele' not in v:
            raise ValueError("Malformed variant")
        if 'zygosity' not in v:
            raise ValueError("Malformed variant - no zygosity")
        vcf_allele = v.get('vcfAllele')
        zygosity = v.get('zygosity')
        i += 1
        var_id = f"variant-{i}"
        ga4gh_int = get_one_ga4gh_interpretation(vcf_allele, zygosity, gene, disease, var_id)
        ga4gh_var_list.append(ga4gh_int)
    interpretation = PPKt.Interpretation()
    interpretation.id = "interpretation-id"
    interpretation.progress_status = PPKt.Interpretation.ProgressStatus.SOLVED
    interpretation.diagnosis.disease.id = disease.get('id')
    interpretation.diagnosis.disease.label = disease.get('label')
    for var in ga4gh_var_list:
        genomic_interpretation = PPKt.GenomicInterpretation()
        genomic_interpretation.subject_or_biosample_id = individual_id
        # by assumption, variants passed to this package are all causative
        genomic_interpretation.interpretation_status = PPKt.GenomicInterpretation.InterpretationStatus.CAUSATIVE
        genomic_interpretation.variant_interpretation.CopyFrom(var)
        interpretation.diagnosis.genomic_interpretations.append(genomic_interpretation)
    return interpretation
        
    

In [141]:
def construct_v2_phenopacket(json_dict):
    id = get_id(json_dict)
    print(f"id={id}")
    proband_id, iso_age, sex= get_subject(json_dict)
    print(f"proband_id = {proband_id}, age={iso_age}, sex={sex}")
    pfeatures_list = get_phenotypic_features_list(json_dict)
    print(f"Got {len(pfeatures_list)} pheno features")
    gene = get_gene(json_dict)
    disease = get_disease(json_dict)
    interpretation = get_interpretation(json_dict, gene, disease, proband_id)
    # Create phenopacket
    phenopacket = PPKt.Phenopacket()
    phenopacket.id = id
    proband = PPKt.Individual()
    proband.id = proband_id
    if sex == "MALE":
        proband.sex  = PPKt.Sex.MALE
    elif sex == "FEMALE":
        proband.sex = PPKt.Sex.FEMALE
    else:
        proband.sex  = "UNKNOWN_SEX"
    proband.time_at_last_encounter.age.iso8601duration = iso_age
    phenopacket.subject.CopyFrom(proband)
    for pf in pfeatures_list:
        phenopacket.phenotypic_features.append(pf)
    phenopacket.interpretations.append(interpretation)
    # TODO METADATA
    return phenopacket
    

In [142]:
ppack = construct_v2_phenopacket(json_dict)
print(ppack)

id=PMID:27435956-Naz_Villalba-2016-NLRP3-proband
proband_id = proband, age=P5Y, sex=FEMALE
Got 9 pheno features
id: "PMID:27435956-Naz_Villalba-2016-NLRP3-proband"
subject {
  id: "proband"
  time_at_last_encounter {
    age {
      iso8601duration: "P5Y"
    }
  }
  sex: FEMALE
}
phenotypic_features {
  type {
    id: "HP:0011227"
    label: "Elevated C-reactive protein level"
  }
  evidence {
    evidence_code {
      id: "ECO:0000033"
      label: "author statement supported by traceable reference"
    }
    reference {
      id: "PMID:27435956"
      description: "Muckle-Wells Syndrome: A Case Report with an NLRP3 T348M Mutation"
    }
  }
}
phenotypic_features {
  type {
    id: "HP:0000509"
    label: "Conjunctivitis"
  }
  evidence {
    evidence_code {
      id: "ECO:0000033"
      label: "author statement supported by traceable reference"
    }
    reference {
      id: "PMID:27435956"
      description: "Muckle-Wells Syndrome: A Case Report with an NLRP3 T348M Mutation"
    }