Skip to content

Commit

Permalink
Fix some edge cases, add tests, reduce complexity
Browse files Browse the repository at this point in the history
  • Loading branch information
krassowski committed Jun 2, 2019
1 parent ce889e9 commit e95efd7
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 46 deletions.
98 changes: 57 additions & 41 deletions website/imports/mutations/clinvar.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from collections import defaultdict
from typing import Mapping

Expand Down Expand Up @@ -36,17 +37,21 @@ class ClinVarImporter(MutationImporter):
def __init__(self, *args, **kwargs):
    """Initialise the importer, forwarding all arguments to the parent class.

    Besides the parent's initialisation, two attributes are prepared:
    `skipped_significances`, a counter keyed by significance description,
    and `xml_path`, which stays `None` until `load()` assigns it.
    """
    super().__init__(*args, **kwargs)
    # how many times each unmapped significance description was encountered
    self.skipped_significances = defaultdict(int)
    # assigned in load()
    self.xml_path = None

def load(self, path=None, update=False, clinvar_xml_path=None, **ignored_kwargs):
    """Remember which ClinVar XML file to use, then run the parent's `load`.

    `clinvar_xml_path` is stored in `self.xml_path`; when it is missing
    (or otherwise falsy) `self.default_xml_path` is used instead.
    `path`, `update` and any extra keyword arguments are handed over to
    the parent implementation unchanged.
    """
    if clinvar_xml_path:
        self.xml_path = clinvar_xml_path
    else:
        self.xml_path = self.default_xml_path
    super().load(path, update, **ignored_kwargs)

@staticmethod
def _beautify_disease_name(name):
# edge cases:
# B_Lymphoblastic_Leukemia/Lymphoma_with_t(v%3B11q23.3)%3B_KMT2A_Rearranged
# Ataxia___Neurologic_
return name.replace('___', ' _ ').replace('_', ' ').replace('%3B', ';')
if '___' in name:
# for edge cases use a robust regexpr
name = re.sub('([^_])_([^_])', r'\1 \2', name).replace('___', ' _ ')
else:
# but for 99% of data, simple .replace() is much faster:
name = name.replace('_', ' ')
return name.replace('%3B', ';')

def iterate_lines(self, path):
    """Iterate over the file at `path` using `tsv_file_iterator`.

    The importer's `header` is passed along and `gzip_open_text` is used
    as the file opener.
    """
    lines = tsv_file_iterator(path, self.header, file_opener=gzip_open_text)
    return lines
Expand Down Expand Up @@ -79,25 +84,52 @@ def test_line(self, line):
for code, name in ClinicalData.significance_codes.items()
}

# Aliases for clinical significance descriptions: maps an alternative
# (lower-case) label to the label that should be used instead.
# parse_significance() applies this mapping after lower-casing its input.
# NOTE(review): the values are presumably keys of inverse_significance_map
# (i.e. names from ClinicalData.significance_codes) - confirm when adding entries.
significance_map = {
'pathologic': 'pathogenic',
'probable-pathogenic': 'likely pathogenic',
'cancer': 'pathogenic',
'untested': 'not provided',
'variant of unknown significance': 'uncertain significance',
'uncertain': 'uncertain significance',
'drug-response': 'drug response',
'probable-non-pathogenic': 'likely benign',
'probably not pathogenic': 'likely benign',
'non-pathogenic': 'benign',
}

def parse_significance(self, significance):
    """Translate a significance description into a significance code.

    The description is stripped, lower-cased and translated with
    `significance_map`. If the result is a key of
    `inverse_significance_map`, its code is returned with no additional
    significances.

    Otherwise the description is split on commas. The first item decides
    the code (falling back to 'other' when it is not recognised either)
    and the remaining items are returned as additional significances.
    Each such unmapped description is counted in `skipped_significances`
    and reported on stdout the first time it is seen.

    Args:
        significance: the description text, e.g. 'Pathogenic, association'.

    Returns:
        A tuple `(sig_code, additional_significances)` where the second
        element is a list of whitespace-stripped strings (possibly empty).

    Raises:
        KeyError: if the fallback 'other' is needed but is missing from
            `inverse_significance_map`.
    """
    known = self.inverse_significance_map
    aliases = self.significance_map

    significance = significance.strip().lower()
    significance = aliases.get(significance, significance)

    additional_significances = []

    if significance not in known:
        first_significance, *additional_significances = significance.split(',')

        # Normalise the first item the same way as a whole description:
        # without this, 'benign , association' or 'pathologic, risk factor'
        # would needlessly end up as 'other'.
        first_significance = first_significance.strip()
        if first_significance not in known:
            first_significance = aliases.get(first_significance, first_significance)

        assign_to = first_significance if first_significance in known else 'other'

        # report each unmapped description only once, but count every occurrence
        if significance not in self.skipped_significances:
            print(f'Unmapped significance status: "{significance}", assigning "{assign_to}"')
        self.skipped_significances[significance] += 1

        significance = assign_to

    sig_code = known[significance]

    return sig_code, [sig.strip() for sig in additional_significances]

def import_disease_associations(self):
from xml.etree import ElementTree
from os.path import getsize
from tqdm import tqdm
import gzip

significance_map = {
'pathologic': 'pathogenic',
'probable-pathogenic': 'likely pathogenic',
'cancer': 'pathogenic',
'untested': 'not provided',
'variant of unknown significance': 'uncertain significance',
'uncertain': 'uncertain significance',
'drug-response': 'drug response',
'probable-non-pathogenic': 'likely benign',
'probably not pathogenic': 'likely benign',
'non-pathogenic': 'benign',
}

ignored_traits = {
'not specified',
'not provided'
Expand All @@ -108,7 +140,7 @@ def import_disease_associations(self):
'variation to included disease',
'confers sensitivity'
}
skipped_significances = defaultdict(int)
self.skipped_significances = defaultdict(int)

accepted_species = {'Human', 'human'}
skipped_species = set()
Expand Down Expand Up @@ -213,15 +245,15 @@ def import_disease_associations(self):

try:
disease = Disease.query.filter_by(name=trait_name).one()
except:
except NoResultFound:
resolved = False
if 'Mucolipidosis, Type' in trait_name:
print(f'Working around changed name for {trait_name}')
trait_name = trait_name.replace('Mucolipidosis, Type', 'Mucolipidosis')
try:
disease = Disease.query.filter_by(name=trait_name).one()
resolved = True
except:
except NoResultFound:
pass

if not resolved:
Expand All @@ -236,7 +268,7 @@ def import_disease_associations(self):
action = ''
if trait_type == 'Disease':
disease.clinvar_type = trait_type
action = ': overwritting the old type with "Disease"'
action = ': overwriting the old type with "Disease"'
print(f'Conflicting trait types for "{disease.name}": "{disease.clinvar_type}" != "{trait_type}"{action}')
else:
disease.clinvar_type = trait_type
Expand All @@ -247,26 +279,10 @@ def import_disease_associations(self):

significance_annotation = significance_annotations[0]

significance = significance_annotation.find('Description').text.lower()
significance = significance_annotation.find('Description').text
review_status = significance_annotation.find('ReviewStatus').text

if significance in significance_map:
significance = significance_map[significance]

additional_significances = None

if significance not in self.inverse_significance_map:
skipped_significances[significance] += 1
if significance not in skipped_significances:
assign_to = 'other'
first_significance, *additional_significances = significance.split(',')
if first_significance in self.inverse_significance_map:
assign_to = first_significance
print(f'Unmapped significance status: "{significance}", assigning "{assign_to}"')
significance = assign_to
significance = 'other'

sig_code = self.inverse_significance_map[significance]
sig_code, additional_significances = self.parse_significance(significance)

disease_associations: ClinicalData = (
ClinicalData.query
Expand Down Expand Up @@ -295,7 +311,7 @@ def import_disease_associations(self):
root.clear()

print(skipped_diseases)
print(skipped_significances)
print(self.skipped_significances)

db.session.commit()

Expand Down
25 changes: 20 additions & 5 deletions website/tests/test_imports/test_import_mutations.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from typing import Dict

import pytest

from imports.mutations import MutationImportManager, MutationImporter
from database_testing import DatabaseTest
from imports.mutations import MutationImportManager, MutationImporter
from imports.mutations.clinvar import ClinVarImporter
from models import (
Protein, InheritedMutation, Disease, ExomeSequencingMutation, The1000GenomesMutation, MIMPMutation,
Site, SiteType,
Mutation,
ClinicalData,
Mutation
)
from models import MC3Mutation
from database import db
Expand Down Expand Up @@ -196,6 +194,23 @@ def test_hypermutated_finder(self):
assert sample == 'TCGA-02-0003-01A-01D-1490-08'
assert count == 3

def test_clinvar_disease_names(self):
    """Check that raw ClinVar trait names are converted to readable names."""
    expected_by_raw_name = {
        # URL-encoded semicolons mixed with single underscores
        'B_Lymphoblastic_Leukemia/Lymphoma_with_t(v%3B11q23.3)%3B_KMT2A_Rearranged':
            'B Lymphoblastic Leukemia/Lymphoma with t(v;11q23.3); KMT2A Rearranged',
        # a triple underscore stands for a literal underscore
        'Ataxia___Neurologic_(child_onset)':
            'Ataxia _ Neurologic (child onset)',
    }
    for raw_name, expected in expected_by_raw_name.items():
        assert ClinVarImporter._beautify_disease_name(raw_name) == expected

def test_clinvar_significance(self):
    """Check the code and the additional significances returned by the parser."""
    importer = ClinVarImporter()
    expected_code = importer.inverse_significance_map['pathogenic']

    # (description, expected additional significances)
    cases = (
        ('Pathogenic, association', ['association']),
        ('Pathogenic', []),
    )

    for description, expected_additional in cases:
        code, additional = importer.parse_significance(description)
        assert code == expected_code
        assert additional == expected_additional

def test_clinvar_import(self):
muts_filename = make_named_gz_file(clinvar_mutations)
proteins = create_proteins({**tp53, **lama4, **msh2, **tp53_alt})
Expand Down

0 comments on commit e95efd7

Please sign in to comment.