# Notebook for demonstrating evidence matching between assayed fusions and categorical fusions

In [1]:
import warnings
from os import environ

# Silence all Python warnings so they do not clutter the demo output.
# NOTE(review): this is a blanket filter; comment it out when debugging.
warnings.filterwarnings("ignore")

# These are the configurations for the gene normalizer and UTA databases. The values
# below are only defaults for a local installation: `setdefault` leaves any value that
# is already exported in the environment untouched, so users can either export their
# own URLs before starting the kernel or edit the defaults here to match the locations
# where these databases exist.
environ.setdefault(
    "GENE_NORM_DB_URL", "postgresql://postgres@localhost:5432/gene_normalizer"
)
environ.setdefault(
    "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5432/uta/uta_20240523b"
)

### Load FUSOR and Translator modules
Run the cell below to load the FUSOR and Translator modules

In [2]:
from civicpy import civic

from fusor.fusor import FUSOR
from fusor.translator import Translator

# Create a FUSOR instance and hand it to the Translator. The translator is the
# object passed to the harvesters in the later cells.
fusor = FUSOR()
translator = Translator(fusor=fusor)

### Generate list of AssayedFusion objects from STAR-Fusion file
Run the cell below to generate a list of AssayedFusion objects from a file of STAR-Fusion output

In [3]:
# Generate AssayedFusion list from STAR-Fusion file
from pathlib import Path

from cool_seq_tool.schemas import Assembly

from fusor.harvester import StarFusionHarvester

# Relative path to the abridged STAR-Fusion predictions fixture. Being relative, it
# resolves against the kernel's current working directory.
path = Path("../../tests/fixtures/star-fusion.fusion_predictions.abridged.tsv")
# Build the STAR-Fusion harvester with the shared translator and the GRCh38 assembly
# value (presumably the assembly of the coordinates in the file; confirm).
harvester = StarFusionHarvester(translator=translator, assembly=Assembly.GRCH38.value)
# load_records is awaited directly at the top level of the cell
fusions_list = await harvester.load_records(path)

# Keep only the first two harvested records for the demo. According to the notebook
# text these are the KIF5B::RET and EML4::ALK fusions; re-check if the fixture changes.
assayed_fusion_star_fusion = fusions_list[:2]

Unable to get MANE Transcript data for gene: RN7SKP80
Could not find a transcript for RN7SKP80 on NC_000022.11
Unable to get MANE Transcript data for gene: RN7SKP118
Could not find a transcript for RN7SKP118 on NC_000016.10
Gene does not exist in UTA: AC021660.2
Unable to get MANE Transcript data for gene: EEF1A1P13
Could not find a transcript for EEF1A1P13 on NC_000005.10
Gene does not exist in UTA: AC098590.1
Gene does not exist in UTA: AC099789.1
Unable to get MANE Transcript data for gene: USP27X-DT
38584945 on NC_000021.9 occurs more than 150 bp outside the exon boundaries of the NM_182918.4 transcript, indicating this may not be a chimeric transcript junction and is unlikely to represent a contiguous coding sequence. Confirm that the genomic position 38584945 is being used to represent transcript junction and not DNA breakpoint.
Unable to get MANE Transcript data for gene: LINC00158
Gene does not exist in UTA: AP001341.1
Gene does not exist in UTA: AC021660.2
Gene does not exist 

### Load CIViC fusion variants
Run the cell below to load accepted fusion variants from the CIViC knowledgebase

In [4]:
# Load in accepted fusion variants from CIViC (only those with status "accepted").
# `variants` is reused below for the partner filter and as input to the CIViC harvester.
variants = civic.get_all_fusion_variants(include_status="accepted")

In [5]:
# Gene symbols of the two assayed fusions queried in this notebook
# (KIF5B::RET and EML4::ALK)
partners = ("KIF5B", "RET", "EML4", "ALK")

# Collect, in the order CIViC returned them, every fusion name that mentions at
# least one of the symbols (plain substring test), then print one name per line.
partner_hits = [
    variant.vicc_compliant_name
    for variant in variants
    if any(symbol in variant.vicc_compliant_name for symbol in partners)
]
for name in partner_hits:
    print(name)

EML4(entrez:27436)::ALK(entrez:238)
KIF5B(entrez:3799)::RET(entrez:5979)
v::ALK(entrez:238)
NPM1(entrez:4869)::ALK(entrez:238)
RANBP2(entrez:5903)::ALK(entrez:238)
CLTC(entrez:1213)::ALK(entrez:238)
CCDC6(entrez:8030)::RET(entrez:5979)
v::RET(entrez:5979)
STRN(entrez:6801)::ALK(entrez:238)
CAD(entrez:790)::ALK(entrez:238)
KANK4(entrez:163782)::ALK(entrez:238)
EML4(entrez:27436)::NTRK3(entrez:4916)
KIF5B(entrez:3799)::EGFR(entrez:1956)
HIP1(entrez:3092)::ALK(entrez:238)
ENST00000318522.5(EML4):e.20::ENST00000389048.3(ALK):e.20
ENST00000318522.5(EML4):e.2::ENST00000389048.3(ALK):e.20
ENST00000318522.5(EML4):e.6::ENST00000389048.3(ALK):e.20


The output above lists every accepted CIViC categorical fusion whose name contains
KIF5B, RET, EML4, or ALK. For the assayed KIF5B::RET fusion, we expect two matches:
KIF5B(entrez:3799)::RET(entrez:5979), because its partners are KIF5B and RET (with
equivalent breakpoint locations), and v::RET(entrez:5979), because its 5' partner is a
multiple-possible-genes element and its 3' partner is RET. For the assayed EML4::ALK
fusion, we expect a match to EML4(entrez:27436)::ALK(entrez:238), because this
categorical fusion describes the joining of exon 13 of EML4 with exon 20 of ALK, which
is also the junction of the assayed fusion. Note that the other EML4::ALK categorical
fusions describe exon junctions that do not match the queried assayed fusion.
v::ALK(entrez:238) is also an expected match, following the same logic as v::RET(entrez:5979).

### Run FusionMatcher to gather objects containing standardized fusion knowledge
Run the cell below to use FusionMatcher to extract standardized knowledge for the two fusions extracted from the STAR-Fusion file (KIF5B::RET and EML4::ALK). The score for each matching CategoricalFusion is printed at the bottom of the cell.

In [6]:
# Generate list of matches, report match score
from fusor.fusion_matching import FusionMatcher
from fusor.harvester import CIVICHarvester

# Save categorical fusions cache and create FusionSet
harvester = CIVICHarvester(translator=translator)
harvester.fusions_list = variants
civic_fusions = await harvester.load_records()

# Initialize FusionMatcher and define sources to match against
fm = FusionMatcher(assayed_fusions=assayed_fusion_star_fusion,
                   categorical_fusions=civic_fusions)

# Generate list of matching fusions
matches = await fm.match_fusion()
for matching_output in matches:
    for match in matching_output:
        print(f"Match Score: {match[1]}")



Match Score: 10
Match Score: 1
Match Score: 10
Match Score: 5


### View matching categorical fusions
Run the cells below to view the matching CategoricalFusion objects for each queried AssayedFusion object.

#### KIF5B::RET

In [7]:
# Show highest quality match for KIF5B::RET.
# Indexing: matches[0] -> results for the first assayed fusion, [0] -> first match
# entry, [0] -> the matched fusion object (index 1 of the entry is its score).
matches[0][0][0].model_dump(exclude_none=True)

{'type': <FUSORTypes.CATEGORICAL_FUSION: 'CategoricalFusion'>,
 'structure': [{'type': <FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT: 'TranscriptSegmentElement'>,
   'transcript': 'refseq:NM_004521.3',
   'strand': <Strand.NEGATIVE: -1>,
   'exonEnd': 24,
   'exonEndOffset': 0,
   'gene': {'conceptType': 'Gene',
    'name': 'KIF5B',
    'primaryCoding': {'id': 'hgnc:6324',
     'system': 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/',
     'code': 'HGNC:6324'}},
   'elementGenomicEnd': {'id': 'ga4gh:SL.nk8wv9yKzCFQ0n7Ph2JnJhOkf2Fzfh_U',
    'type': 'SequenceLocation',
    'digest': 'nk8wv9yKzCFQ0n7Ph2JnJhOkf2Fzfh_U',
    'sequenceReference': {'id': 'refseq:NC_000010.11',
     'type': 'SequenceReference',
     'refgetAccession': 'SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB'},
    'start': 32017142}},
  {'type': <FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT: 'TranscriptSegmentElement'>,
   'transcript': 'refseq:NM_020975.6',
   'strand': <Strand.POSITIVE: 1>,
   'exonStart': 11,
   'exonStart

In [8]:
# Show second match for KIF5B::RET.
# Indexing: matches[0] -> results for the first assayed fusion, [1] -> second match
# entry, [0] -> the matched fusion object (index 1 of the entry is its score).
matches[0][1][0].model_dump(exclude_none=True)

{'type': <FUSORTypes.CATEGORICAL_FUSION: 'CategoricalFusion'>,
 'structure': [{'type': <FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT: 'MultiplePossibleGenesElement'>},
  {'type': <FUSORTypes.GENE_ELEMENT: 'GeneElement'>,
   'gene': {'conceptType': 'Gene',
    'name': 'RET',
    'primaryCoding': {'id': 'hgnc:9967',
     'system': 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/',
     'code': 'HGNC:9967'}}}],
 'viccNomenclature': 'v::RET(hgnc:9967)',
 'civicMolecularProfiles': [<CIViC molecular_profile 1595>,
  <CIViC molecular_profile 4424>]}

#### EML4::ALK

In [9]:
# Show highest quality match for EML4::ALK.
# Indexing: matches[1] -> results for the second assayed fusion, [0] -> first match
# entry, [0] -> the matched fusion object (index 1 of the entry is its score).
matches[1][0][0].model_dump(exclude_none=True)

{'type': <FUSORTypes.CATEGORICAL_FUSION: 'CategoricalFusion'>,
 'structure': [{'type': <FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT: 'TranscriptSegmentElement'>,
   'transcript': 'refseq:NM_019063.5',
   'strand': <Strand.POSITIVE: 1>,
   'exonEnd': 13,
   'exonEndOffset': 0,
   'gene': {'conceptType': 'Gene',
    'name': 'EML4',
    'primaryCoding': {'id': 'hgnc:1316',
     'system': 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/',
     'code': 'HGNC:1316'}},
   'elementGenomicEnd': {'id': 'ga4gh:SL.PQzV-kfeCQ4MBmxD5mSHqZmId3I_f-Ib',
    'type': 'SequenceLocation',
    'digest': 'PQzV-kfeCQ4MBmxD5mSHqZmId3I_f-Ib',
    'sequenceReference': {'id': 'refseq:NC_000002.12',
     'type': 'SequenceReference',
     'refgetAccession': 'SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g'},
    'end': 42295516}},
  {'type': <FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT: 'TranscriptSegmentElement'>,
   'transcript': 'refseq:NM_004304.5',
   'strand': <Strand.NEGATIVE: -1>,
   'exonStart': 20,
   'exonStartOff

In [10]:
# Show second match for EML4::ALK.
# Indexing: matches[1] -> results for the second assayed fusion, [1] -> second match
# entry, [0] -> the matched fusion object (index 1 of the entry is its score).
matches[1][1][0].model_dump(exclude_none=True)

{'type': <FUSORTypes.CATEGORICAL_FUSION: 'CategoricalFusion'>,
 'structure': [{'type': <FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT: 'MultiplePossibleGenesElement'>},
  {'type': <FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT: 'TranscriptSegmentElement'>,
   'transcript': 'refseq:NM_004304.5',
   'strand': <Strand.NEGATIVE: -1>,
   'exonStart': 20,
   'exonStartOffset': 0,
   'gene': {'conceptType': 'Gene',
    'name': 'ALK',
    'primaryCoding': {'id': 'hgnc:427',
     'system': 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/',
     'code': 'HGNC:427'}},
   'elementGenomicStart': {'id': 'ga4gh:SL.Eu_igVd9zOahn3tFN-pyxtphUmrSlRAh',
    'type': 'SequenceLocation',
    'digest': 'Eu_igVd9zOahn3tFN-pyxtphUmrSlRAh',
    'sequenceReference': {'id': 'refseq:NC_000002.12',
     'type': 'SequenceReference',
     'refgetAccession': 'SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g'},
    'end': 29223528}}],
 'viccNomenclature': 'v::NM_004304.5(ALK):e.20',
 'civicMolecularProfiles': [<CIViC molecular_pr

### View Standardized Evidence for each matching CategoricalFusion object
Run the cells below to view an associated evidence item for a matching CategoricalFusion object

#### KIF5B::RET

In [11]:
# View evidence item linked to matched KIF5B::RET categorical fusion:
# first match of the first assayed fusion -> its first CIViC molecular profile ->
# that profile's first evidence item, shown as a raw attribute dict.
matches[0][0][0].civicMolecularProfiles[0].evidence_items[0].__dict__

{'_assertions': [],
 '_therapies': [<CIViC therapy 117>],
 '_phenotypes': [],
 '_incomplete': {'phenotypes', 'therapies'},
 '_partial': False,
 'type': 'evidence',
 'id': 698,
 'variant_origin': 'SOMATIC',
 'therapy_interaction_type': None,
 'therapy_ids': [117],
 'status': 'accepted',
 'source_id': 378,
 'significance': 'SENSITIVITYRESPONSE',
 'rating': 2,
 'phenotype_ids': [],
 'name': 'EID698',
 'molecular_profile_id': 269,
 'evidence_type': 'PREDICTIVE',
 'evidence_level': 'C',
 'evidence_direction': 'SUPPORTS',
 'disease_id': 30,
 'description': 'A case study of a patient with EGFR, KRAS, BRAF, HER2, ALK, ROS1 and MET negative adenocarcinoma of the lung. FISH analysis revealed a KIF5B-RET fusion. The RET inhibitor Vandetanib led to remission in the patient.',
 'assertion_ids': [],
 '_include_status': ['accepted', 'submitted', 'rejected']}

In [12]:
# View evidence item linked to matched v::RET categorical fusion:
# second match of the first assayed fusion -> its first CIViC molecular profile ->
# that profile's first evidence item, shown as a raw attribute dict.
matches[0][1][0].civicMolecularProfiles[0].evidence_items[0].__dict__

{'_assertions': [<CIViC assertion 78>],
 '_therapies': [<CIViC therapy 601>],
 '_phenotypes': [],
 '_incomplete': {'phenotypes', 'therapies'},
 '_partial': False,
 'type': 'evidence',
 'id': 8852,
 'variant_origin': 'SOMATIC',
 'therapy_interaction_type': None,
 'therapy_ids': [601],
 'status': 'accepted',
 'source_id': 3693,
 'significance': 'SENSITIVITYRESPONSE',
 'rating': 4,
 'phenotype_ids': [],
 'name': 'EID8852',
 'molecular_profile_id': 1595,
 'evidence_type': 'PREDICTIVE',
 'evidence_level': 'A',
 'evidence_direction': 'SUPPORTS',
 'disease_id': 16,
 'description': 'In this phase 1/2 trial (NCT03157128), patients with RET altered thyroid cancers were enrolled to receive the highly selective RET inhibitor selpercatinib. Among 19 patients with previously treated RET fusion positive thyroid cancer, the percentage who had a response was 79% (95% CI, 54 to 94), and 1-year progression-free survival was 64% (95% CI, 37 to 82). Responses included 8 of 11 (73%) papillary thyroid cancer

#### EML4::ALK

In [13]:
# View evidence item linked to matched EML4::ALK categorical fusion:
# first match of the second assayed fusion -> its first CIViC molecular profile ->
# that profile's first evidence item, shown as a raw attribute dict.
matches[1][0][0].civicMolecularProfiles[0].evidence_items[0].__dict__

{'_assertions': [<CIViC assertion 3>],
 '_therapies': [<CIViC therapy 12>],
 '_phenotypes': [],
 '_incomplete': {'phenotypes', 'therapies'},
 '_partial': False,
 'type': 'evidence',
 'id': 262,
 'variant_origin': 'SOMATIC',
 'therapy_interaction_type': None,
 'therapy_ids': [12],
 'status': 'accepted',
 'source_id': 166,
 'significance': 'SENSITIVITYRESPONSE',
 'rating': 4,
 'phenotype_ids': [],
 'name': 'EID262',
 'molecular_profile_id': 5,
 'evidence_type': 'PREDICTIVE',
 'evidence_level': 'C',
 'evidence_direction': 'SUPPORTS',
 'disease_id': 30,
 'description': 'A 28 year-old patient with non-small cell lung cancer that failed conventional therapy was found to harbor the EML4-ALK (E13;A20) fusion using reverse transcription PCR. Treatment with 250mg crizotinib twice daily resulted in rapid improvement of symptoms and disease control for 5 months.',
 'assertion_ids': [3],
 '_include_status': ['accepted', 'submitted', 'rejected']}

In [14]:
# View evidence item linked to matched v::ALK categorical fusion (the second match
# for the assayed EML4::ALK fusion):
# second match of the second assayed fusion -> its first CIViC molecular profile ->
# that profile's first evidence item, shown as a raw attribute dict.
matches[1][1][0].civicMolecularProfiles[0].evidence_items[0].__dict__

{'_assertions': [<CIViC assertion 3>],
 '_therapies': [<CIViC therapy 12>],
 '_phenotypes': [],
 '_incomplete': {'phenotypes', 'therapies'},
 '_partial': False,
 'type': 'evidence',
 'id': 1187,
 'variant_origin': 'SOMATIC',
 'therapy_interaction_type': None,
 'therapy_ids': [12],
 'status': 'accepted',
 'source_id': 819,
 'significance': 'SENSITIVITYRESPONSE',
 'rating': 5,
 'phenotype_ids': [],
 'name': 'EID1187',
 'molecular_profile_id': 495,
 'evidence_type': 'PREDICTIVE',
 'evidence_level': 'A',
 'evidence_direction': 'SUPPORTS',
 'disease_id': 8,
 'description': 'In the Phase I study PROFILE 1001 (NCT00585195), a recommended crizotinib dose of 250 mg twice daily for 28 day cycles was established. Among 1,500 advanced NSCLC patients who were screened for ALK-rearrangement using a break-apart FISH assay, 82 patients were eligible for crizotinib treatment. Overall response rate was 57%, with 46 partial responses and one complete response. Since crizotinib inhibits MET, 33 patients w