This notebook adds all the annotations, runs the scoring system and produces the output.

# Step 1: Import all the packages and create a duckdb file

In [1]:
# import all the package required in this system
import os

import duckdb as db
import pandas as pd

from src import add_annotation
from src import scoring
from src import extraction
from src import validation
from src import histogram

In [None]:
# Establish the DuckDB connection (`con`) that the later database steps of this notebook use.
file_path = "scoring.db" 
con = db.connect(file_path) # create the DuckDB file at file_path and connect to it

# Step 2: Read the patient dataset that you would like to score

In [None]:
# Path to the tab-separated patient dataset to be scored; change it to your own file.
tsv_file_path = "data/data/random_data.tsv" 
# Load the TSV into a pandas DataFrame. You may rename `test_data`, but the
# annotation cell below refers to it by this name.
test_data = pd.read_csv(tsv_file_path, sep='\t')

# Step 3: Add all annotations, except AlphaMissense

In [None]:
# MGI: hand the mouse-notes file and the human/mouse phenotype report to
# add_annotation.mgi_data, which returns the two objects unpacked below.
file_path_mouse_marker = "data/data/mouse_notes.csv"
file_path_human_mouse_symbol = "data/data/HMD_HumanPhenotype.rpt.txt"
mgi_df, human_mouse_symbol = add_annotation.mgi_data(file_path_mouse_marker, file_path_human_mouse_symbol)

# Gene Ontology: read the GO gene-set (.gmt) file into `gene_dict`.
gene_dict = add_annotation.read_gmt(file_path = 'data/data/c5.go.v2023.2.Hs.symbols.gmt') 

# REVEL: build `ids_dictionary` from the REVEL file.
revel_file_path= "data/data/revel_with_transcript_ids_small"
ids_dictionary = add_annotation.create_revel_dictionary(revel_file_path) 

# gnomAD 4.1 constraint metrics.
# NOTE: this reassigns the notebook-level `file_path`, which held the DuckDB file path until now.
file_path = "data/data/gnomad.v4.1.constraint_metrics.tsv"
filtered_gnomad_4_1 = add_annotation.filted_gnomad_4_1(file_path)



In [None]:
# Attach every annotation source loaded in the previous cell to the patient dataset.
test_data = add_annotation.adding_all_an(
    dataset=test_data,
    mgi_df=mgi_df,
    gene_dict=gene_dict,
    ids_dictionary=ids_dictionary,
    filtered_gnomad_4_1=filtered_gnomad_4_1,
    human_mouse_symbol=human_mouse_symbol,
    dataset_name=None,
    file_path=None,
)

# Optional: save the whole table with all annotations.
# file_path = ""
# test_data.to_csv(file_path, sep='\t', index=False)

# Keep only the required attributes and write them out as a TSV.
combined = add_annotation.extract_data(test_data)
file_path = "data/test_data.tsv"  # change to your file path
combined.to_csv(file_path, sep='\t', index=False)

# Step 4: Load patient data to duckdb

In [None]:
# Create (or replace) the DuckDB table `test_data` from the extracted TSV.
# Replace the path inside the SQL with your own file path to the extracted dataset.
con.execute("CREATE OR REPLACE TABLE test_data AS SELECT * FROM 'data/test_data.tsv';")
# Create (or replace) the table `data_freq` from three columns of the variant-frequency CSV.
con.execute("CREATE OR REPLACE TABLE data_freq AS SELECT uniqueValue, distinct_patientDatabaseId_count, variant_frequency FROM 'data/data/data_freq.csv';")
# NOTE(review): presumably adds the columns the scoring step needs to `test_data` — confirm in scoring.add_col.
scoring.add_col(con,'test_data')


Adding the AlphaMissense annotation

In [None]:
# IMPORTANT: this cell only has to be run ONCE.
# NOTE(review): presumably because add_annotation.am_df stores the AlphaMissense data
# in the DuckDB file behind `con` — confirm before skipping it on later runs.
file_path = "data/data/AlphaMissense_hg38.tsv" 
add_annotation.am_df(con, file_path)

In [None]:
# AlphaMissense is a huge dataset, so this annotation is added inside DuckDB, which is more efficient.
# NOTE(review): the other calls in this notebook pass the table name 'test_data';
# confirm that 'data' is the intended second argument here.
add_annotation.add_am_path(con,'data') 

# Step 5: Apply the scoring system

In [15]:
# Run the scoring system on the `test_data` table through the DuckDB connection.
scoring.scoring(con, 'test_data')


# Step 6: Obtain the output table

In [16]:
# Collect the final scores for the `test_data` table; the result is kept in
# `test_data_result`, which the visualisation and validation cells below reuse.
test_data_result = scoring.final_score(con, 'test_data')

# To save it
file_path = "result/test_data_result.tsv"
# to_csv does not create missing folders, so make sure the output directory
# exists first (otherwise the write fails on a fresh checkout).
output_dir = os.path.dirname(file_path)
if output_dir:  # a bare file name has no directory part to create
    os.makedirs(output_dir, exist_ok=True)
test_data_result.to_csv(file_path, sep='\t', index=False)

# Step 7: Visualisation (only when you have targeted variants)

#### Histogram
To show the distribution of genetic variant scores in one or multiple patients. 

Multiple tables, and multiple variants from each table, can be supplied.

In [None]:
# histogram.histogram takes a list of tuples, one per histogram, in the format
# (output table, one targeted variant, patient number).
# Every entry here uses the same output table, so only the
# (variant, patient number) pairs are spelled out.
variant_patient_pairs = [
    ('chr4:54295165A>T', '1'),
    ('chr19:1231143T>C', '2'),
    ('chr17:50375415A>C', '3'),
    ('chr1:89186388T>C', '4'),
]
datasets = [
    (test_data_result, variant, patient)
    for variant, patient in variant_patient_pairs
]

histogram.histogram(datasets)

#### Validation Table 
To find the position of the targeted genetic variants in each patient (including the score, top percentage, ranking (top), number of variants above this score, etc.)

Multiple tables, and multiple variants from each table, can be supplied.

In [18]:
# Each entry maps a table name to (output table, list of targeted variants to look up in it).
variants_to_validate = [
    "chr4:54295165A>T",
    "chr1:89186388T>C",
    "chr19:1231143T>C",
]
table_and_variant_dict = {
    'test_data_result': (test_data_result, variants_to_validate),
}
validate_table_after = validation.validate_table(table_and_variant_dict)

# Step 8: Extracting all records (annotations + score) of target variants
Multiple tables, and multiple variants from each table, can be supplied.

In [19]:
# Third argument passed to pathogenic_variant_table below.
# NOTE(review): presumably the name of the table the extracted records go into — confirm.
table_name = "testing"

# Dict describing which target variants to pull from which table:
#   {table1: ['var1', 'var2', ...], table2: ['var1', 'var2', ...], ...}
table_and_variant_dict = {
    "test_data": ["chr4:54295165A>T", "chr1:89186388T>C"],
}

targeted_variants = extraction.pathogenic_variant_table(
    con,
    table_and_variant_dict,
    table_name,
)