In [1]:
import oakvar as ov

## Install and setup

##### Install

In [None]:
!pip install -U oakvar

##### Setup

In [None]:
ov.system.setup(outer=ov.stdouter)

### Login and logout

_Check login state_

In [None]:
ov.store.account.check()

_Logout_

In [None]:
ov.store.logout(outer=ov.stdouter)

_Login with email and password arguments_

In [None]:
ov.store.login(email="your@email", pw="your_password")

_Login interactively_

In [None]:
ov.store.login(interactive=True)

_Change password_

In [None]:
ov.store.account.change(outer=ov.stdouter)

## Module management

_List installed modules_

In [10]:
ov.module.ls()

[{'name': '23andme-converter',
  'title': '23andMe Converter',
  'type': 'converter',
  'size': '21.9 kB',
  'version': '1.5.0',
  'data_source': ''},
 {'name': 'abraom',
  'title': 'ABRaOM',
  'type': 'annotator',
  'size': '113.6 MB',
  'version': '1.0.0',
  'data_source': ''},
 {'name': 'af-package',
  'title': 'Allele Frequency Package',
  'type': 'package',
  'size': '881 B',
  'version': '0.0.1',
  'data_source': ''},
 {'name': 'af2-package',
  'title': 'Allele Frequency Package',
  'type': 'package',
  'size': '790 B',
  'version': '0.0.1',
  'data_source': ''},
 {'name': 'aggregator',
  'title': 'Aggregator',
  'type': 'aggregator',
  'size': '32.5 kB',
  'version': '1.0.2',
  'data_source': ''},
 {'name': 'alfa',
  'title': 'ALFA: Allele Frequency Aggregator',
  'type': 'annotator',
  'size': '2.4 GB',
  'version': '1.0.0',
  'data_source': '2020.02.29'},
 {'name': 'alfa_african',
  'title': 'ALFA: Allele Frequency Aggregator African',
  'type': 'annotator',
  'size': '1.9 MB'

_List modules available through the OakVar Store. Pattern matching can be used. `outer` is used in this example._

In [5]:
_ = ov.module.ls(available=True, module_names=["clin.*"], outer=ov.stdouter)

_Get information on one module_

In [2]:
ov.module.info(module_name="clinvar")

{'readme': '# ClinVar: public archive of interpretations of clinically relevant variants\nClinVar is a freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence. \n \nClinical Significance includes:  \nBenign, Likely benign, Uncertain significance, Likely pathogenic, Pathogenic, Drug response, Association, Risk factor, Protective, Affects, Conflicting data from submitters, Other and Not provided\n \nReview Status is the level of review supporting the assertion of clinical significance for the variation. There are 8 different statuses to consider, ranked from highest to lowest levels of review:\n\n1. Practice Guideline\n2. Reviewed by expert panel \n3. Criteria provided, multiple submitters, no conflicts \n4. Criteria provided, conflicting interpretations\n5. Criteria provided, single submitter\n6. No assertion for the individual variant \n7. No assertion criteria provided\n8. No assertion provided\n\nSee [Clinvar R

_Give outer=ov.stdouter to print the module information to stdout._

In [7]:
_ = ov.module.info(module_name="clinvar", outer=ov.stdouter)

_Install modules_

In [5]:
ov.module.install(module_names=["biogrid", "tsvreporter"], outer=ov.stdouter)

No store update to fetch
The following modules will be installed:
- biogrid==3.7.1
- tsvreporter==2.1.0


Proceed? ([y]/n) >  y


Installing biogrid...


biogrid==3.7.1 already exists.


Installing tsvreporter...


tsvreporter: Local version (2.1.1) is higher than the latest store version (2.1.0). Use --overwrite to overwrite.


_Uninstall modules_

In [8]:
ov.module.uninstall(module_names=["biogrid"], outer=ov.stdouter)

Uninstalling:
- biogrid


Proceed? (y/N) >  y


True

_Let's bring back the biogrid module back._

In [9]:
ov.module.install(module_names=["biogrid"], yes=True)

## Annotate and reports

##### Create an example input file in the current directory.

> This should create a file `exampleinput` in the current directory.

In [10]:
ov.api.new.exampleinput()

PosixPath('exampleinput')

##### Annotate the example input file with the default gene mapper.

> This should have created a database file `exampleinput.sqlite`.

In [None]:
ov.api.run(inputs=["exampleinput"])

##### Annotation data in the database file can be loaded into Polars dataframe.

> Variant level (default)

In [None]:
ov.get_df_from_db("exampleinput.sqlite")

> Gene level

In [None]:
ov.get_df_from_db("exampleinput.sqlite", "gene")

##### Annotate with ClinVar and BioGRID annotators. 

In [11]:
ov.run(inputs=["exampleinput"], annotators=["clinvar", "biogrid"])

2023/01/30 21:44:09 oakvar               /home/rick/mambaforge/envs/ov/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/rick/.local/share/jupyter/runtime/kernel-09c2490f-cd07-4709-88e2-ee822f2fe969.json
2023/01/30 21:44:09 oakvar               started: Mon Jan 30 21:44:09 2023
2023/01/30 21:44:09 oakvar               conf file: None
2023/01/30 21:44:09 oakvar               started OakVar
2023/01/30 21:44:09 oakvar               input file: /mnt/d/Git/oakvar/extras/notebooks/exampleinput
2023/01/30 21:44:09 oakvar               system: oakvar==2.7.35 /mnt/d/Git/oakvar/oakvar
2023/01/30 21:44:09 oakvar               module: clinvar==2022.06.14.1 /mnt/e/oakvar/annotators/clinvar/clinvar.py
2023/01/30 21:44:09 oakvar               module: biogrid==3.7.1 /mnt/e/oakvar/annotators/biogrid/biogrid.py
2023/01/30 21:44:09 oakvar               module: gencode==41.3.0 /mnt/e/oakvar/mappers/gencode/gencode.py
2023/01/30 21:44:09 oakvar               starting converter step...
2023/01/30 2

##### Generate a VCF file of the annotated variants.

In [13]:
ov.report(dbpath="exampleinput.sqlite", report_types=["vcf"], outer=ov.stdouter)

Generating vcf report...
report created: exampleinput.vcf


{'vcf': 'exampleinput.vcf'}

##### Annotation and report generation can be done at once. Also, reports of different formats can be generated in one go.

In [14]:
ov.run(inputs=["exampleinput"], annotators=["clinvar", "biogrid"], report_types=["vcf", "csv"], outer=ov.stdouter)

2023/01/30 21:51:31 oakvar               /home/rick/mambaforge/envs/ov/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/rick/.local/share/jupyter/runtime/kernel-09c2490f-cd07-4709-88e2-ee822f2fe969.json
2023/01/30 21:51:31 oakvar               started: Mon Jan 30 21:51:31 2023
2023/01/30 21:51:31 oakvar               conf file: None
2023/01/30 21:51:31 oakvar               started OakVar
2023/01/30 21:51:31 oakvar               input file: /mnt/d/Git/oakvar/extras/notebooks/exampleinput
2023/01/30 21:51:31 oakvar               system: oakvar==2.7.35 /mnt/d/Git/oakvar/oakvar
2023/01/30 21:51:31 oakvar               module: clinvar==2022.06.14.1 /mnt/e/oakvar/annotators/clinvar/clinvar.py
2023/01/30 21:51:31 oakvar               module: biogrid==3.7.1 /mnt/e/oakvar/annotators/biogrid/biogrid.py
2023/01/30 21:51:31 oakvar               module: gencode==41.3.0 /mnt/e/oakvar/mappers/gencode/gencode.py
2023/01/30 21:51:31 oakvar               module: vcfreporter==4.3.0 /mnt/e/oakva

{'vcf': '/mnt/d/Git/oakvar/extras/notebooks/exampleinput.vcf',
 'csv': ['/mnt/d/Git/oakvar/extras/notebooks/exampleinput.variant.csv']}

## Module development

### Annotator

In [7]:
import oakvar as ov
from oakvar import BaseAnnotator
class Annotator(BaseAnnotator):
    def annotate_df(self, df):
        return df
    def annotate(self, input_data: dict):
        out_data = input_data
        out_data["col1"] = "test_output"
        return out_data
m = Annotator(name="awesome", level="variant", output_columns=[{"name": "col1", "type": "string", "title": "Col1"}])

In [2]:
m = Annotator(name="awesome", title="Awesome annotator", level="variant", input_file="exampleinput.crv", 
              output_columns=[{"name": "col1", "type": "string", "title": "Col1"}],
              input_columns=["chrom", "pos", "ref_base", "alt_base", "note"])

In [10]:
m.save(overwrite=True)

In [8]:
m.run_df(df)

uid,chrom,pos,pos_end,ref_base,alt_base
i64,str,i64,i64,str,str
2,"""chr10""",2987654,2987654,"""T""","""A"""
3,"""chr10""",43077259,43077259,"""A""","""T"""
4,"""chr10""",8055656,8055656,"""A""","""T"""
5,"""chr10""",87864470,87864470,"""A""","""T"""
6,"""chr10""",87864486,87864486,"""A""","""-"""
7,"""chr10""",87864486,87864487,"""AA""","""-"""
8,"""chr10""",87894027,87894027,"""-""","""CG"""
9,"""chr10""",87894027,87894027,"""-""","""CT"""
10,"""chr1""",100719861,100719861,"""A""","""T"""
11,"""chr1""",10100,10100,"""C""","""T"""


In [11]:
m.run()

In [4]:
df=ov.read_crv("exampleinput.crv")

In [25]:
import polars as pl
f=open("exampleinput.crv")
c = 0
for line in f:
    if not line.startswith("#"):
        break
    c += 1
input_df=pl.read_csv(f, skip_rows=c, new_columns=["uid", "chrom", "pos", "pos_end", "ref_base", "alt_base"])

In [26]:
input_df

uid,chrom,pos,pos_end,ref_base,alt_base
i64,str,i64,i64,str,str
2,"""chr10""",2987654,2987654,"""T""","""A"""
3,"""chr10""",43077259,43077259,"""A""","""T"""
4,"""chr10""",8055656,8055656,"""A""","""T"""
5,"""chr10""",87864470,87864470,"""A""","""T"""
6,"""chr10""",87864486,87864486,"""A""","""-"""
7,"""chr10""",87864486,87864487,"""AA""","""-"""
8,"""chr10""",87894027,87894027,"""-""","""CG"""
9,"""chr10""",87894027,87894027,"""-""","""CT"""
10,"""chr1""",100719861,100719861,"""A""","""T"""
11,"""chr1""",10100,10100,"""C""","""T"""


In [3]:
mc = ov.MasterConverter(inputs=["exampleinput"])

In [4]:
mc.run()

{'total_lnum': 373,
 'write_lnum': 373,
 'error_lnum': 0,
 'input_format': ['cravat'],
 'assemblies': ['hg38']}

In [43]:
inspect.linecache.getlines(inspect.getfile(Annotator))

TypeError: <class '__main__.Annotator'> is a built-in class

In [62]:
print("".join(inspect.getsourcelines(Annotator)[0]))

class Annotator(ov.BaseAnnotator):
    def annotate(self, input_data: dict):
        out_data = input_data
        out_data["col1"] = "Test"
        return out_data



In [44]:
from IPython.core.magics.code import extract_symbols

In [47]:
cell_code = "".join(inspect.linecache.getlines(new_getfile(Annotator)))

In [48]:
cell_code

'import oakvar as ov\nclass Annotator(ov.BaseAnnotator):\n    def annotate(self, input_data: dict):\n        out_data = input_data\n        out_data["col1"] = "Test"\n        return out_data\nm = Annotator(module_name="awesome", level="variant", output_columns=[{"name": "col1", "type": "string", "title": "Col1"}])\n'

In [2]:
m = Annotator(module_name="awesome", level="variant", output_columns=[{"name": "col1", "type": "string", "title": "Col1"}])

@@@ module_conf={}. output_columns=[{'name': 'col1', 'type': 'string', 'title': 'Col1'}]
@@@ conf={}
@ 0 self.conf={}
@ output_columns=[{'name': 'col1', 'type': 'string', 'title': 'Col1'}]. self.output_columns=None. self.conf={'level': 'variant'}. module_conf={}
@@ self.output_columns=[{'name': 'col1', 'type': 'string', 'title': 'Col1'}]
@ conf={'level': 'variant', 'output_columns': [{'name': 'col1', 'type': 'string', 'title': 'Col1'}]}
@ => conf={'level': 'variant', 'output_columns': [{'name': 'uid', 'title': 'UID', 'type': 'int', 'width': 60, 'hidden': True, 'filterable': False}, {'name': 'col1', 'type': 'string', 'title': 'Col1'}]}


In [10]:
with open("text.py", "w") as wf:
    wf.write(str(Annotator))

In [24]:
print(type(Annotator))

<class 'type'>


In [39]:
inspect.getsourcelines(ov.BaseAnnotator)

(['class BaseAnnotator(object):\n',
  '\n',
  '    from ..util.util import get_crv_def\n',
  '    from ..util.util import get_crx_def\n',
  '    from ..util.util import get_crg_def\n',
  '    from ..consts import INPUT_LEVEL_KEY\n',
  '    from ..consts import VARIANT_LEVEL_KEY\n',
  '    from ..consts import GENE_LEVEL_KEY\n',
  '\n',
  '    valid_levels = ["variant", "gene"]\n',
  '    valid_input_formats = [INPUT_LEVEL_KEY, VARIANT_LEVEL_KEY, GENE_LEVEL_KEY]\n',
  '    id_col_defs = {"variant": get_crv_def()[0], "gene": get_crg_def()[0]}\n',
  '    default_input_columns = {\n',
  '        INPUT_LEVEL_KEY: [x["name"] for x in get_crv_def()],\n',
  '        VARIANT_LEVEL_KEY: [x["name"] for x in get_crx_def()],\n',
  '        GENE_LEVEL_KEY: [x["name"] for x in get_crg_def()],\n',
  '    }\n',
  '    required_conf_keys = ["level", "output_columns"]\n',
  '\n',
  '    def __init__(self, input_file: Optional[str]=None, secondary_inputs=None, run_name: Optional[str]=None, output_dir: Opt

In [38]:
__main__.__file__ = "test.py"

NameError: name '__main__' is not defined

In [26]:
class A:
    pass
print(type(A))

<class 'type'>


In [14]:
import inspect
inspect.getsource(m)

TypeError: module, class, method, function, traceback, frame, or code object was expected, got Annotator

In [6]:
c=ov.get_live_annotator(module_name="clinvar")

@ conf={'datasource': '2022.06.14', 'description': 'ClinVar is an archive of reports of the relationships among human variations and phenotypes, with supporting evidence.', 'developer': {'citation': 'Landrum, M.J., Lee, J.M., Benson, M., et al. (2016). ClinVar: public archive of interpretations of clinically relevant variants. Nucleic Acids Research, 44(D1):D862-8.', 'email': 'support@cravat.us', 'name': 'NCBI', 'organization': 'NCBI', 'website': 'https://www.ncbi.nlm.nih.gov/clinvar/'}, 'input_format': 'crx', 'level': 'variant', 'output_columns': [{'category': 'single', 'name': 'sig', 'title': 'Clinical Significance', 'type': 'string', 'width': 90, 'filterable': False, 'hidden': True}, {'hidden': True, 'name': 'disease_refs', 'title': 'Disease Ref Nums', 'type': 'string', 'width': 55, 'filterable': False, 'desc': 'Disease reference numbers'}, {'name': 'disease_names', 'title': 'Disease Names', 'type': 'string', 'width': 55, 'hidden': False}, {'desc': 'The level of review supporting th

In [8]:
c.annotate({"chrom": "chr11", "pos": 64810053, "ref_base": "-", "alt_base": "A"})

{'sig': 'Pathogenic',
 'disease_refs': 'MedGen:CN517202',
 'disease_names': 'not provided',
 'rev_stat': 'criteria provided, single submitter',
 'id': 200993,
 'sig_conf': None}