Skip to content

Commit

Permalink
Add table for precomputed statistics in DB. Remove hardcoded value.
Browse files Browse the repository at this point in the history
  • Loading branch information
krassowski committed Jul 18, 2017
1 parent 0160761 commit c64a125
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 112 deletions.
19 changes: 19 additions & 0 deletions website/manage.py
Expand Up @@ -36,6 +36,16 @@ def automigrate(args, app=None):
return True


def calc_statistics(args, app=None):
    """Recompute all registered statistics and commit them to the database.

    Args:
        args: Parsed command-line arguments. Accepted so the function can be
            used as a sub-command handler; it is not read here.
        app: Application to work in. When falsy, a new application is
            created with the ``LOAD_STATS`` option overridden to ``False``.
    """
    application = app or create_app(config_override={'LOAD_STATS': False})

    with application.app_context():
        # NOTE(review): the import is presumably deferred because the
        # statistics module needs an active application context when it is
        # imported - confirm before moving it to the top of the file.
        from statistics import Statistics

        stats = Statistics()
        stats.calc_all()
        db.session.commit()


def get_all_models(module_name='bio'):
from models import Model
from sqlalchemy.ext.declarative.clsregistry import _ModuleMarker
Expand Down Expand Up @@ -397,6 +407,15 @@ def create_parser():

create_command_subparsers(command_subparsers)

# STATS SUBCOMMAND
stats_parser = subparsers.add_parser(
'calc_stats',
help=(
'should statistics (counts of protein, pathways, mutation, etc) be recalculated?'
)
)
stats_parser.set_defaults(func=calc_statistics)

# MIGRATE SUBCOMMAND
migrate_parser = subparsers.add_parser(
'migrate',
Expand Down
6 changes: 6 additions & 0 deletions website/models/cms.py
Expand Up @@ -20,6 +20,12 @@ class CMSModel(Model):
__bind_key__ = 'cms'


class Count(CMSModel):
    """A named integer value: one stored statistic per row."""

    # Identifier of the statistic; declared unique.
    name = db.Column(db.String(length=254), unique=True)
    # The stored number for that statistic.
    value = db.Column(db.Integer)


class BadWord(CMSModel):
"""Model for words which should be filtered out"""

Expand Down
290 changes: 181 additions & 109 deletions website/statistics.py
@@ -1,152 +1,156 @@
from functools import lru_cache
from itertools import combinations

from database import db
from database import db, get_or_create
from database import fast_count
import models
from sqlalchemy import and_
from sqlalchemy import and_, distinct, func
from sqlalchemy import or_
from flask import current_app
from models import Mutation
from models import Mutation, Count

counters = {}

MAPPINGS_COUNT = 73093771 # this is result of stats.count_mappings() -
# due to long execution time it was precomputed once and hardcoded here

def counter(func, name=None):
    """Register ``func`` in ``counters`` and return a cached wrapper for it.

    Args:
        func: Callable that computes one statistic.
        name: Key to register ``func`` under. When empty or omitted,
            ``func.__name__`` is used instead.

    Returns:
        ``func`` wrapped with ``lru_cache(maxsize=1)``. The registry itself
        keeps the original, unwrapped callable.
    """
    key = name or func.__name__
    counters[key] = func
    memoize = lru_cache(maxsize=1)
    return memoize(func)


def models_counter(model):
    """Create a counter method that reports ``self.count(model)``.

    Args:
        model: Object handed to the ``count`` method of the instance the
            returned function is called with.

    Returns:
        A function taking ``self``. It carries ``to_be_registered = True``
        so that code looking for that attribute can recognise it.
    """
    # The inner name shadows the module-level ``counter`` only inside this
    # function body.
    def counter(self):
        return self.count(model)

    setattr(counter, 'to_be_registered', True)
    return counter


def mutations_counter(func):
    """Register ``func`` as a counter named ``'mutations_'`` + its own name.

    Returns whatever ``counter`` returns for ``func`` under that name.
    """
    prefixed_name = 'mutations_' + func.__name__
    return counter(func, name=prefixed_name)


class Statistics:

@staticmethod
def count(model):
return db.session.query(model).count()
@property
def counters(self):
    """Return the module-level ``counters`` mapping (name -> callable)."""
    return counters

@staticmethod
def all_confirmed_mutations():
return Mutation.query.filter_by(
is_confirmed=True
def get_filter_by_sources(sources):
    """Build a filter requiring a mutation to have data in every source.

    Args:
        sources: Iterable of source models. Each must expose a
            ``details_manager`` attribute and be accepted by
            ``Mutation.get_relationship``.

    Returns:
        The result of ``and_`` over one relationship test per source.
    """
    def has_data_in(source):
        # Sources with a truthy details manager are tested with ``any()``,
        # all others with ``has()``.
        if source.details_manager:
            return Mutation.get_relationship(source).any()
        return Mutation.get_relationship(source).has()

    criteria = (has_data_in(source) for source in sources)

    # NOTE(review): ``and_`` receives one generator rather than unpacked
    # clauses - confirm that the SQLAlchemy version in use accepts this.
    return and_(criteria)

def count_by_source(self, sources):
    """Count mutations matching the filter built from ``sources``."""
    criterion = self.get_filter_by_sources(sources)
    matching = Mutation.query.filter(criterion)
    return matching.count()

def count_mutations(self, mutation_class):
if mutation_class.details_manager is not None:
return db.session.query(Mutation).filter(
self.get_filter_by_sources([mutation_class])
).count()
else:
return self.count(mutation_class)
def get_methods(self):

def get_all(self):
interactions, kinases_covered, groups_covered, proteins_covered = self.count_interactions()
def is_method(member):
name, value = member
return not name.startswith('_') and callable(value)

mutation_counts = {
all_members = {name: getattr(self, name) for name in dir(self)}

return filter(is_method, all_members.items())

def __init__(self):

for model in Mutation.source_specific_data:
# dirty trick: 1KGenomes is not valid name in python
model.name.replace('1', 'T'): self.count_mutations(model)
for model in Mutation.source_specific_data
}
name = 'mutations_' + model.name.replace('1', 'T')

annotation_counts = {
model.name + '_annotations': self.count(model)
for model in filter(
lambda model: model.details_manager,
Mutation.source_specific_data
)
}
def muts_counter(self, model=model):
    """Count mutations of the source model bound when this was defined.

    ``model`` is bound as a default argument on purpose. A plain closure
    looks the enclosing loop variable up only when it is called, so every
    counter defined in the loop would count whichever model that variable
    was bound to last, not the one it was created for.
    """
    return self.count_mutations(model)
muts_counter.to_be_registered = True

self.__dict__[name] = muts_counter

mutation_stats = {
# both confirmed and MIMP mutations
'all': self.count(Mutation),
'all_confirmed': self.all_confirmed_mutations(),
# 'from_many_sources' is very expensive, and it might be better
# to disable when not necessary (it will be useful for debugging
# purposes - so we can check if mutations count is correct)
# 'from_more_than_one_source': self.from_many_sources(),
'confirmed_in_ptm_sites': self.count_muts_in_sites(),
'confirmed_with_mimp': self.count_muts_with_mimp()
for model in filter(lambda model: model.details_manager, Mutation.source_specific_data):
name = 'mutations_' + model.name + '_annotations'

self.__dict__[name] = models_counter(model)

for name, method in self.get_methods():
if hasattr(method, 'to_be_registered'):
self.__dict__[name] = counter(method, name)

def calc_all(self):
    """Compute every registered counter and store it on its ``Count`` row.

    Each name/value pair is printed as it is computed. Rows reported as
    new by ``get_or_create`` are added to the session; this method does
    not commit.
    """
    for stat_name, compute in self.counters.items():
        record, created = get_or_create(Count, name=stat_name)

        # Callables that already carry ``__self__`` are invoked as they
        # are; the rest get this instance passed in explicitly.
        is_bound = hasattr(compute, '__self__')
        value = compute() if is_bound else compute(self)

        record.value = value
        print(stat_name, value)

        if created:
            db.session.add(record)

def get_all(self):

mutation_counts = {
counter_name[10:]: db.session.query(Count.value).filter(Count.name == counter_name).scalar() or 0
for counter_name in self.counters.keys()
if counter_name.startswith('mutations_')
}
mutation_stats.update(mutation_counts)
mutation_stats.update(annotation_counts)

return {
'proteins': self.count(models.Protein),
'genes': self.count(models.Gene),
'kinases': self.count(models.Kinase),
'kinase_groups': self.count(models.KinaseGroup),
'muts': mutation_stats,
'sites': self.count(models.Site),
'pathways': self.count(models.Pathway),
'cancer': self.count(models.Cancer),
# "number of mutation annotations
# (all DNA>protein table + MIMP annotations)"
'annotations': (
self.count(models.MIMPMutation) +
MAPPINGS_COUNT # self.count_mappings()
),
'interactions': interactions,
'kinases_covered': kinases_covered

counts = {
counter_name: db.session.query(Count.value).filter(Count.name == counter_name).scalar() or 0
for counter_name in self.counters.keys()
if not counter_name.startswith('mutations_')
}

@staticmethod
def count_interactions():
counts['muts'] = mutation_counts

kinases_covered = fast_count(db.session.query(models.Kinase).filter(models.Kinase.sites.any()))
kinase_groups_covered = fast_count(db.session.query(models.KinaseGroup).filter(models.KinaseGroup.sites.any()))
proteins_covered = len(
db.session.query(models.Site.protein_id)
.filter(or_(
models.Site.kinases.any(),
models.Site.kinase_groups.any()
))
.distinct().
all()
)
all_interactions = (
fast_count(db.session.query(models.Site).join(models.Kinase, models.Site.kinases)) +
fast_count(db.session.query(models.Site).join(models.KinaseGroup, models.Site.kinase_groups))
)
return counts

return all_interactions, kinases_covered, kinase_groups_covered, proteins_covered
@mutations_counter
def all(self):
    """Count all mutations, whether confirmed or not."""
    total = self.count(Mutation)
    return total

@staticmethod
def count_mappings():
from database import bdb
return len(bdb)
@mutations_counter
def all_confirmed(self):
    """Count mutations whose ``is_confirmed`` flag is true."""
    confirmed = Mutation.query.filter_by(is_confirmed=True)
    return confirmed.count()

@staticmethod
def count_muts_in_sites():
@mutations_counter
def confirmed_in_ptm_sites(self):
    """Count mutations with both ``is_confirmed`` and ``is_ptm_distal`` set."""
    criteria = {'is_confirmed': True, 'is_ptm_distal': True}
    return Mutation.query.filter_by(**criteria).count()

def count_muts_with_mimp(self):
@mutations_counter
def confirmed_with_mimp(self):
    """Count confirmed mutations that pass the ``MIMPMutation`` source filter."""
    has_mimp = self.get_filter_by_sources([models.MIMPMutation])
    criterion = and_(has_mimp, Mutation.is_confirmed)
    return Mutation.query.filter(criterion).count()

@staticmethod
def get_filter_by_sources(sources):

filters = and_(
(
(
Mutation.get_relationship(source).any()
if source.details_manager else
Mutation.get_relationship(source).has()
)
for source in sources

)
)

return filters

def count_by_source(self, sources):
return Mutation.query.filter(
self.get_filter_by_sources(sources)
).count()

def from_many_sources(self):
# 'from_more_than_one_source' is very expensive, so it is better to keep it
# disabled unless it is needed (it is useful for debugging, as it lets us
# check whether the mutation counts are correct).
# @mutations_counter
def from_more_than_one_source(self):
"""Counts mutations that have annotations in more
than one source (eg. in both: TCGA and ClinVar).
"""
Expand All @@ -165,9 +169,77 @@ def from_many_sources(self):

return count

@staticmethod
def count(model):
    """Return ``count()`` of a session query over ``model``."""
    query = db.session.query(model)
    return query.count()

def count_mutations(self, mutation_class):
    """Count mutations for one source model.

    When ``mutation_class.details_manager`` is ``None`` the rows of
    ``mutation_class`` itself are counted. Otherwise the ``Mutation`` rows
    passing the source filter for that class are counted.
    """
    if mutation_class.details_manager is None:
        return self.count(mutation_class)

    criterion = self.get_filter_by_sources([mutation_class])
    return db.session.query(Mutation).filter(criterion).count()

@counter
def proteins(self):
    """Count ``models.Protein`` rows."""
    protein_model = models.Protein
    return self.count(protein_model)

# Plain row counters, one per model. Each is built by ``models_counter``,
# i.e. it returns ``self.count(<model>)`` and is flagged
# ``to_be_registered``.
genes = models_counter(models.Gene)
kinases = models_counter(models.Kinase)
kinase_groups = models_counter(models.KinaseGroup)
sites = models_counter(models.Site)
pathways = models_counter(models.Pathway)
cancer = models_counter(models.Cancer)

@counter
def mappings(self):
    """Return ``len()`` of the ``bdb`` object from the ``database`` module."""
    from database import bdb as mappings_db
    return len(mappings_db)

@counter
def annotations(self):
    """Return the number of mutation annotations.

    This is the ``MIMPMutation`` row count plus the result of
    ``self.mappings()`` ("all DNA>protein table + MIMP annotations").
    """
    mimp_total = self.count(models.MIMPMutation)
    mappings_total = self.mappings()
    return mimp_total + mappings_total

@counter
def kinases_covered(self):
    """Count kinases for which ``Kinase.sites.any()`` holds."""
    with_sites = db.session.query(models.Kinase).filter(
        models.Kinase.sites.any()
    )
    return fast_count(with_sites)

@counter
def kinase_groups_covered(self):
    """Count kinase groups for which ``KinaseGroup.sites.any()`` holds."""
    with_sites = db.session.query(models.KinaseGroup).filter(
        models.KinaseGroup.sites.any()
    )
    return fast_count(with_sites)

@counter
def interactions(self):
    """Sum the site-kinase and site-kinase-group join counts."""
    kinase_links = fast_count(
        db.session.query(models.Site).join(models.Kinase, models.Site.kinases)
    )
    group_links = fast_count(
        db.session.query(models.Site).join(
            models.KinaseGroup, models.Site.kinase_groups
        )
    )
    return kinase_links + group_links

@counter
def proteins_covered(self):
    """Count distinct ``protein_id`` values of sites with a kinase or group.

    A site qualifies when ``Site.kinases.any()`` or
    ``Site.kinase_groups.any()`` holds.
    """
    linked_to_kinase = or_(
        models.Site.kinases.any(),
        models.Site.kinase_groups.any()
    )
    distinct_proteins = func.count(distinct(models.Site.protein_id))

    return db.session.query(distinct_proteins).filter(linked_to_kinase).scalar()


# Populate the module-level STATISTICS at import time, unless the
# application config switches loading off via LOAD_STATS.
if not current_app.config['LOAD_STATS']:
    print('Skipping loading statistics')
    STATISTICS = ''
else:
    stats = Statistics()
    print('Loading statistics')
    STATISTICS = stats.get_all()

0 comments on commit c64a125

Please sign in to comment.