R/class-scores.R

setOldClass(c("tbl_df", "tbl", "data.frame"))

#' An S4 class to represent a set of PGS Catalog Polygenic Scores
#'
#' The scores object consists of six tables (slots) that combined form a
#' relational database of a subset of PGS Catalog polygenic scores. Each score
#' is an observation (row) in the \code{scores} table (the first table).
#'
#' @slot scores A table of polygenic scores. Each polygenic score (row) is
#'   uniquely identified by the \code{pgs_id} column. Columns:
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier. Example: \code{"PGS000001"}.}
#' \item{pgs_name}{This may be the name that the authors describe the PGS with
#' in the source publication, or a name that a curator of the PGS Catalog has
#' assigned to identify the score during the curation process (before a PGS
#' identifier has been given). Example: \code{PRS77_BC}.}
#' \item{scoring_file}{URL to the scoring file on the PGS FTP server. Example:
#' \code{"http://ftp.ebi.ac.uk/pub/databases/spot/pgs/scores/PGS000001/ScoringFiles/PGS000001.txt.gz"}.}
#' \item{matches_publication}{Indicate if the PGS data matches the published
#' polygenic score (\code{TRUE}). If not (\code{FALSE}), the authors have
#' provided an alternative polygenic for the Catalog and some other data, such
#' as performance metrics, may differ from the publication.}
#' \item{reported_trait}{The author-reported trait that the PGS has been
#' developed to predict. Example: \code{"Breast Cancer"}.}
#' \item{trait_additional_description}{Any additional description not captured
#' in the other columns. Example: \code{"Femoral neck BMD (g/cm2)"}.}
#' \item{pgs_method_name}{The name or description of the method or computational
#' algorithm used to develop the PGS.}
#' \item{pgs_method_params}{A description of the relevant inputs and parameters
#' relevant to the PGS development method/process.}
#' \item{n_variants}{Number of variants used to calculate the PGS.}
#' \item{n_variants_interactions}{Number of higher-order variant interactions
#' included in the PGS.}
#' \item{assembly}{The version of the genome assembly that the variants present
#' in the PGS are associated with. Example: \code{GRCh37}.}
#' \item{license}{The PGS Catalog distributes its data according to EBI's
#' standard Terms of Use. Some PGS have specific terms, licenses, or
#' restrictions (e.g. non-commercial use) that we highlight in this field, if
#' known.}
#' }
#' @slot publications A table of publications. Each publication (row) is
#'   uniquely identified by the \code{pgp_id} column. Columns:
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier.}
#' \item{pgp_id}{PGS Publication identifier. Example: \code{"PGP000001"}.}
#' \item{pubmed_id}{\href{https://en.wikipedia.org/wiki/PubMed}{PubMed}
#' identifier. Example: \code{"25855707"}.}
#' \item{publication_date}{Publication date. Example: \code{"2020-09-28"}. Note
#' that the class of \code{publication_date} is \code{\link[base]{Date}}.}
#' \item{publication}{Abbreviated name of the journal. Example: \code{"Am J Hum
#' Genet"}.}
#' \item{title}{Publication title.}
#' \item{author_fullname}{First author of the publication. Example:
#' \code{'Mavaddat N'}.}
#' \item{doi}{Digital Object Identifier (DOI). This variable is also curated to
#' allow unpublished work (e.g. preprints) to be added to the catalog. Example:
#' \code{"10.1093/jnci/djv036"}.}
#' }
#' @slot samples A table of samples. Each sample (row) is uniquely identified by
#'   the combination of values from the columns: \code{pgs_id} and
#'   \code{sample_id}. Columns:
#' \describe{
#' \item{pgs_id}{Polygenic score identifier. An identifier that starts with
#' \code{'PGS'} and is followed by six digits, e.g. \code{'PGS000001'}.}
#' \item{sample_id}{Sample identifier. This is a surrogate key to identify each sample.}
#' \item{stage}{Sample stage: either \code{"discovery"} or \code{"training"}.}
#' \item{sample_size}{Number of individuals included in the sample.}
#' \item{sample_cases}{Number of cases.}
#' \item{sample_controls}{Number of controls.}
#' \item{sample_percent_male}{Percentage of male participants.}
#' \item{phenotype_description}{Detailed phenotype description.}
#' \item{ancestry_category}{Author reported ancestry is mapped to the best matching
#' ancestry category from the NHGRI-EBI GWAS Catalog framework (see
#' \code{\link[quincunx]{ancestry_categories}}) for possible values.}
#' \item{ancestry}{A more detailed description of sample ancestry
#' that usually matches the most specific description described by the authors
#' (e.g. French, Chinese).}
#' \item{country}{Author reported countries of recruitment (if available).}
#' \item{ancestry_additional_description}{Any additional description not
#' captured in the other columns (e.g. founder or genetically isolated
#' populations, or further description of admixed samples).}
#' \item{study_id}{Associated GWAS Catalog study accession identifier, e.g.,
#' \code{"GCST002735"}.}
#' \item{pubmed_id}{\href{https://en.wikipedia.org/wiki/PubMed}{PubMed}
#' identifier.}
#' \item{cohorts_additional_description}{Any additional description about the
#' samples (e.g. sub-cohort information).}
#' }
#' @slot demographics A table of sample demographics' variables. Each
#'   demographics' variable (row) is uniquely identified by the combination of
#'   values from the columns: \code{pgs_id}, \code{sample_id} and
#'   \code{variable}. Columns:
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier.}
#' \item{sample_id}{Sample identifier. This is a surrogate identifier to
#' identify each sample.}
#' \item{variable}{Demographics variable. Following columns report about the
#' indicated variable.}
#' \item{estimate_type}{Type of statistical estimate for variable.}
#' \item{estimate}{The variable's statistical value.}
#' \item{unit}{Unit of the variable.}
#' \item{variability_type}{Measure of statistical dispersion for variable, e.g.
#' standard error (se) or standard deviation (sd).}
#' \item{variability}{The value of the measure of dispersion.}
#' \item{interval_type}{Type of statistical interval for variable: range, iqr
#' (interquartile), ci (confidence interval).}
#' \item{interval_lower}{Interval lower bound.}
#' \item{interval_upper}{Interval upper bound.}
#' }
#' @slot cohorts A table of cohorts. Each cohort (row) is uniquely identified by
#'   the combination of values from the columns: \code{pgs_id}, \code{sample_id}
#'   and \code{cohort_symbol}. Columns:
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier.}
#' \item{sample_id}{Sample identifier. This is a surrogate key to identify each sample.}
#' \item{cohort_symbol}{Cohort symbol.}
#' \item{cohort_name}{Cohort full name.}
#' }
#' @slot traits A table of EFO traits. Each trait (row) is uniquely identified
#'   by the combination of the columns \code{pgs_id} and \code{efo_id}. Columns:
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier.}
#' \item{efo_id}{An \href{https://www.ebi.ac.uk/efo/}{EFO} identifier.}
#' \item{trait}{Trait name.}
#' \item{description}{Detailed description of the trait from EFO.}
#' \item{url}{External link to the EFO entry.}
#' }
#' @slot stages_tally A table of sample sizes and number of samples sets at each stage.
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier.}
#' \item{stage}{Sample stage: either \code{"gwas"}, \code{"dev"} or \code{"eval"}.}
#' \item{sample_size}{Sample size.}
#' \item{n_sample_sets}{Number of sample sets (only meaningful for the evaluation stage \code{"eval"})}
#' }
#' @slot ancestry_frequencies This table describes the ancestry composition at each stage.
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier.}
#' \item{stage}{Sample stage: either \code{"gwas"}, \code{"dev"} or \code{"eval"}.}
#' \item{ancestry_class_symbol}{Ancestry class symbol.}
#' \item{frequency}{Ancestry fraction (percentage).}
#' }
#' @slot multi_ancestry_composition A table of a breakdown of the ancestries included in multi-ancestries.
#' \describe{
#' \item{pgs_id}{Polygenic Score (PGS) identifier.}
#' \item{stage}{Sample stage: either \code{"gwas"}, \code{"dev"} or \code{"eval"}.}
#' \item{multi_ancestry_class_symbol}{Multi-ancestry class symbol.}
#' \item{ancestry_class_symbol}{Ancestry class symbol.}
#' }
#' @export
setClass(
  "scores",
  slots = c(
    scores = "tbl_df",
    publications = "tbl_df",
    samples = "tbl_df",
    demographics = "tbl_df",
    cohorts = "tbl_df",
    traits = "tbl_df",
    stages_tally = "tbl_df",
    ancestry_frequencies = "tbl_df",
    multi_ancestry_composition = "tbl_df"
  )
)

#' Constructor for the S4 scores object.
#'
#' Constructor for the S4 \linkS4class{scores} object.
#'
#' @param scores A \code{s4scores_scores_tbl} tibble.
#' @param publications A \code{s4scores_publications_tbl} tibble.
#' @param samples A \code{s4scores_samples_tbl} tibble.
#' @param demographics A \code{s4scores_demographics_tbl} tibble.
#' @param cohorts A \code{s4scores_cohorts_tbl} tibble.
#' @param traits A \code{s4scores_traits_tbl} tibble.
#'
#' @return An object of class \linkS4class{scores}.
#' @keywords internal
scores <-
  function(scores = s4scores_scores_tbl(),
           publications = s4scores_publications_tbl(),
           samples = s4scores_samples_tbl(),
           demographics = s4scores_demographics_tbl(),
           cohorts = s4scores_cohorts_tbl(),
           traits = s4scores_traits_tbl(),
           stages_tally = s4scores_stages_tally_tbl(),
           ancestry_frequencies = s4scores_ancestry_frequencies_tbl(),
           multi_ancestry_composition = s4scores_multi_ancestry_composition_tbl()) {

    s4_scores <- methods::new(
      "scores",
      scores = scores,
      publications = publications,
      samples = samples,
      demographics = demographics,
      cohorts = cohorts,
      traits = traits,
      stages_tally = stages_tally,
      ancestry_frequencies = ancestry_frequencies,
      multi_ancestry_composition = multi_ancestry_composition
    )

  return(s4_scores)
}

s4scores_scores_tbl <- function(pgs_id = character(),
                                pgs_name = character(),
                                scoring_file = character(),
                                matches_publication = logical(),
                                reported_trait = character(),
                                trait_additional_description = character(),
                                pgs_method_name = character(),
                                pgs_method_params = character(),
                                n_variants = integer(),
                                n_variant_interactions = integer(),
                                assembly = character(),
                                license = character()) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    pgs_name = pgs_name,
    scoring_file = scoring_file,
    matches_publication = matches_publication,
    reported_trait = reported_trait,
    trait_additional_description = trait_additional_description,
    pgs_method_name = pgs_method_name,
    pgs_method_params = pgs_method_params,
    n_variants = n_variants,
    n_variant_interactions = n_variant_interactions,
    assembly = assembly,
    license = license
  )

  return(tbl)
}

s4scores_publications_tbl <- function(
  pgs_id = character(),
  pgp_id = character(),
  pubmed_id = character(),
  publication_date = lubridate::ymd(),
  publication = character(),
  title = character(),
  author_fullname = character(),
  doi = character()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    pgp_id = pgp_id,
    pubmed_id = pubmed_id,
    publication_date = publication_date,
    publication = publication,
    title = title,
    author_fullname = author_fullname,
    doi = doi
  )

  return(tbl)
}

s4scores_samples_tbl <- function(
  pgs_id = character(),
  sample_id = integer(),
  stage = character(),
  sample_size = integer(),
  sample_cases = integer(),
  sample_controls = integer(),
  sample_percent_male = double(),
  phenotype_description = character(),
  ancestry_category = character(),
  ancestry = character(),
  country = character(),
  ancestry_additional_description = character(),
  study_id = character(),
  pubmed_id = character(),
  cohorts_additional_description = character()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    sample_id = sample_id,
    stage = stage,
    sample_size = sample_size,
    sample_cases = sample_cases,
    sample_controls = sample_controls,
    sample_percent_male = sample_percent_male,
    phenotype_description = phenotype_description,
    ancestry_category = ancestry_category,
    ancestry = ancestry,
    country = country,
    ancestry_additional_description = ancestry_additional_description,
    study_id = study_id,
    pubmed_id = pubmed_id,
    cohorts_additional_description = cohorts_additional_description
  )

  return(tbl)
}

s4scores_demographics_tbl <- function(
  pgs_id = character(),
  sample_id = integer(),
  estimate_type = character(),
  estimate = double(),
  interval_type = character(),
  interval_lower = double(),
  interval_upper = double(),
  variability_type = character(),
  variability = double(),
  unit = character()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    sample_id = sample_id,
    estimate_type = estimate_type,
    estimate = estimate,
    interval_type = interval_type,
    interval_lower = interval_lower,
    interval_upper = interval_upper,
    variability_type = variability_type,
    variability = variability,
    unit = unit
  )

  return(tbl)

}

s4scores_cohorts_tbl <- function(
  pgs_id = character(),
  sample_id = integer(),
  cohort_symbol = character(),
  cohort_name = character()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    sample_id = sample_id,
    cohort_symbol = cohort_symbol,
    cohort_name = cohort_name
  )

  return(tbl)

}

s4scores_traits_tbl <- function(
  pgs_id = character(),
  efo_id = character(),
  trait = character(),
  description = character(),
  url = character()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    efo_id = efo_id,
    trait = trait,
    description = description,
    url = url
  )
  return(tbl)
}

s4scores_stages_tally_tbl <- function(
  pgs_id = character(),
  stage = character(),
  sample_size = integer(),
  n_sample_sets = integer()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    stage = stage,
    sample_size = sample_size,
    n_sample_sets = n_sample_sets
  )
  return(tbl)
}

s4scores_ancestry_frequencies_tbl <- function(
  pgs_id = character(),
  stage = character(),
  ancestry_class_symbol = character(),
  frequency = double()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    stage = stage,
    ancestry_class_symbol = ancestry_class_symbol,
    frequency = frequency
  )
  return(tbl)
}

s4scores_multi_ancestry_composition_tbl <- function(
  pgs_id = character(),
  stage = character(),
  multi_ancestry_class_symbol = character(),
  ancestry_class_symbol = character()
) {

  tbl <- tibble::tibble(
    pgs_id = pgs_id,
    stage = stage,
    multi_ancestry_class_symbol = multi_ancestry_class_symbol,
    ancestry_class_symbol = ancestry_class_symbol
  )
  return(tbl)
}

coerce_to_s4_scores <- function(lst_tbl = NULL) {

  if (is.null(lst_tbl)) {
    s4_scores <- scores()
    return(s4_scores)
  }

    s4_scores <- scores(
      scores = lst_tbl$scores,
      publications = lst_tbl$publications,
      samples = lst_tbl$samples,
      demographics = lst_tbl$demographics,
      cohorts = lst_tbl$cohorts,
      traits = lst_tbl$traits,
      stages_tally = lst_tbl$stages_tally,
      ancestry_frequencies = lst_tbl$ancestry_frequencies,
      multi_ancestry_composition = lst_tbl$multi_ancestry_composition
    )

    s4_scores@scores <- drop_metadata_cols(s4_scores@scores)
    s4_scores@publications <- drop_metadata_cols(s4_scores@publications)
    s4_scores@samples <- drop_metadata_cols(s4_scores@samples)
    s4_scores@demographics <- drop_metadata_cols(s4_scores@demographics)
    s4_scores@cohorts <- drop_metadata_cols(s4_scores@cohorts)
    s4_scores@traits <- drop_metadata_cols(s4_scores@traits)
    s4_scores@stages_tally <- drop_metadata_cols(s4_scores@stages_tally)
    s4_scores@ancestry_frequencies <- drop_metadata_cols(s4_scores@ancestry_frequencies)
    s4_scores@multi_ancestry_composition <- drop_metadata_cols(s4_scores@multi_ancestry_composition)

  return(s4_scores)
}