diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..807ea25
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/LICENSE b/LICENSE
index 32879a0..6138526 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2015 Polynumeral
+Copyright (c) 2015 Academia.edu
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6090681
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+# Academia Citation Advantage Analysis
+
+The `acadcites` package contains the data and functions used in Niyazov et al., "Open Access Meets Discoverability: Citations to Articles Posted to Academia.edu."
+
+## Installing the R Package
+The easiest way to install the package and its dependencies is with `install_local` from the `devtools` package (http://cran.r-project.org/web/packages/devtools/index.html).
+
+- Clone the repo:
+
+    ```sh
+    git clone https://github.com/polynumeral/academia-citations
+    cd academia-citations
+    ```
+
+- From R:
+
+    ```{R}
+    install.packages('devtools')
+    devtools::install_local('acadcites_0.1.tar.gz')
+    ```
+
+## Importing data
+The cleaned/combined dataset used for the analyses can be obtained by calling:
+
+```{R}
+library('acadcites')
+cites <- importData()
+```
+
+or just `cites <- acadcites::importData()` without the `library` import.
+
+## Reproducing tables from the article
+
+Tables from the article can be reproduced with the `makeTable` function.
+
+```{R}
+# Make Table 2 from the article.
+makeTable(2, cites)
+
+# |Journal                                                 | # Articles| % Total|
+# |:-------------------------------------------------------|----------:|-------:|
+# |Analytical Chemistry                                     |      1,537|   3.44%|
+# |PLoS One                                                 |        492|   1.10%|
+# |Anesthesia and Analgesia                                 |        430|   0.96%|
+# |Biological and Pharmaceutical Bulletin                   |        362|   0.81%|
+# |Analytical Methods: advancing methods and applications   |        339|   0.76%|
+# |Analytical Biochemistry                                  |        317|   0.71%|
+# |Applied Mechanics and Materials                          |        303|   0.68%|
+# |Bioconjugate Chemistry                                   |        299|   0.67%|
+# |Applied Physics Letters                                  |        190|   0.43%|
+# |BioEssays                                                |        183|   0.41%|
+```
+
+
+## Reproducing figures from the article
+The `makeFigure` function reproduces figures from the article. Like `makeTable`,
+it takes a figure number and a citations data frame.
+
+```{R}
+makeFigure(1, cites)
+```
+
+
+## Package help
+
+See `help(package='acadcites')` for more help files on individual functions, or
+`vignette('acadcites')` for information similar to what's provided here.
diff --git a/acadcites/.Rbuildignore b/acadcites/.Rbuildignore
new file mode 100644
index 0000000..91114bf
--- /dev/null
+++ b/acadcites/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/acadcites/.gitignore b/acadcites/.gitignore
new file mode 100644
index 0000000..09a72cb
--- /dev/null
+++ b/acadcites/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+inst/doc
diff --git a/acadcites/DESCRIPTION b/acadcites/DESCRIPTION
new file mode 100644
index 0000000..eec1ac3
--- /dev/null
+++ b/acadcites/DESCRIPTION
@@ -0,0 +1,24 @@
+Package: acadcites
+Title: Manage data and models to study the effect of Academia.edu on citations.
+Version: 0.1
+Authors@R: "Carl Vogel [aut, cre]"
+Description: Manage data and run models to study the effect of posting to
+    Academia.edu on article citations.
+Depends: + R (>= 3.1.1) +License: MIT +LazyData: true +Imports: + dplyr, + magrittr, + stringr, + reshape2, + ggplot2, + MASS, + pscl, + knitr, + scales, + stargazer, + memisc, + pander +VignetteBuilder: knitr diff --git a/acadcites/NAMESPACE b/acadcites/NAMESPACE new file mode 100644 index 0000000..bb0a5a0 --- /dev/null +++ b/acadcites/NAMESPACE @@ -0,0 +1,29 @@ +exportPattern("^[^\\.]") +import(dplyr) +importFrom(stringr, str_detect) +importFrom(stringr, str_trim) +importFrom(stringr, str_replace_all) +importFrom(stringr, str_replace) +importFrom(magrittr, use_series) +importFrom(magrittr, set_colnames) +importFrom(magrittr, set_names) +importFrom(magrittr, extract) +importFrom(ggplot2, ggplot) +importFrom(ggplot2, aes) +importFrom(ggplot2, geom_boxplot) +importFrom(ggplot2, geom_histogram) +importFrom(ggplot2, geom_point) +importFrom(ggplot2, stat_quantile) +importFrom(ggplot2, facet_wrap) +importFrom(ggplot2, labs) +importFrom(ggplot2, xlim) +importFrom(ggplot2, scale_x_continuous) +importFrom(ggplot2, scale_y_continuous) +importFrom(ggplot2, scale_colour_manual) +importFrom(ggplot2, position_jitter) +importFrom(ggplot2, theme_bw) +importFrom(memisc, mtable) +importFrom(memisc, relabel) +S3method(getModelSummary, lm) +S3method(getModelSummary, glm) +S3method(getModelSummary, zeroinfl) diff --git a/acadcites/R/bucket_analysis.R b/acadcites/R/bucket_analysis.R new file mode 100644 index 0000000..4e3902a --- /dev/null +++ b/acadcites/R/bucket_analysis.R @@ -0,0 +1,108 @@ +# Functions for comparing citations within Impact Factor buckets + +#' Group a variable into buckets based on its quantiles or those of another +#' variable. +#' +#' @param x_quantile The variable to calculate quantile buckets from. +#' @param x_bucket The variable to collect into the quantile buckets. +#' @param nbuckets The number of quantile buckets to use. Specify this *or* +#' `probs`, but not both. +#' @param probs The vector of probabilities for the quantile bucket cut points. +#' Specify this *or* `nbuckets`, but not both. +#' @return A factor vector corresponding to `x_bucket` with bucket ranges. +#' If an element of `x_bucket` is outside of the range of `x_quantile`, its +#' bucket will be NA. +#' +quantileBuckets <- function(x_quantile, x_bucket=x_quantile, nbuckets=10, probs=NULL) { + if (!is.null(nbuckets) & is.null(probs)) { + breaks <- quantile(x_quantile, probs=0:nbuckets / nbuckets) + cut(x_bucket, breaks, include.lowest=TRUE) + } else if (!is.null(probs) & is.null(nbuckets)) { + cut(x_bucket, quantile(x_quantile, probs), include.lowest=TRUE) + } else { + stop('Only specify nbuckets or probs, not both.') + } +} + + +#' Compare on- and off-Academia citations within years and quantile groups +#' of journal impact factors. +#' +#' @param cites_df A dataframe with citations and impact factors. +#' @param summarizer (default mean) A function to summarize citations within groups. +#' @param comparator (default `/`) A function with arguments (on, off), that compares +#' on and off-Academia citation summaries. The default computes the on/off ratio. +#' @param ... Extra parameters to `quantileBuckets` +#' @return A dataframe with statistic by year, impact factor group, and on/off-source. +compareByImpactFactorBuckets <- function(cites_df, summarizer=mean, + comparator=`/`, ...) { + + # Find buckets based on distribution of on-Academia citations. + bucketFactors <- function(x) { + on_factors <- cites_df %>% filter(source=='on') %>% use_series(impact_factor) + quantileBuckets(on_factors, x, ...) 
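+        # Note: off-Academia articles whose impact factor falls outside the
+        # range of the on-Academia quantile buckets get an NA bucket here and
+        # are dropped by the filter() step below.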
+ } + + cites_df %>% + mutate(if_bucket = bucketFactors(impact_factor)) %>% + filter(!is.na(if_bucket)) %>% + group_by(if_bucket, year, source) %>% + summarize(cites=summarizer(citations)) %>% + reshape2::dcast(year + if_bucket ~ source, value.var='cites') %>% + mutate(comparison = comparator(on, off)) +} + +#' Average results over buckets, weighting by the number of on-Academia +#' articles in the bucket. +#' +#' @param cites_df A dataframe of citations with impact factors. +#' @return A dataframe of weighted average results by year. +#' +summarizeOverBuckets <- function(cites_df) { + cite_ratios <- compareByImpactFactorBuckets(cites_df, + summarizer=mean, + comparator=`/`) + counts <- compareByImpactFactorBuckets(cites_df, + summarizer=length, + comparator=`+`) + + weights <- counts %>% + group_by(year) %>% + mutate(weight = on / sum(on)) %>% + ungroup %>% + select(year, if_bucket, weight) + + cite_ratios %>% left_join(., weights, by=c('year', 'if_bucket')) %>% + group_by(year) %>% summarize(wtd_avg = sum(weight * comparison)) + +} + + +#' Boxplots on- and off-Academia citations within years and quantile groups +#' of journal impact factors. +#' +#' @param cites_df A dataframe with citations and impact factors. +#' @param ... Extra parameters to `quantileBuckets` +#' +#' @return A ggplot2 plot. +plotByImpactFactorBuckets <- function(cites_df, ...) { + + # Find buckets based on distribution of on-Academia citations. + bucketFactors <- function(x) { + on_factors <- cites_df %>% filter(source=='on') %>% use_series(impact_factor) + quantileBuckets(on_factors, x, ...) + } + + # Add bucket variable to data + df <- cites_df %>% + mutate(if_bucket = bucketFactors(impact_factor)) %>% + filter(!is.na(if_bucket)) + + p <- ggplot(df, aes(x=factor(year), y=citations, color=source)) + + geom_boxplot() + + facet_wrap(~if_bucket, ncol=2) + + labs(x='Year', y='Citations (log scale)', + title='Citations of On- and Off-Academia Articles By Year and Journal Impact Factor') + + theme_bw() + plotLogScale(p, xy='y') +} diff --git a/acadcites/R/figures.R b/acadcites/R/figures.R new file mode 100644 index 0000000..9b0ad8b --- /dev/null +++ b/acadcites/R/figures.R @@ -0,0 +1,61 @@ +# Reproduce figures +# +# Figures: +# -------- +# 1. Histograms over citations counts by off-/on-Academia +# 2. Citations boxplots by impact factor bucket and year of publication +# 3. Scatterplot of cites against impact factor +# 4. Scatterplot of cites against impact factor by off-/on-Academia and year. + +## Function names that produce figures, listed in order +## of their appearance in the paper. +.figures_functions = list( + 'plotCiteDistributions', + 'plotByImpactFactorBuckets', + 'plotCitesImpactFactorScatter', + 'plotImpactFactorMedReg') + + +#' Function to generate figures from the paper. +#' +#' Recreate a figure with a given citations dataset by specifying the table's +#' caption number in the paper. +#' +#' @param n Figure caption number. +#' @param cites_df A data frame with article citations and journal data, as produced by `importData`. +#' @param ... Optional arguments passed to figure functions. +#' +#' @return Nothing. Renders a plot. +#' +makeFigure <- function(n, cites_df, ...) { + eval(parse(text=.figures_functions[[n]]))(cites_df, ...) 
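+    # `n` indexes .figures_functions above, so only figure numbers 1-4 are valid.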
+} + + +plotCiteDistributions <- function(cites_df) { + ggplot(cites_df, aes(x=citations)) + + geom_histogram(binwidth=1, fill='steelblue', color='white') + + xlim(0, 100) + + facet_wrap(~source, scales='free_y') + + theme_bw() +} + +plotCitesImpactFactorScatter <- function(cites_df) { + p <- ggplot(cites_df, aes(x=impact_factor, y=citations)) + + geom_point(position=position_jitter(height=.1, width=.01), alpha=.3, size=.75) + + geom_smooth(method='lm') + + theme_bw() + + labs(x='Impact Factor (log scale)', y='Citations (log scale)') + plotLogScale(p, c('x', 'y')) +} + +plotImpactFactorMedReg <- function(cites_df) { + p <- ggplot(cites_df, aes(x=impact_factor, y=citations, color=source)) + + geom_point(position=position_jitter(height=.1, width=.01), alpha=.3, size=.75) + + facet_wrap(~year, ncol=2) + + stat_quantile(quantiles=0.5) + + scale_colour_manual(values=c('orange', 'purple')) + + labs(x='Impact Factor (log scale)', y='Citations (log scale)') + + theme_bw() + plotLogScale(p, c('x', 'y')) +} diff --git a/acadcites/R/import_data.R b/acadcites/R/import_data.R new file mode 100644 index 0000000..2eb0c81 --- /dev/null +++ b/acadcites/R/import_data.R @@ -0,0 +1,114 @@ +# Functions for importing and combining citation and journal data. + +#' Import and combine paper citations, paper downloads, and journal information. +#' +#' @param cites_path A path to a file with paper citation data for papers both +#' on and off Academia, +#' @param journals_path A path to a file with data on academic journals. +#' @param ... Other arguments to import and join functions. +#' +#' @return A dataframe of citation, download, and journal data merged together. +#' +importData <- function(cites_path=NULL, + journals_path=NULL, + ...) { + + if (is.null(cites_path)) { + cites_path <- system.file('extdata', 'papers.csv.gz', package='acadcites') + } + if (is.null(journals_path)) { + journals_path <- system.file('extdata', 'era_2012_journal_list.csv.gz', package='acadcites') + } + cites <- importCitesFile(cites_path, ...) + journals <- importEraFile(journals_path, ...) + joinCitesJournals(cites, journals, ...) +} + + +#' Estimate the age of a paper in years. +#' +#' Since we only have data on publication year, we approximate an article's +#' publication date as June 30 of its publication year. +#' +#' @param date_collected A Date vector of dates when the articles citations +#' were recorded. +#' @param published_year An integer vector of articles' publication years. +#' +#' @return A numeric vector of years between publication and citation +#' collection for each article. +#' +#' @examples +#' colln_dates = as.Date(c('2014-06-19', '2014-05-18', '2014-07-03')) +#' pub_years = c(2010, 2011, 2012) +#' paperAgeYears(colln_dates, pub_years) +#' +paperAgeYears <- function(date_collected, published_year) { + pubdate <- as.Date(paste0(published_year, '-06-30')) + as.numeric(date_collected - pubdate) / 365.25 +} + +# Import the article citations data. +importCitesFile <- function(cites_path, online_cutoff=0) { + read.csv(cites_path, header=TRUE, as.is=TRUE) %>% + mutate(source = ifelse(on_set == 't', 'on', 'off'), + on_acad = as.integer(source=='on'), + date_collected = as.Date(date_collected), + age = paperAgeYears(date_collected, year), + online = as.integer(avail_online_score > online_cutoff)) %>% + select(-on_set) +} + +# Import journal data from the Excellence in Research for Australia (ERA) +# initiative. 
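+# Example (the same file importData() uses by default):
+#   era_path <- system.file('extdata', 'era_2012_journal_list.csv.gz', package='acadcites')
+#   journals <- importEraFile(era_path)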
+importEraFile <- function(era_path) { + df <- read.csv(era_path, colClasses='character', header=TRUE, as.is=TRUE) + names(df) <- c('era_id', 'journal_title', 'foreign_title', + 'for1', 'for1_name', 'for2', 'for2_name', + 'for3', 'for3_name', 'issn1', 'issn2', 'issn3', + 'issn4', 'issn5', 'issn6', 'issn7') + + df %>% mutate_each(funs(str_trim)) + issns <- df %>% select(issn1, issn2, issn3, issn4, issn5, issn6, issn7) %>% toSetStr + topics <- df %>% select(for1_name, for2_name, for3_name) %>% + mutate_each(funs(str_trim)) %>% + toSetStr + + # Get the first two digits of a string number. + # If the string is only length 1, pad w/ a leading 0. + # '12345' -> '12' + # '6' -> '06' + first2Dig <- function(s) { + s <- ifelse(nchar(s) == 1, paste0('0', s), s) + substring(str_trim(s), 1,2) + } + + # Divisions are the first two digits in the + # 'Field of Research' code + divisions <- df %>% + select(for1, for2, for3) %>% + mutate_each(funs(first2Dig)) %>% + toSetStr + + data.frame(journal_title=df$journal_title, + issns=issns, + topics=topics, + divisions=divisions, stringsAsFactors=FALSE) +} + + +## A journal is a single row, with multiple ISSN values. +## Melt to multiple rows, one for each ISSN (with other +## journal data duplicated). Need single, unique ISSN values +## in table to merge w/ citations. +meltJournalsOnIssn <- function(journals_df) { + meltOnSets(journals_df, 'issns', prefix='issn') +} + +# Join articles to their journals. +joinCitesJournals <- function(cites_df, journals_df) { + melted_journals <- meltJournalsOnIssn(journals_df) + left_join(cites_df, + melted_journals %>% select(journal_title, issn, topics, divisions), + by='issn') +} + diff --git a/acadcites/R/mtable_defns.R b/acadcites/R/mtable_defns.R new file mode 100644 index 0000000..e137906 --- /dev/null +++ b/acadcites/R/mtable_defns.R @@ -0,0 +1,60 @@ +# Replace definitions of mtable getSummary functions so they return +# a subset of coefficients. + +memisc::setCoefTemplate('stat.nostar'=c(est='($est:#)', stat='(($stat:#))')) + +getModelSummary <- function(obj, alpha=0.5, ...) { + UseMethod('getModelSummary') +} + +getModelSummary.zeroinfl <- function(obj, alpha=0.5, ...) { + hurdle_summary <- memisc::getSummary.hurdle(obj, alpha, ...) + varnames <- hurdle_summary %>% use_series(coef) %>% rownames + tokeep <- varnames[!str_detect(varnames, '(Div\\.)')] + hurdle_summary$coef <- + hurdle_summary %>% + use_series(coef) %>% + extract(tokeep, , ) + + hurdle_summary +} + +getModelSummary.glm <- function(obj, alpha=0.5, ...) { + coef_tbl <- obj %>% summary %>% coef + varnames <- rownames(coef_tbl) + tokeep <- varnames[!str_detect(varnames, 'Div\\.')] + coefs <- cbind(coef_tbl[tokeep, ], c(NA), c(NA)) + colnames(coefs) <- c('est', 'se', 'stat', 'p', 'lwr', 'upr') + list(coef=coefs, + sumstat=c(N=length(obj$fitted.values), + AIC=AIC(obj), + logLik=obj$twologlik/2, + deviance=deviance(obj))) +} + +getModelSummary.lm <- function(obj, alpha=0.5, ...) 
{ + coef_tbl <- obj %>% summary %>% coef + varnames <- rownames(coef_tbl) + tokeep <- varnames[!str_detect(varnames, 'Div\\.')] + coefs <- cbind(coef_tbl[tokeep, ], c(NA), c(NA)) + colnames(coefs) <- c('est', 'se', 'stat', 'p', 'lwr', 'upr') + list(coefs=coefs, + sumstat=c(N=length(obj$fitted.values), + r.squared=summary(obj)$r.squared, + deviance=deviance(obj), + AIC=AIC(obj), + logLik=as.vector(logLik(obj)))) +} + +.cleanRegressionTable <- function(table) { + relabel(table, + `on_acad`='On-Academia', + `scale(log1p(impact_factor), scale = FALSE)`='Impact factor (log, centered)', + `scale(age, scale = FALSE)`='Article age (centered)', + `scale(I(age^2), scale = FALSE)` = 'Article age squared (centered)', + `online`='Available online', + `on_acad x scale(log1p(impact_factor), scale = FALSE)` = 'On-Academia \u00D7 Impact factor', + `on_acad x scale(age, scale = FALSE)` = 'On-Academia \u00D7 Age', + `on_acad x scale(I(age^2), scale = FALSE)`= 'On-Academia \u00D7 Age Squared', + `on_acad x online`='On-Academia \u00D7 Available online') +} diff --git a/acadcites/R/plot_utils.R b/acadcites/R/plot_utils.R new file mode 100644 index 0000000..6105370 --- /dev/null +++ b/acadcites/R/plot_utils.R @@ -0,0 +1,52 @@ +# Get the upper and lower range of a vector of data. +# Rounding the limits to the nearest hundredth +# (or less if the data is less than 100) +getRange <- function(v) { + maxv <- max(v) + maxpow10 <- maxv %>% + log10 %>% + floor %>% + min(., 2) %>% + (function(x) { 10^x }) + + ceil <- ceiling(maxv / maxpow10) * maxpow10 + c(0, ceil) +} + +# Make tickmarks from the range (min, max) of the data +getTicks <- function(range) { + minv <- range[1]; maxv <- range[2] + + ticks <- c(0, 5, 10, 50, 100, 200, 500, 1000) + ticks <- ticks[ticks >= minv && ticks <= maxv] + if (maxv > 1000) ticks <- c(ticks, seq(2000, max(maxv, 2000), by=1000)) + + ticks +} + +# Make a continuous log+1 scale for the x or y axis +# with tickmarks specified from the data. +makeScale <- function(plot, axis=c('x', 'y')) { + ticks <- plot$mapping[[axis]] %>% + as.character %>% + `[[`(plot$data, .) %>% + getRange %>% + getTicks + + switch(axis, + 'x' = scale_x_continuous(trans='log1p', breaks=ticks), + 'y' = scale_y_continuous(trans='log1p', breaks=ticks)) +} + +#' Scale a plot's axis or axes using a x->log(1+x) transformation. +#' +#' @param plot A ggplot2 plot with a defined aesthetic mapping +#' @param xy Axes to scale: 'x', 'y' or c('x', 'y') +#' @return A new ggplot2 plot with scaled axes. The axis labels +#' will still reflect the raw data. +plotLogScale <- function(plot, xy=c('x', 'y', c('x', 'y'))) { + # Add scales to plot + if ('x' %in% xy) plot <- plot + makeScale(plot, 'x') + if ('y' %in% xy) plot <- plot + makeScale(plot, 'y') + plot +} diff --git a/acadcites/R/research_divisions.R b/acadcites/R/research_divisions.R new file mode 100644 index 0000000..67158c3 --- /dev/null +++ b/acadcites/R/research_divisions.R @@ -0,0 +1,76 @@ +# ANZSRC Research 'Division' labels used in ERA data. 
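+# A journal's `divisions` field is a set string of these two-digit codes as
+# built by importEraFile(), e.g. "{'03','11'}" for Chemical Sciences and
+# Medical and Health Sciences.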
+# See: # https://researchservices.anu.edu.au/ore/data/reference/for-codes.php +.divisionNames <- function() { + list( + '01' = 'Mathematical Sciences', + '02' = 'Physical Sciences', + '03' = 'Chemical Sciences', + '04' = 'Earth Sciencess', + '05' = 'Environmental Sciences', + '06' = 'Biological Sciences', + '07' = 'Agricultural and Veterinary Sciences', + '08' = 'Information and Computing Sciences', + '09' = 'Engineering', + '10' = 'Technology', + '11' = 'Medical and Health Sciences', + '12' = 'Built Environment and Design', + '13' = 'Education', + '14' = 'Economics', + '15' = 'Commerce, Management, Tourism and Services', + '16' = 'Studies in Human Society', + '17' = 'Psychology and Cognitive Sciences', + '18' = 'Law and Legal Studies', + '19' = 'Creative Arts and Writing', + '20' = 'Language, Communication and Culture', + '21' = 'History and Archaeology', + '22' = 'Philosophy and Religious Studies', + 'MD' = 'Multidisciplinary') +} + +# Identify the unique division labels from a list of divisions. +getUniqueDivisions <- function(div_list) { + div_list %>% do.call(c, .) %>% (function(x) x[!is.na(x)]) %>% unique %>% sort +} + +# Find whether a specific division code occurs in a string of multiple +# division codes. Returns 0/1. +inDivision <- function(divisions, division_code) { + str_detect(divisions, paste0('\\b', division_code, '\\b')) %>% + as.integer +} + +#' Append columns of research division dummy variables to the citations +#' data frame. +#' +#' One column is added for each division represented by any article in +#' the citations data frame. The columns have 0/1 values, indicating which +#' articles are in that research division. +#' +#' @param cites_df The citations data frame. +#' +#' @return The original citations dataframe with the division columns added. +#' +addDivisions <- function(cites_df) { + divnames <- .divisionNames() + + divvecs <- divnames %>% + names %>% + as.list %>% + lapply(function(x) { inDivision(cites_df$divisions, x)}) %>% + do.call(cbind, .) %>% + set_colnames(paste0('Div. ', divnames %>% unlist)) %>% + cbind(cites_df) + +} + +#' Select the FoR division data columns from a data frame +#' +#' @param df A dataframe with division columns. +#' @param pattern (default "^div\\.") A regex pattern +#' for detecting division column names. +#' +#' @return A dataframe of only the division columns. +#' +selectDivisionCols <- function(df, pattern="Div\\.") { + df[ , str_detect(names(df), pattern)] +} diff --git a/acadcites/R/run_r_models.R b/acadcites/R/run_r_models.R new file mode 100644 index 0000000..aa5f74f --- /dev/null +++ b/acadcites/R/run_r_models.R @@ -0,0 +1,130 @@ +#' Create a formula describing a regression of citations on covariates. +#' +#' Creates a formula of the form `y ~ ...` where `y` is the citation +#' count variable, possibly transformed, and `...` are covariates and their +#' interactions. +#' +#' The linear model log-transforms the citations variable, while the negative +#' binomial and ZINB models do not. The logistic model transforms citations +#' into a 0/1 variable where 1 indicates the article received at least one +#' citation. +#' +#' @param df A dataframe with citations data, and divisions as created by +#' `importData` followed by `addDivisions` +#' @param model One of 'linear', 'logistic', 'negbin', or 'zinb', the last for +#' zero-inflated negative binomial model. +#' @param division_interactions If TRUE (default), interact each division dummy +#' with the on-Academia dummy. +#' +#' @return An R formula. 
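+#'
+#' @examples
+#' \dontrun{
+#' # Sketch: build the negative binomial formula from the packaged data
+#' # (assumes importData() and addDivisions() have been run successfully).
+#' cites_divs <- addDivisions(importData())
+#' frm <- makeFormula(cites_divs, model='negbin')
+#' }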
+#' +makeFormula <- function(df, + model=c('linear', 'logistic', 'negbin', 'zinb'), + division_interactions=TRUE) { + + division_terms <- df %>% + selectDivisionCols %>% + names %>% + (function(x) if (length(x) == 0) { stop('No divisions found.') } else { x }) %>% + (function(x) { paste0('`', x, '`')}) %>% + paste(collapse=' + ') + + rhs <- if (division_interactions) { + paste0('on_acad * (scale(log1p(impact_factor), scale=FALSE) + + scale(age, scale=FALSE) + + scale(I(age^2), scale=FALSE) + + online + ', + division_terms, ')') + } else { + # Leave out on_acad * division interatctions. + paste0('on_acad * (scale(log1p(impact_factor), scale=FALSE) + + scale(age, scale=FALSE) + + scale(I(age^2), scale=FALSE) + + online) + ', + division_terms) + } + + model = match.arg(model) + lhs = switch(model, + 'linear' = 'log1p(citations)', + 'logistic' = 'I(citations <= 0)', + 'negbin' = 'citations', + 'zinb' = 'citations', + # Default to raw citation counts. + 'citations') + formula(paste(lhs, '~', rhs)) +} + + +#' Fit a linear model of citations using lm. +#' +#' @param df A dataframe with citations data, and +#' divisions as created by `importData` followed by +#' `addDivisions` +#' +#' @param ... Extra arguments to `lm` +#' +#' @return An `lm` object. +#' +runLinearModel <- function(df, ...) { + frm <- makeFormula(df, model='linear', ...) + lm(frm, data=df, ...) +} + + +#' Fit a Logistic model of citations using glm. +#' +#' @param df A dataframe with citations data, and +#' divisions as created by `importData` followed by +#' `addDivisions` +#' @param ... Extra arguments to `glm`. +#' +#' @return A fitted GLM object. +#' +runLogisticModel <- function(df, ...) { + frm <- makeFormula(df, model='logistic', ...) + glm(frm, df, family=binomial(link=logit)) +} + + +#' Fit a Negative Binomial model of citations using glm. +#' +#' @param df A dataframe with citations data, and +#' keyword factors as created by `importData` followed by +#' `addDivisions` +#' @param ... Extra arguments to `MASS::glm.nb` +#' +#' @return A fitted GLM object. +#' +runNegBinModel <- function(df, ...) { + frm <- makeFormula(df, model='negbin', ...) + MASS::glm.nb(frm, df, link='log') +} + + +#' Fit a Zero Inflated Negative Binomial model of +#' citations. +#' +#' Uses `pscl::zeroinfl` to fit the model. For parameters initial guesses +#' we use the coefficients from a logistic regression +#' (for the 'zero' parameters) and from a negative binomial regression +#' (for the 'count' parameters). +#' +#' @param df A dataframe with citations data, and +#' divisions as created by `importData` followed by +#' `addDivisions` +#' @param ... Extra arguments to `pscl::zeroinfl` +#' +#' @return A fitted `zeroinfl` object. +#' +runZeroInflNegBinGLM <- function(df, ...) { + logit_coef <- runLogisticModel(df, division_interactions=FALSE, ...)$coef + negbin_coef <- runNegBinModel(df, division_interactions=FALSE, ...)$coef + + frm <- makeFormula(df, model='zinb', division_interactions=FALSE) + + starts = pscl::zeroinfl.control(start=list(count=negbin_coef, + zero=logit_coef)) + + pscl::zeroinfl(frm, df, dist='negbin', control=starts, ...) +} diff --git a/acadcites/R/set_melting.R b/acadcites/R/set_melting.R new file mode 100644 index 0000000..eadbef7 --- /dev/null +++ b/acadcites/R/set_melting.R @@ -0,0 +1,110 @@ +# Functions for dealing with string-set data. 
+# +# Many variables contain 'string' sets, which are lists attributes +# in a string of the form: '{sandwich, apple, "chocolate milk"}' +# +# These functions parse and manipulate the data w/in the string. + + +#' Combine columns of a dataframe into a string set representation. +#' +#' Note that the uniqueness of the set is not ensured in this +#' function. If a row of the input data frame has the same data +#' in multiple columns, the resulting set representation will +#' have non-unique elements. +#' +#' @param df A data frame whose columns will be combined into a set. +#' +#' @return A character vector of string representation forms. +#' +#' @examples +#' myDF <- data.frame(x=c(1,2,3), y=c('a', 'b', 'c')) +#' toSetStr(myDF) # {'1', 'a'} {'2', 'b'} {'3', 'c'} +#' +toSetStr <- function(df) { + df %>% + mutate_each(funs(sQuote)) %>% + do.call(function(...) paste(..., sep=','), .) %>% + # Nix all the curly quotes + str_replace_all('[\u2018\u2019]', '\'') %>% + str_replace_all('[\u201C\u201D]', '\"') %>% + str_replace_all("(,'NA')|(,'NA'$)|(,'')|(,''$)", '') %>% + (function(x) paste0('{', x, '}')) +} + + +#' Parse a vectors of string representations of sets to a list of vectors +#' +#' @param set_str A character vector of string representations of sets. +#' @param cast_to The name of a data type to cast the elements of the set to. +#' @param prefix A prefix to name columns in the data frame (default 'v'). +#' +#' @return A data frame with columns equal to the largest set's cardinality. +#' +#' @examples +#' setParse(c('{1, 2, 3}', '{2, 3}'), 'integer') +#' # v1 v2 v3 +#' # 1 1 2 3 +#' # 2 2 3 NA +#' setParse(c('{a, b}', '{}')) +#' # v1 v2 +#' # 1 a b +#' +setParse <- function(set_str, cast_to=NULL, prefix='v') { + set <- set_str %>% str_replace_all('^\\{|\\}$', '') + max_els <- max(set %>% textConnection %>% count.fields(sep=', ')) + + .unquote <- function(s) { s %>% str_replace_all('[\'\"]', '') %>% str_trim } + + set <- set %>% + textConnection %>% + read.csv(header=FALSE, colClasses='character', as.is=TRUE, encoding='utf-8', + quote="\"'", + col.names=paste0(prefix, 1:max_els)) %>% + mutate_each(funs(.unquote)) + + # Remove empty sets + set[set == ''] <- NA + + # Cast to data type + if (!is.null(cast_to)) set <- set %>% + sapply(function(s) { as(s, cast_to) }) %>% + as.data.frame + set +} + +dfToList <- function(df) { + as.data.frame(t(df), stringsAsFactors=FALSE) %>% + lapply(function(x) { x[!is.na(x)] }) %>% + set_names(NULL) +} + + +#' Repeat rows of a dataframe for each element in a set. +#' +#' @param df A dataframe +#' @param col A column in dataframe that is a character vector containing string +#' representations of sets. +#' @param prefix A string to name the column of set elements +#' @param ... Other arguments to pass to `setParse`. +#' +#' @return A 'molten' dataframe. +#' +#' @examples +#' mydf <- data.frame(x=c(1, 2), ids=c('{1,2,3}', '{5, 10}')) +#' meltOnSets(mydf, 'ids', prefix='id', cast_to='integer') +#' # x ids variable id +#' # 1 1 {1,2,3} id_1 1 +#' # 2 2 {5, 10} id_1 5 +#' # 3 1 {1,2,3} id_2 2 +#' # 4 2 {5, 10} id_2 10 +#' # 5 1 {1,2,3} id_3 3 +meltOnSets <- function(df, col, prefix, ...) { + set_df <- df[ , col] %>% setParse(., prefix=prefix, ...) + + cbind(df, set_df) %>% + reshape2::melt(na.rm=TRUE, + measure.vars=names(set_df), + value.name=prefix, ...) 
+} + diff --git a/acadcites/R/tables.R b/acadcites/R/tables.R new file mode 100644 index 0000000..4920800 --- /dev/null +++ b/acadcites/R/tables.R @@ -0,0 +1,489 @@ +## Functions to create tables in the paper + +## List of tables +## -------------- +## 1. Sample sizes by cohort and on-/off-Academia +## 2. Top journals by # articles in sample +## 3. Top impact factor journals and their topics +## 4. Divisions, sample %s, and impact factors +## 5. Share of sample articles available elsewhere online by on-/off-Academia +## 6. Citations Summary Statistics +## 7. Impact Factor bin table +## 8. Regression citations against impact factors +## 9. Summary Statistics for Regression Covariates +## 10. Regression results for Linear and Neg.Binomial models +## 11. Share of uncited articles +## 12. Regression results for ZINB model +## 13. Predicted Citation lookup tables +## 14. Predicted Advantage lookup tables +## 15. Predicted Advantage by Division. + +## Function names that produce tables, listed in order +## of their appearance in the paper. +.tables_functions = list( + 'sampleSizes', # Table 1 + 'topJournalsByCount', # Table 2 + 'topJournalsByImpactFactor', # Table 3 + 'divisionSharesCombined', # Table 4 + 'availableOnlineShares', # Table 5 + 'citationSummaryStats', # Table 6 + 'impactFactorBinsMedians', # Table 7 + 'citesImpactFactorRegression', # Table 8 + 'covariateSummaryStats', # Table 9 + 'regressionResults', # Table 10 + 'shareUncitedArticles', # Table 11 + 'zinbResults', # Table 12 + 'predictedCitations', # Table 13 + 'predictedAdvantages', # Table 14 + 'divisionPredictedCitations' # Table 15 + ) + + +#' Function to generate tables from the paper. +#' +#' Recreate a table, in markdown format with a given citations +#' dataset by specifying the table's caption number in the paper. +#' +#' @param n Table caption number. +#' @param cites_df A data frame with article citations and journal data, as produced by `importData`. +#' @param ... Optional arguments passed to table functions. +#' +#' @return Nothing. Prints a table, usually in markdown, but possibly plain text or LaTeX. +#' +makeTable <- function(n, cites_df, ...) { + eval(parse(text=.tables_functions[[n]]))(cites_df, ...) +} + + +# Table 1. Sample size by cohort and on/off-Academia +sampleSizes <- function(cites_df) { + counts <- cites_df %>% + group_by(year, source) %>% + summarize(count=n()) %>% + reshape2::dcast(year~source) + + totals <- counts %>% select(-year) %>% colSums %>% c(NA, .) + + rbind(counts, totals) %>% + mutate(on=format(on, format='d', big.mark=','), + off=format(off, format='d', big.mark=',')) %>% + knitr::kable(col.names=c('Year', 'On-Academia', 'Off-Academia'), + align=c('l', 'r', 'r')) +} + +# Table 2. Most represented journals in the samples +topJournalsByCount <- function(cites_df) { + cites_df %>% + group_by(journal_title) %>% + summarize(count=n()) %>% + mutate(pct=count/sum(count)) %>% + arrange(-count) %>% + slice(1:10) %>% + mutate(count=format(count, format='d', big.mark=','), + pct=scales::percent(pct)) %>% + knitr::kable(col.names=c('Journal', '# Articles', '% Total'), + align=c('l', 'r', 'r')) +} + + +# Table 3. 
Journals with highest impact factors +topJournalsByImpactFactor <- function(cites_df) { + cites_df %>% + group_by(journal_title) %>% + summarize(impact_factor=max(impact_factor), topics=first(topics)) %>% + arrange(-impact_factor) %>% + mutate(topics=topics %>% + str_replace_all(., '\\"|\\{|\\}', '') %>% + str_replace_all(., ',', ', ')) %>% + slice(1:10) %>% + ## User pander here instead of kable; pander will split + ## wide column contents into multiple lines to keep table + ## narrow. Have to manually name and align columns, though. + pander::pander() +} + +# Compute % of sample by division for a given on/off-Academia sample, +# or the full sample. +divisionShares <- function(cites_divs, sample=c('on', 'off', 'all')) { + if (sample %in% c('on', 'off')) { + cites_divs <- cites_divs %>% filter(source==sample) + } + + cites_divs %>% + selectDivisionCols %>% + sapply(mean) %>% + (function(x) { data.frame(division=str_replace(names(x), '^Div\\.\\ ', ''), + pct_articles=x, + stringsAsFactors=FALSE) }) %>% + set_names(c('division', paste0('pct_articles_', sample))) +} + +# Compute median impact factor by division. +divisionImpactFactors <- function(cites_divs) { + divnames <- cites_divs %>% selectDivisionCols %>% names + + div_meds <- lapply(as.list(divnames), + function(d) { median(cites_divs[cites_divs[ , d] == 1, + 'impact_factor'], na.rm=TRUE) } ) + data.frame(division=str_replace(divnames, '^Div\\.\\ ', ''), + med_impact_factor=unlist(div_meds)) +} + +# Table 4. Sample shares and impact factors by division. +divisionSharesCombined <- function(cites_df) { + cites_divs <- cites_df %>% addDivisions + + on_shares <- divisionShares(cites_divs, 'on') + off_shares <- divisionShares(cites_divs, 'off') + all_shares <- divisionShares(cites_divs, 'all') + imp_factors <- divisionImpactFactors(cites_divs) + + all_shares %>% + left_join(on_shares, by='division') %>% + left_join(off_shares, by='division') %>% + left_join(imp_factors, by='division') %>% + arrange(-pct_articles_all) %>% + mutate(pct_articles_all=scales::percent(pct_articles_all), + pct_articles_on=scales::percent(pct_articles_on), + pct_articles_off=scales::percent(pct_articles_off)) %>% + knitr::kable(col.names=c('Division', + '% All', + '% On', + '% Off', + 'Med. Imp. Factor'), + align=c('l', 'r', 'r', 'r', 'r'), digits=2) +} + +# Table 5. Share of available-online by sample. +availableOnlineShares <- function(cites_df, cutoff=0) { + cites_df <- cites_df %>% filter(!is.na(avail_online_score)) + cites_df %>% + filter(!is.na(avail_online_score)) %>% + mutate(available_online = avail_online_score > cutoff) %>% + group_by(source, available_online) %>% + summarize(available=n()) %>% + filter(available_online) %>% + select(-available_online) %>% + left_join(cites_df %>% group_by(source) %>% summarize(total=n())) %>% + mutate(pct=scales::percent(available/total), + available=format(available, format='d', big.mark=','), + total=format(total, format='d', big.mark=',')) %>% + t %>% + `[`(-1,) %>% + as.data.frame %>% + `names<-`(c('Off-Academia', 'On-Academia')) %>% + knitr::kable(align=c('r', 'r')) +} + +# Table 6. Summary stats for citations counts. +citationSummaryStats <- function(cites_df) { + with(cites, tapply(citations, list(source), summary)) %>% + do.call(rbind, .) %>% + knitr::kable() +} + +# Table 7. Median citations of on- and off-Academia by impact factor bin. 
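+# (Reproduce with makeTable(7, cites). Bins are quantile buckets of journal
+# impact factor computed from the on-Academia sample; see
+# compareByImpactFactorBuckets() in bucket_analysis.R.)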
+impactFactorBinsMedians <- function(cites_df) { + pctDiff <- function(on, off) { round(100*(on-off)/off, 0)} + + abs_diffs <- compareByImpactFactorBuckets(cites, median, `-`) + pct_diffs <- compareByImpactFactorBuckets(cites, median, pctDiff) + + tbl <- left_join(abs_diffs %>% rename(abs_diff=comparison), + pct_diffs %>% + select(-off, -on) %>% + rename(pct_diff=comparison), + by=c('year', 'if_bucket')) + + knitr::kable(tbl, + col.names=c('Year', 'Impact Factor Bin', + 'Off-Academia', 'On-Academia', + 'Abs. Diff', '% Diff.'), + align='r') +} + +# Table 8. Regression of citations on impact factor. +citesImpactFactorRegression <- function(cites_df) { + mdl <- lm(log1p(citations) ~ log1p(impact_factor), cites_df) + mdl %>% stargazer::stargazer(type='latex', + keep.stat=c('n', 'rsq'), + star.cutoffs=NA, + dep.var.caption='', + dep.var.labels='Citations (log scale)', + covariate.labels=c('Impact Factor (log scale)', 'Intercept'), + omit.table.layout='n') +} + +# Table 9. Summary stats for regression covariates (exluding divisions). +covariateSummaryStats <- function(cites_df) { + cleanVarName <- function(s) { + s %>% str_replace_all('_', ' ') %>% + (function(s) {paste0(toupper(substring(s, 1,1)), + substring(s,2)) }) + } + getStats <- function(var) { + v <- cites_df %>% select_(var) %>% unlist + data.frame(var=cleanVarName(var), + mean=mean(v, na.rm=TRUE), + median=median(v, na.rm=TRUE), + stdev=sd(v, na.rm=TRUE)) + } + + lapply(list('age', 'impact_factor', 'online'), getStats) %>% + do.call(rbind, .) %>% + knitr::kable(col.names=c('', 'Mean', 'Median', 'Std. Dev.'), + align=c('l', 'r', 'r', 'r'), digits=2) +} + +covariatesSummaryStatsBySample <- function(cites_df) { + cleanVarName <- function(s) { + s %>% str_replace_all('_', ' ') %>% + (function(s) {paste0(toupper(substring(s, 1,1)), + substring(s,2)) }) + } + getStats <- function(var, sample) { + v <- cites_df %>% filter(source == sample) %>% select_(var) %>% unlist + data.frame(source=sample, + var=cleanVarName(var), + mean=mean(v, na.rm=TRUE), + median=median(v, na.rm=TRUE), + stdev=sd(v, na.rm=TRUE)) + } + + on <- lapply(list('age', 'impact_factor', 'online'), + function(x) getStats(x, 'on')) %>% + do.call(rbind, .) + off <- lapply(list('age', 'impact_factor', 'online'), + function(x) getStats(x, 'off')) %>% + do.call(rbind, .) + rbind(on, off) %>% + + knitr::kable(col.names=c('', 'Mean', 'Median', 'Std. Dev.'), + align=c('l', 'r', 'r', 'r'), digits=2) +} + +# Table 10. Linear and Neg. Bin. regression results. +regressionResults <- function(cites_df) { + cat('Preparing Data . . . ') + cites_divs <- addDivisions(cites_df) + cat('DONE.\n') + + cat('Fitting linear model . . . ') + linfit <- runLinearModel(cites_divs) + cat('DONE.\n') + + cat('Fitting Neg. Bin. model . . . ') + nbfit <- runNegBinModel(cites_divs) + cat('DONE.\n') + + memisc::setCoefTemplate('stat.nostar'=c(est='($est:#)', stat='(($stat:#))')) + + mtable('Linear'=linfit, 'Neg. Binom.'=nbfit, + coef.style='stat.nostar', + getSummary = getModelSummary, + summary.stats=c('N', 'R-squared', 'Deviance', + 'Log-likelihood', 'AIC')) %>% + .cleanRegressionTable %>% + pander::pander(style='grid', emphasize.rownames=FALSE) +} + +# Table 11. 
Share of uncited articles by cohort and off-/on-Academia +shareUncitedArticles <- function(cites_df) { + cites_df %>% + mutate(uncited=citations==0) %>% + group_by(source, year, uncited) %>% + summarize(n=n()) %>% + group_by(year, source) %>% + mutate(pct=n/sum(n)) %>% + filter(uncited) %>% + select(source, year, pct) %>% + reshape2::dcast(year ~ source) %>% + mutate(on=scales::percent(on), off=scales::percent(off)) %>% + knitr::kable(col.names=c('Year', 'Off-Academia', 'On-Academia'), + align='r') +} + +# Table 12. ZINB Regression results. +zinbResults <- function(cites_df) { + cat('Preparing Data . . . ') + cites_divs <- addDivisions(cites_df) + cat('DONE.\n') + + cat('Fitting zero-inflated Neg. Bin. model . . . ') + zinbfit <- runZeroInflNegBinGLM(cites_divs) + cat('DONE.\n') + + zinbfit %>% + mtable(coef.style='stat.nostar', getSummary=getModelSummary, + summary.stats=c('N', 'Log-likelihood', 'AIC')) %>% + .cleanRegressionTable + +} + + +# Make hypothetical input data with which to predict citations. +makeCases <- function(cites_divs, ...) { + impfs <- quantile(cites_divs$impact_factor, probs=c(.1, .5, .9)) + divisions <- cites_divs %>% + selectDivisionCols %>% + summarise_each(funs(mean)) + + case_df = data.frame( + on_acad=NULL, + impact_factor=NULL, + age=NULL, + online=NULL) + + # Impact factors 10%, 50%, 90%-iles + for (i in impfs) { + # Paper age 1-5 years + for (a in 1:5) { + # Off and on-Academia + for (oa in c(0, 1)) { + # Unavailable and available online elsewhere + for (on in c(0,1)) { + case <- data.frame(on_acad=oa, + impact_factor=i, + age=a, + online=on) + case_df <- rbind(case_df, case) + } + } + } + } + cbind(case_df, divisions) +} + +# Predict citations from the hypothetical data +# using Linear, NB, and ZINB models +predictCases <- function(cites_df) { + + cat('Preparing Data . ') + cites_divs <- addDivisions(cites_df) + cat('DONE\n') + + cat('Fitting linear model . . . ') + linfit <- runLinearModel(cites_divs) + cat('DONE.\n') + cat('Fitting Neg. Bin. model . . . ') + nbfit <- runNegBinModel(cites_divs) + cat('DONE.\n') + cat('Fitting ZINB model . . . ') + zinbfit <- runZeroInflNegBinGLM(cites_divs) + cat('DONE.\n') + + cases <- makeCases(cites_divs) + + preds_lin = predict(linfit, cases) %>% expm1 + preds_nb = predict(nbfit, cases, type='response') + preds_zinb = predict(zinbfit, cases, type='response') + + list(preds_lin, preds_nb, preds_zinb) %>% + do.call(cbind, .) %>% + as.data.frame %>% + set_names(c('linear', 'negbin', 'zinb')) %>% + cbind(cases, .) %>% + reshape2::melt(id.vars=c('on_acad', 'age', 'impact_factor', 'online'), + measure.vars=c('linear', 'negbin', 'zinb'), + variable.name='model', + value.name='citations') %>% + mutate(on_acad=factor(on_acad, labels=c('N', 'Y')), + online=factor(online, labels=c('N', 'Y')), + impact_factor=factor(impact_factor, labels=c('10th', + '50th', + '90th'))) + +} + +# Compute predicted citation advantages. +# The advantage is defined as: +# E(Cites_i | X_i, on-Academia, not-online) / E(Cites_i | X_i, off-Academia, not-online) - 1 +advantagePreds <- function(preds) { + preds %>% group_by(model, impact_factor, age) %>% + mutate(advantage=citations / first(citations)-1) + +} + +# Format a dataframe of predictions as a table. +makePredictionTable <- function(preds) { + preds %>% + reshape2::dcast(model + impact_factor + on_acad + online ~ age) %>% + knitr::kable(col.names=c('Model', 'IF Pctile', + 'On-Academia', 'Online', '1 Year', + '2 Years', '3 Years', '4 Years', '5 Years'), + digits=2) +} + +# Table 13. 
Lookup table of predicted citations for different models and +# inputs. +predictedCitations <- function(cites_df) { + cites_df %>% predictCases %>% makePredictionTable +} + +# Table 14. Lookup table of predicted advantages. +predictedAdvantages <- function(cites_df) { + cites_df %>% predictCases %>% advantagePreds %>% makePredictionTable +} + +# Create rows of hypothetical articles each from one research division. +makeDivisionCases <- function(cites_divs) { + cases <- makeCases(cites_divs) + divnames <- cites_divs %>% selectDivisionCols %>% names + + makeDivCase <- function(div, cases) { + cases[ , divnames] <- 0 + cases[ , div] <- 1 + cases$division = str_replace(div, '^Div\\.\\ ', '') + cases + } + + lapply(as.list(divnames), function(x) makeDivCase(x, cases)) %>% do.call(rbind, .) +} + +# Table 15. Predicted citations and advantage by division, for a 5-year-old +# paper, not online, and at the median impact factor for the given division. +divisionPredictedCitations <- function(cites_df) { + cites_divs <- addDivisions(cites_df) + linfit <- runLinearModel(cites_divs) + + div_impfs <- cites_divs %>% + divisionImpactFactors %>% + mutate(division = paste0('Div. ', division)) %>% + rename(impact_factor=med_impact_factor) %>% + reshape2::dcast(division + impact_factor ~ division, length, value.var='impact_factor') + + + cases <- list(0, 1) %>% + lapply(function(x) {div_impfs %>% mutate(on_acad = x, + online = 0, + age = 5) }) %>% + do.call(rbind, .) + + preds <- predict(linfit, cases) %>% expm1 + + shares <- divisionShares(cites_divs, 'on') %>% + left_join(divisionShares(cites_divs, 'off'), by='division') %>% + set_names(c('division', 'share_on', 'share_off')) + + cases %>% + cbind(preds) %>% + mutate(on_acad=ifelse(on_acad==1, 'Y', 'N')) %>% + select(division, impact_factor, on_acad, preds) %>% + mutate(division = str_replace(division, '^Div\\.\\ ', '')) %>% + left_join(shares, by='division') %>% + reshape2::dcast(division + impact_factor + share_on + share_off ~ on_acad, + value.var='preds') %>% + mutate(share_on=scales::percent(share_on), + share_off=scales::percent(share_off), + difference=Y-N, + advantage=difference / N) %>% + arrange(-advantage) %>% + mutate(advantage=scales::percent(advantage)) %>% + knitr::kable(digits=2, + align=c('l', 'r', 'r', 'r', 'r', 'r', 'r', 'r'), + col.names=c('Division', + 'Med. 
IF', + '% On', '% Off', + 'Cites On', 'Cites Off', + 'Diff.', '% Adv.')) +} diff --git a/acadcites/acadcites.Rproj b/acadcites/acadcites.Rproj new file mode 100644 index 0000000..a4f37b6 --- /dev/null +++ b/acadcites/acadcites.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: No + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/acadcites/data/._available_online.sql b/acadcites/data/._available_online.sql new file mode 100644 index 0000000..7ad2b20 Binary files /dev/null and b/acadcites/data/._available_online.sql differ diff --git a/acadcites/data/._papers.csv.gz b/acadcites/data/._papers.csv.gz new file mode 100644 index 0000000..16fbd20 Binary files /dev/null and b/acadcites/data/._papers.csv.gz differ diff --git a/acadcites/inst/extdata/arc_divisions.csv b/acadcites/inst/extdata/arc_divisions.csv new file mode 100644 index 0000000..38841b8 --- /dev/null +++ b/acadcites/inst/extdata/arc_divisions.csv @@ -0,0 +1 @@ +code,division 01,Mathematical Sciences 02,Physical Sciences 03,Chemical Sciences 04,Earth Sciences 05,Environmental Sciences 06,Biological Sciences 07,Agricultural and Veterinary Sciences 08,Information and Computing Sciences 09,Engineering 10,Technology 11,Medical and Health Sciences 12,Built Environment and Design 13,Education 14,Economics 15,"Commerce, Management, Tourism and Services" 16,Studies in Human Society 17,Psychology and Cognitive Sciences 18,Law and Legal Studies 19,Studies in Creative Arts and Writing 20,"Language, Communication and Culture" 21,History and Archaeology 22,Philosophy and Religious Studies \ No newline at end of file diff --git a/acadcites/inst/extdata/era_2012_journal_list.csv.gz b/acadcites/inst/extdata/era_2012_journal_list.csv.gz new file mode 100644 index 0000000..c1957e8 Binary files /dev/null and b/acadcites/inst/extdata/era_2012_journal_list.csv.gz differ diff --git a/acadcites/inst/extdata/papers.csv.gz b/acadcites/inst/extdata/papers.csv.gz new file mode 100644 index 0000000..ec26b3f Binary files /dev/null and b/acadcites/inst/extdata/papers.csv.gz differ diff --git a/acadcites/man/addDivisions.Rd b/acadcites/man/addDivisions.Rd new file mode 100644 index 0000000..35dea81 --- /dev/null +++ b/acadcites/man/addDivisions.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/research_divisions.R +\name{addDivisions} +\alias{addDivisions} +\title{Append columns of research division dummy variables to the citations + data frame.} +\usage{ +addDivisions(cites_df) +} +\arguments{ +\item{cites_df}{The citations data frame.} +} +\value{ +The original citations dataframe with the division columns added. +} +\description{ +One column is added for each division represented by any article in +the citations data frame. The columns have 0/1 values, indicating which +articles are in that research division. 
+} + diff --git a/acadcites/man/compareByImpactFactorBuckets.Rd b/acadcites/man/compareByImpactFactorBuckets.Rd new file mode 100644 index 0000000..20ebbc6 --- /dev/null +++ b/acadcites/man/compareByImpactFactorBuckets.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/bucket_analysis.R +\name{compareByImpactFactorBuckets} +\alias{compareByImpactFactorBuckets} +\title{Compare on- and off-Academia citations within years and quantile groups +of journal impact factors.} +\usage{ +compareByImpactFactorBuckets(cites_df, summarizer = mean, comparator = `/`, + ...) +} +\arguments{ +\item{cites_df}{A dataframe with citations and impact factors.} + +\item{summarizer}{(default mean) A function to summarize citations within groups.} + +\item{comparator}{(default `/`) A function with arguments (on, off), that compares +on and off-Academia citation summaries. The default computes the on/off ratio.} + +\item{...}{Extra parameters to `quantileBuckets`} +} +\value{ +A dataframe with statistic by year, impact factor group, and on/off-source. +} +\description{ +Compare on- and off-Academia citations within years and quantile groups +of journal impact factors. +} + diff --git a/acadcites/man/importData.Rd b/acadcites/man/importData.Rd new file mode 100644 index 0000000..275c5a3 --- /dev/null +++ b/acadcites/man/importData.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/import_data.R +\name{importData} +\alias{importData} +\title{Import and combine paper citations, paper downloads, and journal information.} +\usage{ +importData(cites_path = NULL, journals_path = NULL, ...) +} +\arguments{ +\item{cites_path}{A path to a file with paper citation data for papers both +on and off Academia,} + +\item{journals_path}{A path to a file with data on academic journals.} + +\item{...}{Other arguments to import and join functions.} +} +\value{ +A dataframe of citation, download, and journal data merged together. +} +\description{ +Import and combine paper citations, paper downloads, and journal information. +} + diff --git a/acadcites/man/makeFigure.Rd b/acadcites/man/makeFigure.Rd new file mode 100644 index 0000000..1453e82 --- /dev/null +++ b/acadcites/man/makeFigure.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/figures.R +\name{makeFigure} +\alias{makeFigure} +\title{Function to generate figures from the paper.} +\usage{ +makeFigure(n, cites_df, ...) +} +\arguments{ +\item{n}{Figure caption number.} + +\item{cites_df}{A data frame with article citations and journal data, as produced by `importData`.} + +\item{...}{Optional arguments passed to figure functions.} +} +\value{ +Nothing. Renders a plot. +} +\description{ +Recreate a figure with a given citations dataset by specifying the table's +caption number in the paper. 
+} + diff --git a/acadcites/man/makeFormula.Rd b/acadcites/man/makeFormula.Rd new file mode 100644 index 0000000..1918f93 --- /dev/null +++ b/acadcites/man/makeFormula.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/run_r_models.R +\name{makeFormula} +\alias{makeFormula} +\title{Create a formula describing a regression of citations on covariates.} +\usage{ +makeFormula(df, model = c("linear", "logistic", "negbin", "zinb"), + division_interactions = TRUE) +} +\arguments{ +\item{df}{A dataframe with citations data, and divisions as created by +`importData` followed by `addDivisions`} + +\item{model}{One of 'linear', 'logistic', 'negbin', or 'zinb', the last for +zero-inflated negative binomial model.} + +\item{division_interactions}{If TRUE (default), interact each division dummy +with the on-Academia dummy.} +} +\value{ +An R formula. +} +\description{ +Creates a formula of the form `y ~ ...` where `y` is the citation +count variable, possibly transformed, and `...` are covariates and their +interactions. +} +\details{ +The linear model log-transforms the citations variable, while the negative +binomial and ZINB models do not. The logistic model transforms citations +into a 0/1 variable where 1 indicates the article received at least one +citation. +} + diff --git a/acadcites/man/makeTable.Rd b/acadcites/man/makeTable.Rd new file mode 100644 index 0000000..3a217f4 --- /dev/null +++ b/acadcites/man/makeTable.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/tables.R +\name{makeTable} +\alias{makeTable} +\title{Function to generate tables from the paper.} +\usage{ +makeTable(n, cites_df, ...) +} +\arguments{ +\item{n}{Table caption number.} + +\item{cites_df}{A data frame with article citations and journal data, as produced by `importData`.} + +\item{...}{Optional arguments passed to table functions.} +} +\value{ +Nothing. Prints a table, usually in markdown, but possibly plain text or LaTeX. +} +\description{ +Recreate a table, in markdown format with a given citations +dataset by specifying the table's caption number in the paper. +} + diff --git a/acadcites/man/meltOnSets.Rd b/acadcites/man/meltOnSets.Rd new file mode 100644 index 0000000..0e27517 --- /dev/null +++ b/acadcites/man/meltOnSets.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/set_melting.R +\name{meltOnSets} +\alias{meltOnSets} +\title{Repeat rows of a dataframe for each element in a set.} +\usage{ +meltOnSets(df, col, prefix, ...) +} +\arguments{ +\item{df}{A dataframe} + +\item{col}{A column in dataframe that is a character vector containing string +representations of sets.} + +\item{prefix}{A string to name the column of set elements} + +\item{...}{Other arguments to pass to `setParse`.} +} +\value{ +A 'molten' dataframe. +} +\description{ +Repeat rows of a dataframe for each element in a set. 
+} +\examples{ +mydf <- data.frame(x=c(1, 2), ids=c('{1,2,3}', '{5, 10}')) +meltOnSets(mydf, 'ids', prefix='id', cast_to='integer') +# x ids variable id +# 1 1 {1,2,3} id_1 1 +# 2 2 {5, 10} id_1 5 +# 3 1 {1,2,3} id_2 2 +# 4 2 {5, 10} id_2 10 +# 5 1 {1,2,3} id_3 3 +} + diff --git a/acadcites/man/paperAgeYears.Rd b/acadcites/man/paperAgeYears.Rd new file mode 100644 index 0000000..4211227 --- /dev/null +++ b/acadcites/man/paperAgeYears.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/import_data.R +\name{paperAgeYears} +\alias{paperAgeYears} +\title{Estimate the age of a paper in years.} +\usage{ +paperAgeYears(date_collected, published_year) +} +\arguments{ +\item{date_collected}{A Date vector of dates when the articles citations +were recorded.} + +\item{published_year}{An integer vector of articles' publication years.} +} +\value{ +A numeric vector of years between publication and citation +collection for each article. +} +\description{ +Since we only have data on publication year, we approximate an article's +publication date as June 30 of its publication year. +} +\examples{ +colln_dates = as.Date(c('2014-06-19', '2014-05-18', '2014-07-03')) +pub_years = c(2010, 2011, 2012) +paperAgeYears(colln_dates, pub_years) +} + diff --git a/acadcites/man/plotByImpactFactorBuckets.Rd b/acadcites/man/plotByImpactFactorBuckets.Rd new file mode 100644 index 0000000..b8658e4 --- /dev/null +++ b/acadcites/man/plotByImpactFactorBuckets.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/bucket_analysis.R +\name{plotByImpactFactorBuckets} +\alias{plotByImpactFactorBuckets} +\title{Boxplots on- and off-Academia citations within years and quantile groups +of journal impact factors.} +\usage{ +plotByImpactFactorBuckets(cites_df, ...) +} +\arguments{ +\item{cites_df}{A dataframe with citations and impact factors.} + +\item{...}{Extra parameters to `quantileBuckets`} +} +\value{ +A ggplot2 plot. +} +\description{ +Boxplots on- and off-Academia citations within years and quantile groups +of journal impact factors. +} + diff --git a/acadcites/man/plotLogScale.Rd b/acadcites/man/plotLogScale.Rd new file mode 100644 index 0000000..262d530 --- /dev/null +++ b/acadcites/man/plotLogScale.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2 (4.0.2.9000): do not edit by hand +% Please edit documentation in R/plot_utils.R +\name{plotLogScale} +\alias{plotLogScale} +\title{Scale a plot's axis or axes using a x->log(1+x) transformation.} +\usage{ +plotLogScale(plot, xy = c("x", "y", c("x", "y"))) +} +\arguments{ +\item{plot}{A ggplot2 plot with a defined aesthetic mapping} + +\item{xy}{Axes to scale: 'x', 'y' or c('x', 'y')} +} +\value{ +A new ggplot2 plot with scaled axes. The axis labels +will still reflect the raw data. +} +\description{ +Scale a plot's axis or axes using a x->log(1+x) transformation. 
diff --git a/acadcites/man/plotLogScale.Rd b/acadcites/man/plotLogScale.Rd
new file mode 100644
index 0000000..262d530
--- /dev/null
+++ b/acadcites/man/plotLogScale.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/plot_utils.R
+\name{plotLogScale}
+\alias{plotLogScale}
+\title{Scale a plot's axis or axes using an x -> log(1+x) transformation.}
+\usage{
+plotLogScale(plot, xy = c("x", "y", c("x", "y")))
+}
+\arguments{
+\item{plot}{A ggplot2 plot with a defined aesthetic mapping}
+
+\item{xy}{Axes to scale: 'x', 'y' or c('x', 'y')}
+}
+\value{
+A new ggplot2 plot with scaled axes. The axis labels
+will still reflect the raw data.
+}
+\description{
+Scale a plot's axis or axes using an x -> log(1+x) transformation.
+}
+
diff --git a/acadcites/man/quantileBuckets.Rd b/acadcites/man/quantileBuckets.Rd
new file mode 100644
index 0000000..f0d7a28
--- /dev/null
+++ b/acadcites/man/quantileBuckets.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/bucket_analysis.R
+\name{quantileBuckets}
+\alias{quantileBuckets}
+\title{Group a variable into buckets based on its quantiles or those of another
+variable.}
+\usage{
+quantileBuckets(x_quantile, x_bucket = x_quantile, nbuckets = 10,
+  probs = NULL)
+}
+\arguments{
+\item{x_quantile}{The variable to calculate quantile buckets from.}
+
+\item{x_bucket}{The variable to collect into the quantile buckets.}
+
+\item{nbuckets}{The number of quantile buckets to use. Specify this *or*
+`probs`, but not both.}
+
+\item{probs}{The vector of probabilities for the quantile bucket cut points.
+Specify this *or* `nbuckets`, but not both.}
+}
+\value{
+A factor vector corresponding to `x_bucket` with bucket ranges.
+If an element of `x_bucket` is outside of the range of `x_quantile`, its
+bucket will be NA.
+}
+\description{
+Group a variable into buckets based on its quantiles or those of another
+variable.
+}
+
diff --git a/acadcites/man/runLinearModel.Rd b/acadcites/man/runLinearModel.Rd
new file mode 100644
index 0000000..92962c0
--- /dev/null
+++ b/acadcites/man/runLinearModel.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/run_r_models.R
+\name{runLinearModel}
+\alias{runLinearModel}
+\title{Fit a linear model of citations using lm.}
+\usage{
+runLinearModel(df, ...)
+}
+\arguments{
+\item{df}{A dataframe with citations data, and
+divisions as created by `importData` followed by
+`addDivisions`}
+
+\item{...}{Extra arguments to `lm`}
+}
+\value{
+An `lm` object.
+}
+\description{
+Fit a linear model of citations using lm.
+}
+
diff --git a/acadcites/man/runLogisticModel.Rd b/acadcites/man/runLogisticModel.Rd
new file mode 100644
index 0000000..5523a8b
--- /dev/null
+++ b/acadcites/man/runLogisticModel.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/run_r_models.R
+\name{runLogisticModel}
+\alias{runLogisticModel}
+\title{Fit a logistic model of citations using glm.}
+\usage{
+runLogisticModel(df, ...)
+}
+\arguments{
+\item{df}{A dataframe with citations data, and
+divisions as created by `importData` followed by
+`addDivisions`}
+
+\item{...}{Extra arguments to `glm`.}
+}
+\value{
+A fitted GLM object.
+}
+\description{
+Fit a logistic model of citations using glm.
+}
+
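`quantileBuckets` is likewise documented without an example; a self-contained sketch on simulated data (note the explicit `nbuckets = NULL` in the second call, since `nbuckets` and `probs` must not both be supplied):

```r
library(acadcites)

set.seed(1)
x <- rexp(1000)  # stand-in for a skewed impact-factor distribution
y <- rexp(200)

# Decile buckets of x itself (the default nbuckets = 10).
table(quantileBuckets(x))

# Bucket y using quartile cut points computed from x; values of y that fall
# outside the range of x come back as NA.
quantileBuckets(x, y, nbuckets = NULL, probs = seq(0, 1, 0.25))
```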
diff --git a/acadcites/man/runNegBinModel.Rd b/acadcites/man/runNegBinModel.Rd
new file mode 100644
index 0000000..7400fd1
--- /dev/null
+++ b/acadcites/man/runNegBinModel.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/run_r_models.R
+\name{runNegBinModel}
+\alias{runNegBinModel}
+\title{Fit a negative binomial model of citations using glm.nb.}
+\usage{
+runNegBinModel(df, ...)
+}
+\arguments{
+\item{df}{A dataframe with citations data, and
+divisions as created by `importData` followed by
+`addDivisions`}
+
+\item{...}{Extra arguments to `MASS::glm.nb`}
+}
+\value{
+A fitted GLM object.
+}
+\description{
+Fit a negative binomial model of citations using `MASS::glm.nb`.
+}
+
diff --git a/acadcites/man/runZeroInflNegBinGLM.Rd b/acadcites/man/runZeroInflNegBinGLM.Rd
new file mode 100644
index 0000000..ac1c595
--- /dev/null
+++ b/acadcites/man/runZeroInflNegBinGLM.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/run_r_models.R
+\name{runZeroInflNegBinGLM}
+\alias{runZeroInflNegBinGLM}
+\title{Fit a Zero Inflated Negative Binomial model of
+citations.}
+\usage{
+runZeroInflNegBinGLM(df, ...)
+}
+\arguments{
+\item{df}{A dataframe with citations data, and
+divisions as created by `importData` followed by
+`addDivisions`}
+
+\item{...}{Extra arguments to `pscl::zeroinfl`}
+}
+\value{
+A fitted `zeroinfl` object.
+}
+\description{
+Uses `pscl::zeroinfl` to fit the model. For the parameters' initial guesses,
+we use the coefficients from a logistic regression
+(for the 'zero' parameters) and from a negative binomial regression
+(for the 'count' parameters).
+}
+
diff --git a/acadcites/man/selectDivisionCols.Rd b/acadcites/man/selectDivisionCols.Rd
new file mode 100644
index 0000000..797dc06
--- /dev/null
+++ b/acadcites/man/selectDivisionCols.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/research_divisions.R
+\name{selectDivisionCols}
+\alias{selectDivisionCols}
+\title{Select the FoR division data columns from a data frame}
+\usage{
+selectDivisionCols(df, pattern = "Div\\\\.")
+}
+\arguments{
+\item{df}{A dataframe with division columns.}
+
+\item{pattern}{(default "Div\\.") A regex pattern
+for detecting division column names.}
+}
+\value{
+A dataframe of only the division columns.
+}
+\description{
+Select the FoR division data columns from a data frame
+}
+
diff --git a/acadcites/man/setParse.Rd b/acadcites/man/setParse.Rd
new file mode 100644
index 0000000..d4f1de9
--- /dev/null
+++ b/acadcites/man/setParse.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/set_melting.R
+\name{setParse}
+\alias{setParse}
+\title{Parse a vector of string representations of sets into a data frame}
+\usage{
+setParse(set_str, cast_to = NULL, prefix = "v")
+}
+\arguments{
+\item{set_str}{A character vector of string representations of sets.}
+
+\item{cast_to}{The name of a data type to cast the elements of the set to.}
+
+\item{prefix}{A prefix to name columns in the data frame (default 'v').}
+}
+\value{
+A data frame with as many columns as the largest set's cardinality.
+}
+\description{
+Parse a vector of string representations of sets into a data frame
+}
+\examples{
+setParse(c('{1, 2, 3}', '{2, 3}'), 'integer')
+# v1 v2 v3
+# 1 1 2 3
+# 2 2 3 NA
+setParse(c('{a, b}', '{}'))
+# v1 v2
+# 1 a b
+}
+
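The model runners documented above share a single `(df, ...)` interface; a sketch of fitting the four specifications side by side, assuming the same `importData()`/`addDivisions()` preparation and that extra arguments are simply passed through to the underlying fitters:

```r
library(acadcites)
cites <- addDivisions(importData())  # assumed preparation, per the argument docs

fit_ols   <- runLinearModel(cites)        # lm on log-transformed citations
fit_logit <- runLogisticModel(cites)      # glm on a cited / not-cited indicator
fit_nb    <- runNegBinModel(cites)        # MASS::glm.nb on raw counts
fit_zinb  <- runZeroInflNegBinGLM(cites)  # pscl::zeroinfl with negbin counts
```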
diff --git a/acadcites/man/summarizeOverBuckets.Rd b/acadcites/man/summarizeOverBuckets.Rd
new file mode 100644
index 0000000..4490993
--- /dev/null
+++ b/acadcites/man/summarizeOverBuckets.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/bucket_analysis.R
+\name{summarizeOverBuckets}
+\alias{summarizeOverBuckets}
+\title{Average results over buckets, weighting by the number of on-Academia
+articles in the bucket.}
+\usage{
+summarizeOverBuckets(cites_df)
+}
+\arguments{
+\item{cites_df}{A dataframe of citations with impact factors.}
+}
+\value{
+A dataframe of weighted average results by year.
+}
+\description{
+Average results over buckets, weighting by the number of on-Academia
+articles in the bucket.
+}
+
diff --git a/acadcites/man/toSetStr.Rd b/acadcites/man/toSetStr.Rd
new file mode 100644
index 0000000..b0fd981
--- /dev/null
+++ b/acadcites/man/toSetStr.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2 (4.0.2.9000): do not edit by hand
+% Please edit documentation in R/set_melting.R
+\name{toSetStr}
+\alias{toSetStr}
+\title{Combine columns of a dataframe into a string set representation.}
+\usage{
+toSetStr(df)
+}
+\arguments{
+\item{df}{A data frame whose columns will be combined into a set.}
+}
+\value{
+A character vector of set string representations, one per row of `df`.
+}
+\description{
+Note that the uniqueness of the set is not ensured in this
+function. If a row of the input data frame has the same data
+in multiple columns, the resulting set representation will
+have non-unique elements.
+}
+\examples{
+myDF <- data.frame(x=c(1,2,3), y=c('a', 'b', 'c'))
+toSetStr(myDF) # {'1', 'a'} {'2', 'b'} {'3', 'c'}
+}
+
diff --git a/acadcites/tests/testthat.R b/acadcites/tests/testthat.R
new file mode 100644
index 0000000..bbc95c0
--- /dev/null
+++ b/acadcites/tests/testthat.R
@@ -0,0 +1,4 @@
+library('testthat')
+library('acadcites')
+
+test_check("acadcites")
diff --git a/acadcites/tests/testthat/test_set_functions.R b/acadcites/tests/testthat/test_set_functions.R
new file mode 100644
index 0000000..e1b2f5b
--- /dev/null
+++ b/acadcites/tests/testthat/test_set_functions.R
@@ -0,0 +1,41 @@
+context('Manipulating string set representations.')
+
+
+test_that('setParse returns expected output', {
+    expect_equal(setParse(c('{1, 2, 3}', '{a, b}')),
+                 data.frame(v1=c('1', 'a'),
+                            v2=c('2', 'b'),
+                            v3=c('3', NA), stringsAsFactors=FALSE)) })
+
+
+test_that('setParse casts to types as expected.', {
+    expect_equal(setParse(c('{1, 2, 3}', '{10, 100, 150}'), 'integer'),
+                 data.frame(v1=c(1,10),
+                            v2=c(2,100),
+                            v3=c(3,150)))
+})
+
+test_that('setParse removes quotes from strings.', {
+    expect_equal(setParse(c('{"Hello World", Quux}', '{Baz, "Foo Bar"}')),
+                 data.frame(v1=c('Hello World', 'Baz'),
+                            v2=c('Quux', 'Foo Bar'),
+                            stringsAsFactors=FALSE))
+
+    expect_equal(setParse(c('{\'Hello World\', Quux}', '{Baz, \'Foo Bar\'}')),
+                 data.frame(v1=c('Hello World', 'Baz'),
+                            v2=c('Quux', 'Foo Bar'),
+                            stringsAsFactors=FALSE))
+})
+
+test_that('setParse ignores commas inside sets.', {
+    expect_equal(setParse(c('{"Hello, World", Quux}', '{Baz, "Foo, Bar"}')),
+                 data.frame(v1=c('Hello, World', 'Baz'),
+                            v2=c('Quux', 'Foo, Bar'),
+                            stringsAsFactors=FALSE))
+
+    expect_equal(setParse(c('{\'Hello, World\', Quux}', '{Baz, \'Foo, Bar\'}')),
+                 data.frame(v1=c('Hello, World', 'Baz'),
+                            v2=c('Quux', 'Foo, Bar'),
+                            stringsAsFactors=FALSE))
+})
+
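A sketch of running this test suite locally, assuming `testthat` (and optionally `devtools`) is installed and the working directory is the `acadcites/` package root:

```r
# From a source checkout of the package:
devtools::test()

# Or against the installed package:
testthat::test_package('acadcites')
```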
diff --git a/acadcites/vignettes/acadcites.Rmd b/acadcites/vignettes/acadcites.Rmd
new file mode 100644
index 0000000..2787595
--- /dev/null
+++ b/acadcites/vignettes/acadcites.Rmd
@@ -0,0 +1,54 @@
+---
+title: "acadcites: Academia Citation Advantage Analysis"
+author: "C. Vogel"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{acadcites: Academia Citation Advantage Analysis}
+  %\VignetteEngine{knitr::rmarkdown}
+  \usepackage[utf8]{inputenc}
+---
+
+The `acadcites` package contains the code and data used by Niyazov et al. in
+"Open Access Meets Discoverability: Citations to Articles Posted to Academia.edu."
+
+## Importing the data
+
+To import the combined article and journal data, use the function `importData`.
+This will read in two raw datasets:
+
+1. One with information about papers in the on- and off-Academia sets, including
+    - citation counts from Google Scholar
+    - an indicator of whether the article appeared to be available for free online from a non-Academia source
+    - the ISSN of its journal
+    - the impact factor of its journal
+2. One with information about journals from the Australian Research Council, including
+    - ISSN (many journals have multiple ISSNs)
+    - Fields of Research, as classified by the ANZSRC.
+
+Calling `importData()` will import these raw files, clean them, and merge them
+on ISSN. All analyses in the paper are based on this combined dataset.
+
+```{r}
+library('acadcites')
+cites <- importData()
+```
+
+## Replicating tables from the article
+The `makeTable` function can be used to reproduce any table in the article. `makeTable` is called with two arguments: a table number and a dataframe with the articles/journals dataset (as would be returned from `importData()`).
+
+For example, to reproduce Table 10 from the article:
+
+```{r}
+makeTable(10, cites)
+```
+
+
+## Replicating figures from the article
+The `makeFigure` function reproduces figures from the article. Like `makeTable`,
+it takes a figure number and a citations data frame.
+
+```{r}
+makeFigure(1, cites)
+```
+
diff --git a/acadcites_0.1.tar.gz b/acadcites_0.1.tar.gz
new file mode 100644
index 0000000..fe2a04c
Binary files /dev/null and b/acadcites_0.1.tar.gz differ
diff --git a/academia_citations.pdf b/academia_citations.pdf
new file mode 100644
index 0000000..043fc21
Binary files /dev/null and b/academia_citations.pdf differ
diff --git a/data/arc_divisions.csv b/data/arc_divisions.csv
new file mode 100644
index 0000000..38841b8
--- /dev/null
+++ b/data/arc_divisions.csv
@@ -0,0 +1 @@
+code,division 01,Mathematical Sciences 02,Physical Sciences 03,Chemical Sciences 04,Earth Sciences 05,Environmental Sciences 06,Biological Sciences 07,Agricultural and Veterinary Sciences 08,Information and Computing Sciences 09,Engineering 10,Technology 11,Medical and Health Sciences 12,Built Environment and Design 13,Education 14,Economics 15,"Commerce, Management, Tourism and Services" 16,Studies in Human Society 17,Psychology and Cognitive Sciences 18,Law and Legal Studies 19,Studies in Creative Arts and Writing 20,"Language, Communication and Culture" 21,History and Archaeology 22,Philosophy and Religious Studies
\ No newline at end of file
diff --git a/data/era_2012_journal_list.csv.gz b/data/era_2012_journal_list.csv.gz
new file mode 100644
index 0000000..c1957e8
Binary files /dev/null and b/data/era_2012_journal_list.csv.gz differ
diff --git a/data/papers.csv.gz b/data/papers.csv.gz
new file mode 100644
index 0000000..ec26b3f
Binary files /dev/null and b/data/papers.csv.gz differ
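`importData()` is the supported entry point, but the raw inputs committed under `data/` can also be inspected directly; a minimal sketch, assuming they are ordinary (gzipped) CSV files and that paths are relative to the repository root:

```r
# FoR division codes and names (codes 01-22).
divisions <- read.csv('data/arc_divisions.csv', colClasses = 'character')
head(divisions)

# read.csv reads gzipped files transparently.
journals <- read.csv('data/era_2012_journal_list.csv.gz', stringsAsFactors = FALSE)
papers   <- read.csv('data/papers.csv.gz', stringsAsFactors = FALSE)
```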