In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import os
import random

def show_df_as_scrolling_table(df):
    # Puts the scrollbar next to the DataFrame
    # https://stackoverflow.com/questions/42724327/pandas-dataframe-table-vertical-scrollbars
    from IPython.core.display import HTML
    display(HTML("<div style='height: 200px; overflow: auto; width: fit-content'>" +
                df.to_html() +
                "</div>"))


In [10]:
PARAGRAPH_FILE = "paragraph_data.xlsx"

if os.path.exists(PARAGRAPH_FILE):
    selected_articles = pd.read_excel(PARAGRAPH_FILE)
else:
    import psycopg2
    from sqlalchemy import create_engine, text

    import json


    with open("credentials.json", 'r') as fh:
        pmc_credentials = json.load(fh)

    connection_string = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}'.format(**pmc_credentials)

    engine = create_engine(connection_string)
    Q = r"""
    with
    ji0 as (
        select *, substring(year from 1 for 4) y2 from journal_issue
        where year ~ '^\d\d\d\d'
        and substring(year from 1 for 4) = year
    )
    ,
    ji as (
        select *, y2::int year_int from ji0
    )
    ,
    pcounts as (
        select pmid, count(*) num_paragraphs from paragraph group by pmid
    )
    ,
    articles as (
        select distinct d.pmid
            from descriptor d
            join descriptor_detail dd on d.descriptor_id = dd.id
            join ji on d.pmid = ji.pmid
            join pcounts on pcounts.pmid = d.pmid
            where dd.name = 'Drug Therapy'
            and ji.year_int > 2000
            and pcounts.num_paragraphs > 10
        limit 300
    )
    select p.pmid, p.paragraph_number, p.section_path, p.text 
    from paragraph p join articles a on p.pmid = a.pmid
    where length(p.text) > 1
    order by pmid, paragraph_number;
    """

    selected_articles = pd.read_sql(Q, con=engine)
    selected_articles.to_excel(PARAGRAPH_FILE)


selected_articles
# 

Unnamed: 0.1,Unnamed: 0,pmid,paragraph_number,section_path,text
0,0,11953888,0,Title,How to calculate the dose of chemotherapy
1,1,11953888,1,Abstract,Body surface area-dosing does not account for ...
2,2,11953888,2,Abstract,"British Journal of Cancer (2002) 86, 1297–1302..."
3,3,11953888,3,Abstract,© 2002 Cancer Research UK
4,4,11953888,4,,Despite the recent advances in anticancer trea...
...,...,...,...,...,...
11522,11522,32702853,39,Author contributions,Funding acquisition: Xu Zhou
11523,11523,32702853,40,Author contributions,Supervision: Weifeng Zhu
11524,11524,32702853,41,Author contributions,Methodology: Weifeng Zhu
11525,11525,32702853,42,Author contributions,"Writing – original draft: Shuqing Li, Jianrong..."


In [3]:
show_df_as_scrolling_table( selected_articles[ [ r'\usepackage' in x for x in selected_articles['text']] ] )

Unnamed: 0.1,Unnamed: 0,pmid,paragraph_number,section_path,text
5235,5235,26699711,16,Availability of medicines by therapeutic category,"The medicines were classified into therapeutic categories. A drug was considered available if it was in stock on the day of the survey. Availability of a therapeutic category in a facility is explained by the formula: (n/N) * 100, where n is the number of medicines available within that category in a facility on the day of the survey and N is the total number of medicines within that category that should be available as per the list of medicines prepared. For a particular level of facility (say for example PHC level in Punjab), overall availability of a particular category of medicine is given by the formula:\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$$ \frac{{\displaystyle \sum {\left({\mathbf{n}}_{\mathbf{i}}\right)}^{*}\mathbf{100}}}{{\mathbf{M}}^{*}\mathbf{N}} $$\end{document}∑ni*100M*N"
5237,5237,26699711,18,Availability of medicines by therapeutic category,Overall availability of a particular category of medicine across all levels of care in a state is given by the formula:\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$$ \frac{{\displaystyle \sum \left({\mathbf{n}}_{\mathbf{i}}\right)\ast \mathbf{100}}}{{\displaystyle \sum {\mathbf{M}}_{\mathbf{i}}\ast \mathbf{N}\mathbf{i}}} $$\end{document}∑ni∗100∑Mi∗Ni
5389,5389,26818006,20,Methods || Resolving conflicts in DKEGG,"To resolve this problem, we searched all possible shortest paths and set a distance dc(r, g) from a drug target r to a disease gene g with the consideration of the conflict problem by1\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$$ {d}_c\left(r,g\right)=\frac{n_a+{n}_i}{n_a-{n}_i}\left|d\left(r,g\right)\right| $$\end{document}dcrg=na+nina−nidrg"
5391,5391,26818006,22,Methods || A score between a drug and a disease,"For each drug and disease pair, we calculated a score using drug-target interactions, distances from drug targets disease genes with consideration of conflicts, and the regulated states of disease genes. In our PDOD approach, a PDOD score between a drug R having drug targets ri and a disease G with its associated disease genes gj was computed as below:2\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$$ \begin{array}{l}\\ {}\mathrm{PDOD}\kern0.5em \mathrm{score}\left(R,G\right)=\frac{1}{n_g{n}_r}{\displaystyle \sum_{i=1}^{n_r}}{\displaystyle \sum_{j=1}^{n_g}}sgn\left({r}_i,\ {g}_j\right)\times \frac{1}{1+{\left|\frac{d_c\left({r}_i,{g}_j\right)}{\alpha}\right|}^2}\\ {}\end{array} $$\end{document}PDODscoreRG=1ngnr∑i=1nr∑j=1ngsgnri,gj×11+dcrigjα23\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$$ sgn\left({r}_i,\ {g}_j\right)=i\left({r}_i\right)\times sign\left({d}_c\left({r}_i,{g}_j\right)\right)\times s\left({g}_j\right) $$\end{document}sgnri,gj=iri×signdcrigj×sgj"
5895,5895,27143038,10,Implementation || Improved connectivity mapping algorithm,"The determination of the connection score for an instance is similar to that used by the sscMap software [7] - \n\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document} $$c(\mathbf{R}, \mathbf{s})= \frac{\sum_{i=1}^{m}R(g_{i})s(g_{i})} {\sum_{i=1}^{m}N-i+1} $$ \end{document}c(R,s)=∑i=1mR(gi)s(gi)∑i=1mN−i+1"
5898,5898,27143038,13,Implementation || Signature contribution fraction,"Although the main application of connectivity mapping is the identification of potential drugs related to the signature presented, it is often interesting from a biological perspective to identify the importance of any particular gene in the signature to a particular connection. This may, for example help provide information as to the mechanism of action of the drug, or for grouping similar drugs. As such, QUADrATiC enables the user to investigate the effects of probes in a quantitative manner. The determination of the effect of each probe on the connection strength for a treatment set is based on a value called the Contribution Fraction (CF) of the probe. First, define the “diminished score”, for the kth probe in the signature \n\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document} $$c^{*}_{k}(\mathbf{R}, \mathbf{s})= \frac{\sum_{i=1, i \neq k}^{m}R(g_{i})s(g_{i})} {\sum_{i=1}^{m}N-i+1} $$ \end{document}ck∗(R,s)=∑i=1,i≠kmR(gi)s(gi)∑i=1mN−i+1 For median set scoring, as implemented by QUADrATiC, we can then define the Contribution Fraction of the kth probe, CFk, as follows \n\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document} $${CF}_{k}=1-\frac{\widetilde{c^{*}_{k}}(\mathbf{R}, \mathbf{s})}{\widetilde{c}(\mathbf{R}, \mathbf{s})} $$ \end{document}CFk=1−ck∗~(R,s)c~(R,s) i.e. the magnitude of the difference between the median score and the median diminished score (as denoted with a tilde, ∼) for the treatment set. Calculating the CF over all probes for all treatment sets can be used to investigate the influence of the signature probes on the result set. We further normalize this within the individual reference sets (where \documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$CF^{*}_{k} = 1.0$\end{document}CFk∗=1.0 for the probe making the highest contribution to the connection score for that set) - \n\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document} $$CF^{*}_{k}=\frac{{CF}_{k}}{\max\limits_{k}({CF}_{k})} $$ \end{document}CFk∗=CFkmaxk(CFk)"
5899,5899,27143038,14,Implementation || Graphical user interface (GUI),"The GUI is developed using HTML and Javascript - providing a simple, extensible and industry- standard approach to interfacing with the user within a modern browser. The choice of javascript allows for easy use of the Bootstrap framework [21], and d3 visualisation library, [22], to provide a modern, simple interface. The GUI was designed around a simple linear workflow, which is shown, along with screenshots for the different stages of the analysis, in Fig. 3. There are six screens available to the user (Additional file 1 provides a full User Manual for the operation of QUADrATiC) - \nDefine Signatures\nFig. 3QUADrATiC Workflow and GUI. Step 1: the query signature is defined as a pair of lists of up and down-regulated Affymetrix HG-U133A probe identifiers. Step 2: an analysis run is defined by choosing the query signature, the treatment set type (grouped by drug, or by drug and cell line), and the number of random signatures to be used to estimate the p-value of the connections; the progress bar updates every 30 seconds. Step 3: View the table of significant connections. Step 4: Summary visualizations are available as a bubble chart representing the top connections, or a summary of drugs and cell lines (for treatment sets defined by drug and cell line). Step 5: View a heatmap showing the relative contribution of each probe in the signatureThe user can define, save, edit and delete signatures as lists of up and down-regulated Affymetrix HGU133A probe IDs.Start/Monitor RunsThe user can enter an identifier for an analysis, choose the signature (as defined in 1), the treatment sets to use (grouping by drug across all cell lines or, more usually, by drug and cell line), and set the number of random signatures to be used to estimate the p-value (usually 2000). The analysis can then be started and its progress monitored.Analyze ResultsThe user can view the detailed results of an analysis and perform simple filtering (significant/all, positive/negative/all, simple text filtering) and ordering (ascending/descending Z-Score).Visualize ResultsThis presents a bubble plot of the significant connections, with simple filtering options.Visualize Top30 Drug \documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$\rightarrow $\end{document}→ Cell LineThis is a dynamic and interactive visualization (for treatment sets defined by drug and cell line) of the top 30 connections (negative or positive), showing relationships between the drugs and cell lines.Signature FractionsThis provides a heat map view of the normalized Connection Fractions for up to top 100 connections, matching the specified criteria. The data may be sorted by column (alphabetically) or row (by median value). Row sorting, in particular, is useful when viewing the subset of connections for a particular drug across multiple cell lines, as it can provide further information as to which genes in the signature are affected by the action of that drug. A spreadsheet-readable Comma Separated Variable (CSV) file is also produced for the data in the displayed heatmap, and may be downloaded through the browser."
5908,5908,27143038,23,Results || Application - drug connections for a primary myelofibrosis signature,"In order to demonstrate application of QUADrATiC, it was applied to novel discovery for Myeloproliferative Neoplasms (MPN). In MPN, clonal proliferation of haematopoietic stem cells is often linked with an underlying genetic aberration [25]. In recent years a significant link has been uncovered between these conditions and a range of genes, most notably JAK2 and CALR. As a result, there is an emerging consensus that rather than being separate conditions, the three main types of BCR-ABl negative MPN, Essential Thrombocythaemia (ET), Polycythaemia Vera (PV) and Primary Myelofibrosis (PMF) are stages on a continuum of disease progression [26]. Around 10 % of PV cases (and 5 % in ET) are thought to transform to the more disabling MPN-associated Myelofibrosis (MF) [27], so any drugs which correlate to the reversal of cell states associated with this phenotype may profer a significant benefit to patients. Currently, much effort is being expended in developing JAK inhibitors (such as Ruxolitinib) as an alternative to first-line treatment options for MPN such as Hydroxyurea or Interferon alpha [28, 29].A publically-available dataset was identified, which was originally used to identify up-regulated genes in myelofibrosis [30]. This data series, GSE26049, was sourced from Gene Expression Omnibus (GEO) [31] and downloaded in normalized and background-corrected form. This series consists of whole blood expression data from 91 subjects (19 with Essential Thrombocythemia, 41 with Polycythemia Vera, 9 with Primary Myelofibrosis, 1 with Unclassified Myeloproliferative Disorder, and 21 controls). Using this data, two groups were defined: the group of subjects diagnosed with PV, and the group of subjects with PMF. The expression data for these two groups was extracted and analyzed using the limma package [32] in R to identify the significantly up and down-regulated genes. The R statistical package is freely downloadable software [33] containing many peer reviewed packages that can be used in different biological statistical analyses. The signature was created using those probes with fold change greater than two, and adjusted for a false discovery rate of 0.05, using the Benjamini-Hochberg criteria. The signature and the full set of genes and Affymetrix probe IDs is available in the Additional file 5. This signature was presented to the QUADrATiC software, and the list of significant negative connections (i.e. those which are seen to reverse the phenotype) retrieved. The top connections in the list are analysed for existing or previous use in treatment of myelofibrosis, as detailed in two recent publications [28, 34]. Presenting the signature for myelofibrosis discussed above to QUADrATiC, and calculating connections for treatments sets aggregated by Drug resulted in 899 signficant negative connections to drug/cell line treatment sets. Table 2 shows the connections found (if any) to the combined lists of FDA-approved drugs identified as being in current use for the treatment of myelofibrosis (from [28, 34]). Apart from the compounds identified in Table 2, there are 385 other compounds with significant negative connections to the myelofibrosis signature (of which 188 have negative connections in two or more cell lines). By comparison, analyzing the signature using sscMap results in 412 connections for compounds in more than one cell line, but the majority of those connections are not FDA-approved drugs. Figure 7 shows the top five compounds with negative connections to the PMF signature, along with the five genes having the highest median value of \documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$CF^{*}_{k}$\end{document}CFk∗, across all significant negative treatment sets for the same drug. Of these 5 drugs, 4 are also identified by sscMap, but Pemtrexed is not, as there are no reference profiles for that compound in the original Cmap data set, from which sscMap derives its profiles. \nAmiodarone is typically used to treat cardiac arrhythmias. Its use to treat leukemia has been investigated with some success in mice [35].\nFig. 7Top Five Negative Connections for PMF Signature. The table shows the detail of the top five negative connections found for the PMF signature discussed here. The peripheral figures show the normalized Contribution Fraction heat map for all significant negative connections to treatment sets with for each drug, as determined by QUADrATiCTable 2Existing PMF treatments (from [28, 34]) in significant connection setDrugCommentsCell linesAnagrelideSignificant negative connections foundASC, NPCAzacitidineSignificant negative connections foundVCAP, PC3, MCF7BusulfanNo significant negative connections foundCytarabineNo significant negative connections foundHA1E, PC3, A549, MCF7, VCAP(indicative negative connections found)DanazolSignificant negative connections foundPHH, ASCDecitabineNo significant negative connections foundEpoetin-alphaNo data in LINCS setEverolimusSignificant negative connections foundA549, ASC, HT29HydroxyureaNo data in LINCS setInterferon alphaNo data in LINCS setLenalidomideSignificant negative connections foundSNUC4, A375, COV644MelphalanNo significant negative connections foundHA1E(indicative negative connection found)MercaptopurineSignificant negative connection foundMCF7MethylprednisoloneSignificant negative connection foundA549PomalidomideNo data in LINCS setPrednisoneNo significant negative connections foundRuxolitinibSignificant negative connection foundHEPG2ThalidomideSignificant negative connections foundTYKNU, PHH, CL34, HCC515ThioguanineNo data in LINCS setAmiodarone has been shown to have hematological effects, and interactions with warfarin [36], so this may provide an avenue for its investigation.Pentamidine is an antiprotozoal drug, used to treat fungal infections and pneumonia. It has been investigated for potential anticancer activity, including its use in treatment of chronic myelogenous leukemia [37].One possible mechanism of action of pentamidine could lie in its inhibitory action on S100B [38], which in turn interacts with p53 [39]. There is some evidence that p53 is linked to progression of MPN [40].Azacitidine has, as mentioned earlier been used to treat myelofibrosis in a clinical context.Pemetrexed is an antifolate drug which is used in the treatment of non-small cell lung cancer [41].Pemetrexed has been shown to be an inhibitor of DHFR [42]; another DHFR inhibitor, methotrexate, has recently been shown to inhibit the JAK/STAT pathway, which is a key biological pathway in MPN [43].Fluocinonide is a glucocorticoid used as an anti-inflammatory in the treatment of eczema and other skin disorders.Prednisone, another glucocorticoid, is currently used in the treatment of PMF [34]."
6407,6407,27490093,18,Methods || Context-specific function detection,"We could obtain module similarities between GO terms of functional modules and function vectors by calculating Jaccard index:\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$$ \begin{array}{l}J\left(FM,\kern0.5em FV\right)\\ {}=\frac{\left|FM\cap \kern0.5em FV\right|}{\left|FM\cup \kern0.5em FV\right|}\\ {}FM:\kern0.5em Function\kern0.5em module,\\ {}FV:\kern0.5em Function\kern0.5em vector\end{array} $$\end{document}J(FM,FV)=|FM∩FV||FM∪FV|FM:Functionmodule,FV:Functionvector"
6409,6409,27490093,20,Methods || Regression model,"We made learning tables for performing a multiple linear regression per a drug. First, we found functional modules of cell lines. Second, we found function vectors. Third, we mapped each functional module of each cell line on function vectors. Forth, we calculated a functional module score by:\documentclass[12pt]{minimal}\n\t\t\t\t\usepackage{amsmath}\n\t\t\t\t\usepackage{wasysym} \n\t\t\t\t\usepackage{amsfonts} \n\t\t\t\t\usepackage{amssymb} \n\t\t\t\t\usepackage{amsbsy}\n\t\t\t\t\usepackage{mathrsfs}\n\t\t\t\t\usepackage{upgreek}\n\t\t\t\t\setlength{\oddsidemargin}{-69pt}\n\t\t\t\t\begin{document}$$ \mathrm{Module}\kern0.5em \mathrm{score}=\frac{{\displaystyle \sum F{C}_i}}{N}, $$\end{document}Modulescore=∑FCiN,where FCi represents fold change of gene i and N represents the number of genes in a module."


In [4]:
section_heading_data = []
for section_path_str in selected_articles['section_path'].values:
    if isinstance(section_path_str, str):
        section_path = section_path_str.split(' || ')
        for i in range(len(section_path)):
            my_text = str(section_path[i])
            if my_text == 'Title':
                my_level = 0
            else:
                my_level = i+1
            section_heading_data.append({'text': my_text, 'level': my_level})

headings_df = pd.DataFrame(section_heading_data)

In [6]:
heading_count = Counter(headings_df['text'])

headings_df['heading_count'] = [heading_count[h] for h in headings_df['text']]

show_df_as_scrolling_table(headings_df)

Unnamed: 0,text,level,heading_count
0,Title,0,300
1,Abstract,1,905
2,Abstract,1,905
3,Abstract,1,905
4,THE PROBLEM OF UNDERDOSING,1,5
5,THE PROBLEM OF UNDERDOSING,1,5
6,THE PROBLEM OF UNDERDOSING,1,5
7,THE PROBLEM OF UNDERDOSING,1,5
8,THE PROBLEM OF UNDERDOSING,1,5
9,THE SIGNIFICANCE OF UNDERDOSING,1,1


In [7]:
headings_set = dict( [ (row['text'], row['level']) for row in headings_df.to_dict(orient='records')] )
len(headings_set)  # 2816

unique_headings_df = pd.DataFrame([{'text': heading, 'level': level, 'is_heading':1} for (heading, level) in headings_set.items()])
np.max(unique_headings_df['level'])  # 4
unique_headings_df

Unnamed: 0,text,level,is_heading
0,Title,0,1
1,Abstract,1,1
2,THE PROBLEM OF UNDERDOSING,1,1
3,THE SIGNIFICANCE OF UNDERDOSING,1,1
4,PREVENTION OF UNDERDOSING,1,1
...,...,...,...
2741,Search methods for the identification of studies,2,1
2742,Managing missing data,3,1
2743,Data synthesis,3,1
2744,Assessment of the quality of evidence,3,1


In [8]:
body_text_df = pd.DataFrame([{'text': row['text'], 'level': 6, 'is_heading':0} for row in selected_articles.to_dict(orient='records')])
body_text_df

Unnamed: 0,text,level,is_heading
0,How to calculate the dose of chemotherapy,6,0
1,Body surface area-dosing does not account for ...,6,0
2,"British Journal of Cancer (2002) 86, 1297–1302...",6,0
3,© 2002 Cancer Research UK,6,0
4,Despite the recent advances in anticancer trea...,6,0
...,...,...,...
11522,Funding acquisition: Xu Zhou,6,0
11523,Supervision: Weifeng Zhu,6,0
11524,Methodology: Weifeng Zhu,6,0
11525,"Writing – original draft: Shuqing Li, Jianrong...",6,0


In [9]:
EXCEL_FILE = 'section_heading_data.xlsx'

all_df = pd.concat([unique_headings_df, body_text_df], axis=0)
all_df.to_excel(EXCEL_FILE)