In [None]:
# Mount Google Drive at /content/drive/ so that files stored under
# /content/drive/MyDrive (such as the PDF opened later on) can be read.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# NOTE(review): this repeats the mount done in the first cell (same mount point,
# only without the trailing slash) — looks redundant; confirm and consider dropping this cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.2-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.1 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.2 PyMuPDFb-1.24.1


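**Extract text from the PDF and prefix each element with an HTML-like size tag (`<h1>`, `<p>`, `<s1>`, ...)**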

In [None]:
from operator import itemgetter
import fitz
import json


def fonts(doc, granularity=False):
    """Collects every text style used in a PDF and counts how often each occurs.

    A style is identified by its font size alone, or, when ``granularity`` is
    set, by the combination of size, flags, font name and color.

    :param doc: PDF document to iterate through (an iterable of pages)
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool

    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by usage (most used first), and a dict
        mapping each identifier to the style attributes last seen for it
    :raises ValueError: if the document contains no text spans at all
    """
    style_by_key = {}
    usage = {}

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type-0 blocks carry text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        key = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        attributes = {'size': span['size'], 'flags': span['flags'],
                                      'font': span['font'], 'color': span['color']}
                    else:
                        key = "{0}".format(span['size'])
                        attributes = {'size': span['size'], 'font': span['font']}

                    # a later span with the same key overwrites the stored attributes
                    style_by_key[key] = attributes
                    usage[key] = usage.get(key, 0) + 1

    ranked = sorted(usage.items(), key=itemgetter(1), reverse=True)

    if not ranked:
        raise ValueError("Zero discriminating fonts found!")

    return ranked, style_by_key


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The most used style is treated as body text and tagged ``<p>``. Sizes larger
    than it become ``<h1>``, ``<h2>``, ... (largest first) and sizes smaller than
    it become ``<s1>``, ``<s2>``, ... (largest first).

    :param font_counts: (identifier, count) for all styles occurring in the document,
        most used first; identifiers are either a bare size ("10.0") or a compound
        key ("10.0_4_Arial_0")
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    def _size_of(identifier):
        # Prefer the recorded style: compound identifiers such as
        # "10.0_4_Arial_0" cannot be parsed with float(). Fall back to parsing
        # the identifier for callers that only registered the top style.
        style = styles.get(identifier)
        if style is not None:
            return style['size']
        return float(identifier)

    p_size = styles[font_counts[0][0]]['size']  # size of the most used style (paragraph)

    # distinct sizes, high to low, so that the running index yields h1, h2, ... then s1, s2, ...
    # (several compound identifiers may share one size, hence the set)
    font_sizes = sorted({_size_of(identifier) for identifier, _ in font_counts}, reverse=True)

    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Consecutive spans of the same font size are joined into one string that
    starts with the tag for that size. A ``|`` is appended after every line of
    a block, and a change of font size starts a new string.

    :param doc: PDF document to iterate through (an iterable of pages)
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: texts with prepended element tags
    :raises KeyError: if a span's size has no entry in ``size_tag``
    """
    header_para = []  # list with headers and paragraphs
    previous_size = None  # size of the last non-blank span; None until one is seen

    for page in doc:
        for b in page.get_text("dict")["blocks"]:  # iterate through the text blocks
            if b['type'] != 0:  # skip blocks that do not contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text collected for the current element
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # ignore whitespace-only spans
                        continue

                    if previous_size is None:
                        # very first span of the document: nothing to flush yet
                        block_string = size_tag[s['size']] + s['text']
                    elif s['size'] == previous_size:
                        if all(c == "|" for c in block_string):
                            # block_string is empty or holds only line markers, so this
                            # span opens the element: start it with the size tag.
                            # (Handled as ONE branch so the span text is not appended
                            # a second time after the reset.)
                            block_string = size_tag[s['size']] + s['text']
                        else:  # same element continues, so concatenate strings
                            block_string += " " + s['text']
                    else:
                        # font size changed: flush what was collected and start anew
                        header_para.append(block_string)
                        block_string = size_tag[s['size']] + s['text']

                    previous_size = s['size']

                # end of a line, indicating with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para


def main(document="/content/drive/MyDrive/s12874-020-01076-x.pdf", output="doc.json"):
    """Extracts tagged headers and paragraphs from a PDF and stores them as JSON.

    :param document: path of the PDF to process
    :type document: str
    :param output: path of the JSON file that receives the list of tagged texts
    :type output: str
    """
    doc = fitz.open(document)
    try:
        font_counts, styles = fonts(doc, granularity=False)

        size_tag = font_tags(font_counts, styles)

        elements = headers_para(doc, size_tag)
    finally:
        # release the PDF handle even if extraction fails
        doc.close()

    with open(output, 'w') as json_out:
        json.dump(elements, json_out)


if __name__ == '__main__':
    main()


**Inspect the JSON file written by the extraction step**

In [None]:
import json

# Read back the list of tagged elements stored in doc.json
with open("doc.json", 'r') as fh:
    content = json.loads(fh.read())

# Show every element on its own line
for item in content:
    print(item)


<h2>RESEARCH ARTICLE| Open Access|

<h1>Quality of evidence in a post-Soviet| country: evaluation of methodological| quality of controlled clinical trials published| in national journals from Uzbekistan|

<h3>Timur Aripov
<s4>1*|
<h3>, Dilfuza Aniyozova
<s4>2
<h3>and Irina Gorbunova
<s4>1|

<h4>Abstract|

<h5>Background:  Most researchers in Uzbekistan prefer to publish their reports in journals of their home country.| Moreover, the proportion of healthcare practitioners who prefer to use these national sources of information also| remains high. However, the quality of publications from national journals, in post-Soviet countries, has not been| systematically evaluated until now. The primary objective of this study was to evaluate the quality of randomized| controlled trials ’  (RCTs) reports published in medical journals from Uzbekistan. We supposed that reports had at| least minimal quality to contribute to the higher quality of healthcare.|
<h5>Methods:  To evaluate the quality of R

**Keep only elements tagged `<h…>` (larger than body text) or `<s…>` (smaller than body text)**

In [None]:
import json

# Read back the list of tagged elements stored in doc.json
with open("doc.json", 'r') as fh:
    content = json.loads(fh.read())

# Collect every element whose text contains an "<h" or "<s" marker
headers_subscripts = []
for entry in content:
    if "<h" in entry or "<s" in entry:
        headers_subscripts.append(entry)

# Show the selected elements, one per line
for item in headers_subscripts:
    print(item)


<h2>RESEARCH ARTICLE| Open Access|
<h1>Quality of evidence in a post-Soviet| country: evaluation of methodological| quality of controlled clinical trials published| in national journals from Uzbekistan|
<h3>Timur Aripov
<s4>1*|
<h3>, Dilfuza Aniyozova
<s4>2
<h3>and Irina Gorbunova
<s4>1|
<h4>Abstract|
<h5>Background:  Most researchers in Uzbekistan prefer to publish their reports in journals of their home country.| Moreover, the proportion of healthcare practitioners who prefer to use these national sources of information also| remains high. However, the quality of publications from national journals, in post-Soviet countries, has not been| systematically evaluated until now. The primary objective of this study was to evaluate the quality of randomized| controlled trials ’  (RCTs) reports published in medical journals from Uzbekistan. We supposed that reports had at| least minimal quality to contribute to the higher quality of healthcare.|
<h5>Methods:  To evaluate the quality of RCTs,

**Filter elements by section keywords**

In [None]:
import json

# Read back the list of tagged elements stored in doc.json
with open("doc.json", 'r') as fh:
    content = json.loads(fh.read())

# Section names to look for (matched case-insensitively, as substrings)
keywords = ["Introduction", "Abstract", "Result", "Background", "Conclusion", "Method", "Methodology",
            "Materials and methods", "Results and discussion", "Discussion"]

# Keep each element that has no <p> tag and mentions at least one keyword
filtered_items = [
    item
    for item in content
    if "<p>" not in item and any(keyword.lower() in item.lower() for keyword in keywords)
]

# Show the selected elements, one per line
for item in filtered_items:
    print(item)


<h1>Quality of evidence in a post-Soviet| country: evaluation of methodological| quality of controlled clinical trials published| in national journals from Uzbekistan|
<h4>Abstract|
<h5>Background:  Most researchers in Uzbekistan prefer to publish their reports in journals of their home country.| Moreover, the proportion of healthcare practitioners who prefer to use these national sources of information also| remains high. However, the quality of publications from national journals, in post-Soviet countries, has not been| systematically evaluated until now. The primary objective of this study was to evaluate the quality of randomized| controlled trials ’  (RCTs) reports published in medical journals from Uzbekistan. We supposed that reports had at| least minimal quality to contribute to the higher quality of healthcare.|
<h5>Methods:  To evaluate the quality of RCTs, we selected two journals from the list of national medical journals for| which background information was provided. We d

In [None]:
import json

# Load the content of the JSON file
with open("doc.json", 'r') as json_file:
    content = json.load(json_file)

# Define the keywords (matched case-insensitively, as substrings)
keywords = ["Introduction", "Abstract", "Result", "Background", "Conclusion", "Method", "Methodology",
            "Materials and methods", "Results and discussion", "Discussion"]

# Extract headers containing keywords, excluding paragraph and small-text elements.
# Every element starts with its tag, and small-text tags are numbered (<s1>, <s4>, ...),
# so they are recognised by the "<s" prefix — a literal "<s>" never occurs.
filtered_items = [
    item.strip()
    for item in content
    if not item.startswith(("<p>", "<s"))
    and any(keyword.lower() in item.lower() for keyword in keywords)
]

# Print the filtered items
for item in filtered_items:
    print(item)


<h1>Quality of evidence in a post-Soviet| country: evaluation of methodological| quality of controlled clinical trials published| in national journals from Uzbekistan|
<h4>Abstract|
<h5>Background:  Most researchers in Uzbekistan prefer to publish their reports in journals of their home country.| Moreover, the proportion of healthcare practitioners who prefer to use these national sources of information also| remains high. However, the quality of publications from national journals, in post-Soviet countries, has not been| systematically evaluated until now. The primary objective of this study was to evaluate the quality of randomized| controlled trials ’  (RCTs) reports published in medical journals from Uzbekistan. We supposed that reports had at| least minimal quality to contribute to the higher quality of healthcare.|
<h5>Methods:  To evaluate the quality of RCTs, we selected two journals from the list of national medical journals for| which background information was provided. We d

In [None]:
import json
import re


# Defined before first use: the filtering loop below calls it while it runs.
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    NOTE(review): the pattern also removes ordinary text that happens to sit
    between a '<' and a later '>' — confirm this is acceptable for the data.
    """
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)


# Load the content of the JSON file
with open("doc.json", 'r') as json_file:
    content = json.load(json_file)

# Define the keywords (matched case-insensitively, as substrings)
keywords = ["Introduction", "Abstract", "Result", "Background", "Conclusion", "Method", "Methodology",
            "Materials and methods", "Results and discussion", "Discussion"]

# Extract headers containing keywords, excluding paragraph and small-text elements.
# Every element starts with its tag, and small-text tags are numbered (<s1>, <s4>, ...),
# so they are recognised by the "<s" prefix — a literal "<s>" never occurs.
filtered_items = []
for item in content:
    if item.startswith(("<p>", "<s")):
        continue
    if any(keyword.lower() in item.lower() for keyword in keywords):
        # Remove the tags and append to filtered_items
        filtered_items.append(remove_html_tags(item).strip())

# Print the filtered items
for item in filtered_items:
    print(item)


NameError: name 'remove_html_tags' is not defined

In [None]:
import json
import re


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    NOTE(review): the pattern also removes ordinary text that happens to sit
    between a '<' and a later '>' — confirm this is acceptable for the data.
    """
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)


# Load the content of the JSON file
with open("doc.json", 'r') as json_file:
    content = json.load(json_file)

# Define the keywords (matched case-insensitively, as substrings)
keywords = ["Introduction", "Abstract", "Result", "Background", "Conclusion", "Method", "Methodology",
            "Materials and methods", "Results and discussion", "Discussion"]

# Extract headers containing keywords, excluding paragraph and small-text elements.
# Every element starts with its tag, and small-text tags are numbered (<s1>, <s4>, ...),
# so they are recognised by the "<s" prefix — a literal "<s>" never occurs.
filtered_items = []
for item in content:
    if item.startswith(("<p>", "<s")):
        continue
    if any(keyword.lower() in item.lower() for keyword in keywords):
        # Remove the tags and append to filtered_items
        filtered_items.append(remove_html_tags(item).strip())

# Print the filtered items
for item in filtered_items:
    print(item)


Quality of evidence in a post-Soviet| country: evaluation of methodological| quality of controlled clinical trials published| in national journals from Uzbekistan|
Abstract|
Background:  Most researchers in Uzbekistan prefer to publish their reports in journals of their home country.| Moreover, the proportion of healthcare practitioners who prefer to use these national sources of information also| remains high. However, the quality of publications from national journals, in post-Soviet countries, has not been| systematically evaluated until now. The primary objective of this study was to evaluate the quality of randomized| controlled trials ’  (RCTs) reports published in medical journals from Uzbekistan. We supposed that reports had at| least minimal quality to contribute to the higher quality of healthcare.|
Methods:  To evaluate the quality of RCTs, we selected two journals from the list of national medical journals for| which background information was provided. We decided to select