In [1]:
from bs4 import BeautifulSoup
import markdownify


def extract_text_from_html(file_path):
    """Convert an HTML file to Markdown text.

    The file is read as UTF-8 and parsed with BeautifulSoup's "lxml" parser.
    Every <style> and <script> element is removed, the remaining document is
    serialised onto a single line, and that line is handed to markdownify
    with heading_style="ATX".

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        Whatever markdownify.markdownify returns for the cleaned HTML.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, "lxml")

    # Drop styling and scripting so their contents never reach the Markdown.
    # find_all is the current spelling of the deprecated findAll alias, and a
    # plain loop replaces the list comprehension that was built only for its
    # side effects.
    for tag in soup.find_all(["style", "script"]):
        tag.extract()

    # Use a single line of html text for the markdown parser.
    # NOTE(review): newlines are deleted rather than replaced with a space, so
    # words separated only by a line break end up joined - confirm intended.
    html_content = str(soup).replace("\n", "")

    return markdownify.markdownify(html_content, heading_style="ATX")


# Convert one syllabus and save the resulting Markdown.
text = extract_text_from_html("./raw_syllabi/2021-12_532.html")

single_output_path = "./parsed_syllabi/2021-12_532.md"
with open(single_output_path, mode="w", encoding="utf-8") as markdown_file:
    markdown_file.write(text)

In [2]:
import fitz  # PyMuPDF


def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path handed to fitz.open.

    Returns:
        A single string made of each page's get_text() output, joined with no
        separator (an empty string when the document yields no pages).
    """
    with fitz.open(file_path) as pdf_document:
        page_texts = [pdf_page.get_text() for pdf_page in pdf_document]

    return "".join(page_texts)

In [11]:
# from pdfminer.high_level import extract_text


# def parse_pdf(file_path):
#     text = extract_text(file_path)
#     lines = text.split("\\\\n")
#     for i, line in enumerate(lines):
#         stripped = line.strip()
#         if stripped.isupper() and len(stripped) < 50:
#             lines[i] = f"## {stripped}"
#     return "\\\\n".join(lines)

# parse_pdf("./raw_syllabi/2023-01_652.pdf")

# with open(f"./parsed_syllabi/2021-12_532.md", "w", encoding="utf-8") as file:
#     file.write(text)

'Course Syllabus for SIADS 652: Network Analysis \n\nCourse Overview and Prerequisites \nThis course will introduce students to basic network analysis techniques, emphasizing developing programming skills \nto manipulate and analyze real network data using Python. The course includes topics such as network evolution, link \nprediction, network centrality, models of information diffusion on networks, and community structure.   \n\nThe prerequisites for SIADS 652 are:  \n\n●  SIADS 542 Supervised Learning \n●  SIADS 505 Data Manipulation \n●  SIADS 502 Math Methods for Data Science \n\nInstructors \n\nDaniel M Romero \nAssociate Professor  \nSchool of Information \nComplex Systems \nComputer Science \nEmail: drom@umich.edu \n\nCory Bilyeu \nLecturer \nSchool of Information \nEmail: cbilyeu@umich.edu \n\nYou Wu \nMaster’s Student \nSchool of Information \nEmail: uvuuview@umich.edu \n\nCourse Communication Expectations \n\nWe will use Slack for most communication related to the class. We w

In [3]:
# (term, course number) pairs for every syllabus that is available as HTML.
# Entries that are commented out are the syllabi that only exist as PDFs.
course_records = [
    ("2021-09", "505"),
    ("2021-10", "502"),
    ("2021-10", "515"),
    ("2021-11", "521"),  # Listed under week 1; has no reference in "resources"
    ("2021-12", "532"),
    # ("2022-01", "501"),  # pdf
    ("2022-01", "511"),
    ("2022-02", "522"),
    ("2022-03", "503"),
    ("2022-04", "523"),
    ("2022-05", "542"),  # Listed under week 1 materials
    ("2022-06", "543"),  # Listed under week 1 materials
    ("2022-06", "611"),
    ("2022-07", "516"),
    ("2022-08", "622"),
    ("2022-09", "593"),
    ("2022-11", "631"),
    # ("2023-01", "652"),  # pdf
    ("2023-02", "632"),
    # ("2023-02", "673"),  # Was just a link to a Google Doc, which I downloaded as a pdf
    ("2023-03", "642"),
    # ("2023-03", "643"),  # Was just a link to a Google Doc, which I downloaded as a pdf
    ("2023-04", "655"),  # Listed under week 1 materials
    ("2023-05", "524"),
    ("2023-06", "601"),
    ("2023-06", "630"),
    # ("2023-09", "696"),  # A pdf in a Google Drive folder that I no longer have access to; luckily I kept a copy that I downloaded during the class
    ("2023-11", "682"),
    ("2023-12", "688"),
    # ("2024-02", "699"),  # Was just a link to a Google Doc, which I downloaded as a pdf
]

# File identifiers of the form "<term>_<course number>", e.g. "2021-09_505".
courses = ["{}_{}".format(*record) for record in course_records]

In [4]:
import os


def extract_text(file_identifier, base_path="./raw_syllabi/"):
    """Return the Markdown for a syllabus, or a message if its HTML is missing.

    Args:
        file_identifier: File name without extension, e.g. "2021-09_505".
        base_path: Directory that is searched for "<file_identifier>.html".

    Returns:
        The result of extract_text_from_html when the HTML file exists;
        otherwise a "Course Syllabus does not exist at: ..." message string
        (no exception is raised).
    """
    html_path = os.path.join(base_path, f"{file_identifier}.html")

    # Guard clause: report the missing file as text instead of raising.
    if not os.path.exists(html_path):
        return f"Course Syllabus does not exist at: {base_path}{file_identifier}"

    # PDF fallback is currently disabled; kept for reference:
    #   pdf_path = os.path.join(base_path, f"{file_identifier}.pdf")
    #   if os.path.exists(pdf_path):
    #       return extract_text_from_pdf(pdf_path)
    return extract_text_from_html(html_path)

In [5]:
# Convert every listed course and save each result as Markdown.
# Note: extract_text returns a "does not exist" message instead of raising when
# the source HTML is missing, so that message is what ends up in the .md file.
for course in courses:
    text = extract_text(course)
    markdown_path = f"./parsed_syllabi/{course}.md"

    with open(markdown_path, mode="w", encoding="utf-8") as markdown_file:
        markdown_file.write(text)

In [6]:
# Reformat everything under ./parsed_syllabi/ with Prettier (run via npx);
# --write saves the formatted result back over each file.
!npx prettier ./parsed_syllabi/ --write

[K[?25h░░░░⠂⠂⠂⠂⠂⠂⸩ ⠼ build:prettier: sill doSerial build 8[0m[K/.npm/_locks[0m[K[0m[Knpx: installed 1 in 1.444s
parsed_syllabi/2021-09_505.md[2K[1Gparsed_syllabi/2021-09_505.md 94ms
parsed_syllabi/2021-09_505.txt[2K[1Gparsed_syllabi/2021-10_502.md[2K[1Gparsed_syllabi/2021-10_502.md 75ms
parsed_syllabi/2021-10_502.txt[2K[1Gparsed_syllabi/2021-10_515.md[2K[1Gparsed_syllabi/2021-10_515.md 58ms
parsed_syllabi/2021-10_515.txt[2K[1Gparsed_syllabi/2021-11_521.md[2K[1Gparsed_syllabi/2021-11_521.md 37ms
parsed_syllabi/2021-11_521.txt[2K[1Gparsed_syllabi/2021-12_532.md[2K[1Gparsed_syllabi/2021-12_532.md 48ms
parsed_syllabi/2021-12_532.txt[2K[1Gparsed_syllabi/2022-01_501.md[2K[1Gparsed_syllabi/2022-01_501.md 39ms
parsed_syllabi/2022-01_501.txt[2K[1Gparsed_syllabi/2022-01_511.md[2K[1Gparsed_syllabi/2022-01_511.md 41ms
parsed_syllabi/2022-01_511.txt[2K[1Gparsed_syllabi/2022-02_522.md[2K[1Gparsed_syllabi/2022-02_522.md 41ms
parsed_syllabi/2022-02_522.txt[2K[1Gp

In [14]:
# (term, course number) pairs for the syllabi that only exist as PDFs.
pdf_course_syllabi = [
    ("2022-01", "501"),  # pdf
    ("2023-01", "652"),  # pdf
    ("2023-02", "673"),  # Was just a link to a Google Doc, which I downloaded as a pdf
    ("2023-03", "643"),  # Was just a link to a Google Doc, which I downloaded as a pdf
    ("2023-09", "696"),  # A pdf in a Google Drive folder that I no longer have access to; luckily I kept a copy that I downloaded during the class
    ("2024-02", "699"),  # Was just a link to a Google Doc, which I downloaded as a pdf
]

# NOTE(review): these paths start with ./../raw_syllabi/, while the HTML cells
# read from ./raw_syllabi/ - confirm which working directory this cell expects.
pdf_syllabi = [
    "./../raw_syllabi/{}_{}.pdf".format(*record) for record in pdf_course_syllabi
]

pdf_syllabi

['./../raw_syllabi/2022-01_501.pdf',
 './../raw_syllabi/2023-01_652.pdf',
 './../raw_syllabi/2023-02_673.pdf',
 './../raw_syllabi/2023-03_643.pdf',
 './../raw_syllabi/2023-09_696.pdf',
 './../raw_syllabi/2024-02_699.pdf']

Converting PDFs to Markdown proved difficult, and it is not realistic to build a conversion script in this notebook for only the 6 files that we have to convert. Instead, the Python app "marker" was installed and run on these files; the results were manually validated for content and then formatted with "prettier". Full setup and usage instructions are at: https://github.com/VikParuchuri/marker

# Marker

Marker converts PDF, EPUB, and MOBI to markdown.

## Mac

- Install system requirements from `scripts/install/brew-requirements.txt`
- Set the tesseract data folder path
  - Find the tesseract data folder `tessdata` with `brew list tesseract`
  - Create a `local.env` file in the root `marker` folder with `TESSDATA_PREFIX=/path/to/tessdata` inside it
- Install python requirements
  - `poetry install`
  - `poetry shell` to activate your poetry venv

# Usage

First, some configuration.  Note that settings can be overridden with env vars, or in a `local.env` file in the root `marker` folder.

- Your torch device will be detected automatically, but you can also set it manually.  For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. `cpu` is the default.
  - If using GPU, set `INFERENCE_RAM` to your GPU VRAM (per GPU).  For example, if you have 16 GB of VRAM, set `INFERENCE_RAM=16`.
  - Depending on your document types, marker's average memory usage per task can vary slightly.  You can configure `VRAM_PER_TASK` to adjust this if you notice tasks failing with GPU out of memory errors.
- Inspect the other settings in `marker/settings.py`.  You can override any settings in the `local.env` file, or by setting environment variables.
  - By default, the final editor model is off.  Turn it on with `ENABLE_EDITOR_MODEL=true`.
  - By default, marker will use ocrmypdf for OCR, which is slower than base tesseract, but higher quality.  You can change this with the `OCR_ENGINE` setting.

## Convert a single file

Run `convert_single.py`, like this:

```
python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10
```

- `--parallel_factor` is how much to increase batch size and parallel OCR workers by.  Higher numbers will take more VRAM and CPU, but process faster.  Set to 1 by default.
- `--max_pages` is the maximum number of pages to process.  Omit this to convert the entire document.

Make sure the `DEFAULT_LANG` setting is set appropriately for your document.

## Convert multiple files

Run `convert.py`, like this:

```
python convert.py /path/to/input/folder /path/to/output/folder --workers 10 --max 10 --metadata_file /path/to/metadata.json --min_length 10000
```

- `--workers` is the number of pdfs to convert at once.  This is set to 1 by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Parallelism will not increase beyond `INFERENCE_RAM / VRAM_PER_TASK` if you're using GPU.
- `--max` is the maximum number of pdfs to convert.  Omit this to convert all pdfs in the folder.
- `--metadata_file` is an optional path to a json file with metadata about the pdfs.  If you provide it, it will be used to set the language for each pdf.  If not, `DEFAULT_LANG` will be used. The expected format of this file is documented in the marker README (linked above).
- `--min_length` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing.  If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images, which slows everything down.