# DSM PDF to Text

In order to vectorize the data in the DSM, we're going to extract the data into markdown and try to chunk it in a meaningful way.

## Install Libraries


In [1]:
# Install the notebook's dependencies listed in requirements.txt
# (%pip, rather than !pip, targets the running kernel's environment).
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Extract DSM.pdf

Extract the PDF text into Markdown. It would be better to get a real database copy of this if one exists.

In [2]:
import os

# Build the markdown export only when it is missing, so the notebook can be
# run from a fresh kernel on a fresh checkout while an existing
# ./datasets/dsm.md is reused as a cache. Delete that file to force a re-export.
# NOTE(review): this replaces a hard-coded `if False:` guard, presumably added
# because the conversion is expensive - confirm.
if not os.path.exists("./datasets/dsm.md"):
    # Imported lazily: the package is only needed when regenerating the export.
    import pymupdf4llm
    plain_text = pymupdf4llm.to_markdown("./datasets/DSM-5-TR.pdf")

    with open("./datasets/dsm.md", "w", encoding='utf-8') as f:
        f.write(plain_text)


In [3]:
# Load the markdown export of the DSM into memory as a single string.
content = None  # placeholder until the read below succeeds
with open('./datasets/dsm.md', mode='r', encoding='utf-8') as file:
    content = file.read()

## Markdown to HTML
Doing this gives us a better ability to "query" the document.

In [4]:
from markdown import markdown
from bs4 import BeautifulSoup
import hashlib

# Render the markdown text to an HTML string so that sections can be located
# by heading tag with BeautifulSoup (see extract_section / extract below).
html = markdown(content)

# hash_object = hashlib.md5(b'Hello World')
# print(hash_object.hexdigest())

def extract_section(html, level: str, section_title: str):
    """Return the markup that follows the first heading matching a title.

    The HTML is parsed with the "html.parser" backend and every tag named
    ``level`` (for example "h1" or "h3") is inspected in document order. The
    first one whose whitespace-stripped text equals ``section_title`` is the
    match.

    Args:
        html: HTML markup to search.
        level: Tag name of the headings to look at.
        section_title: Exact heading text to match (compared after strip()).

    Returns:
        The serialized siblings that follow the matched heading, concatenated
        and stripped, up to (not including) the next sibling tag named
        ``level``. Only a tag with that same name ends the section, so
        headings of any other level among the siblings are included.
        Returns None when no heading matches.
    """
    document = BeautifulSoup(html, "html.parser")
    for heading in document.find_all(level):
        if heading.get_text().strip() != section_title:
            continue

        parts = []
        node = heading.next_sibling
        # Walk forward through the siblings; a falsy node (end of siblings)
        # or the next heading of the same level closes the section.
        while node:
            if node.name == level:
                break
            parts.append(str(node))
            node = node.next_sibling
        return "".join(parts).strip()
    return None

def extract(html, level: str) -> list:
    """Split HTML into one record per heading of the given level.

    The HTML is parsed with the "html.parser" backend. For every tag named
    ``level`` a record is built from the heading and the siblings that follow
    it, up to (not including) the next sibling tag named ``level``. Only a tag
    with that same name ends a section, so headings of other levels stay
    inside the section body.

    Args:
        html: HTML markup to split.
        level: Tag name of the headings that start a section (e.g. "h3").

    Returns:
        A list of dicts, in document order, each with the keys:
          - "title": the heading text exactly as returned by get_text()
            (not stripped).
          - "id": hex MD5 digest of the UTF-8 encoded title. It depends on the
            title only, so two headings with identical text share an id.
          - "section": the serialized following siblings, joined and stripped.
        The list is empty when no tag named ``level`` exists.
    """
    soup = BeautifulSoup(html, "html.parser")
    sections = []
    for heading in soup.find_all(level):
        title = heading.get_text()

        body_parts = []
        node = heading.next_sibling
        # Collect siblings until they run out or the next same-level heading.
        while node and node.name != level:
            body_parts.append(str(node))
            node = node.next_sibling

        sections.append({
            "title": title,
            "id": hashlib.md5(title.encode('utf-8')).hexdigest(),
            "section": "".join(body_parts).strip(),
        })
    return sections

## Extract the overall section; this will have subsections we may be
## interested in

# section_html = extract_disorder_section(html, "h3", "Autism Spectrum Disorder")
# section_html = extract_disorder_section(html, "h3", "Anxiety Disorders")
# section_html = extract_disorder_section(html, "h3", "Bipolar and Related Disorders")
# section_html = extract_section(html, "h3", "Depressive Disorders")
# sub_section_html = extract_section(section_html, "h4", "Unspecified Depressive Disorder")
# section_html = extract_disorder_section(html, "h3", "Schizophrenia Spectrum and Other Psychotic Disorders")

# print(BeautifulSoup(section_html).get_text())
# print(".................")
# print(BeautifulSoup(sub_section_html).get_text())
#  display(BeautifulSoup(section_html).find_all("h4"))

In [5]:
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')
# stop_words = set(stopwords.words('english'))
# tokens = word_tokenize(text.lower())
# filtered_tokens = [word for word in tokens if word not in stop_words]
# print("Filtered:", filtered_tokens)

In [7]:
import re
import json
from markdownify import markdownify as md

import os

# Where the per-section JSON files are written.
SECTIONS_DIR = "./sections"
# The first two h3 sections of SECTION II are not exported.
# NOTE(review): presumably these are front matter rather than disorder
# chapters - confirm against the document.
LEADING_SECTIONS_TO_SKIP = 2
# Typographic quotes and the en dash are replaced by their ASCII counterparts.
PUNCTUATION_TABLE = str.maketrans({"“": "\"", "”": "\"", "’": "'", "–": "-"})

section_ii = extract_section(html, "h1", "SECTION II")
if section_ii is None:
    # Fail with a clear message instead of passing None on to the HTML parser.
    raise ValueError('No h1 heading titled "SECTION II" was found in the document')
disorders = extract(section_ii, "h3")

# open(..., 'w') does not create missing directories.
os.makedirs(SECTIONS_DIR, exist_ok=True)

for i, d in enumerate(disorders):
    if i < LEADING_SECTIONS_TO_SKIP:
        continue
    # Drop paragraphs that contain only digits (presumably page numbers).
    d["section"] = re.sub(r'<p>[0-9]+<\/p>', r'', d["section"])
    d["section"] = d["section"].translate(PUNCTUATION_TABLE)
    # Convert the cleaned HTML fragment back to markdown before storing it.
    d["section"] = md(d["section"])

    # One JSON file per section, named after the section's id.
    with open(f'{SECTIONS_DIR}/{d["id"]}.json', 'w', encoding='utf-8') as file:
        print(f'{d["title"]} :: {len(d["section"])}')
        json.dump(d, file)

# with open('./datasets/section_ii.html', 'w', encoding='utf-8') as file:
#     section_ii = re.sub(r'<p>[0-9]+<\/p>', r'', section_ii)
#     section_ii = section_ii.replace("“","\"")
#     section_ii = section_ii.replace("”","\"")
#     section_ii = section_ii.replace("’","'")
#     section_ii = section_ii.replace("–","-")
#     file.write(section_ii)

# print(section_ii)
# section_html = extract_section(section_ii, "h4", "Major Depressive Disorder")
# depressive_id = hashlib.md5(b'Major Depressive Disorder')
# print(section_html)

# sub_section_html = extract_section(section_html, "h6", "Diagnostic Criteria")
# Associated Features
# Comorbidity

# text = BeautifulSoup(sub_section_html).get_text().replace("\n", " ")
# print(depressive_id.hexdigest())
# print(text)

# with open(f"{depressive_id.hexdigest()}.txt", "w") as f:
#     f.write(text)

Intellectual Developmental Disorders :: 28318
Communication Disorders :: 33350
Autism Spectrum Disorder :: 45567
Attention-Deficit/Hyperactivity Disorder :: 30609
Specific Learning Disorder :: 36862
Motor Disorders :: 48192
Other Neurodevelo mental Disorders p :: 1905
Schizophrenia Spectrum and Other Psychotic Disorders :: 123638
Catatonia :: 13071
Bipolar and Related Disorders :: 135175
Depressive Disorders :: 139932
Anxiety Disorders :: 177603
Obsessive-Compulsive and Related Disorders :: 118533
Trauma- and Stressor-Related Disorders :: 124770
Dissociative Disorders :: 76433
Somatic Symptom and Related Disorders :: 79040
Feeding and Eating Disorders :: 96282
Elimination Disorders :: 22951
Sleep-Wake Disorders :: 83929
Breathing-Related Sleep Disorders :: 77701
Parasomnias :: 89363
Sexual Dysfunctions :: 116011
Gender Dysphoria :: 38383
Disruptive, Impulse-Control, and Conduct Disorders :: 491096
Neurocognitive Disorders :: 222091
Personality Disorders :: 17569
Cluster A Personality D

"Diagnostic Criteria"

In [8]:
import pandas as pd
import numpy as np
import mq
# Sanity check: number of characters in the raw markdown text.
print(len(content))

# Run mq's '.h1' query over the markdown and show the resulting values
# (the recorded output lists the document's level-1 headings).
h1_values = mq.run('.h1', content, None).values
display(h1_values)

3667715


['# DIAGNOSTIC AND STATISTICAL MANUAL OF MENTAL DISORDERS',
 '# DSM-5-TR™',
 '# DIAGNOSTIC AND STATISTICAL MANUAL OF MENTAL DISORDERS',
 '# DSM-5-TR™',
 '# **SECTION I**',
 '# **SECTION II**',
 '# **SECTION III**',
 '# **APPENDIX**']