In [15]:
%pip install docx2txt


Collecting docx2txt
  Using cached docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Using cached docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9
Note: you may need to restart the kernel to use updated packages.


In [1]:
import re
import docx2txt

def count_syllables(word: str) -> int:
    """
    A simple, naive syllable counting function:
    - Converts the word to lowercase.
    - Removes a trailing 'e' to approximate silent 'e'.
    - Counts consecutive vowels (a, e, i, o, u, y) as a single syllable group.
    """
    word = word.lower().strip()
    # Remove trailing punctuation
    word = re.sub(r'[^\w\s]', '', word)
    
    # Remove trailing 'e' if it exists (approximate silent 'e')
    if word.endswith('e'):
        word = word[:-1]
    
    # Count groups of vowels (including 'y')
    vowel_groups = re.findall(r'[aeiouy]+', word)
    
    # Ensure at least one syllable
    return max(1, len(vowel_groups))

def tokenize_sentences(text: str) -> list:
    """
    Tokenize text into sentences by splitting on punctuation that commonly ends sentences.
    This approach is simplistic; consider libraries like nltk for better accuracy.
    """
    sentences = re.split(r'[.!?]+', text)
    # Remove empty or purely whitespace entries
    return [s.strip() for s in sentences if s.strip()]

def tokenize_words(text: str) -> list:
    """
    Tokenize text into words by splitting on whitespace and removing punctuation.
    Again, a simplistic approach—library-based tokenizers are more robust.
    """
    # Split on whitespace
    words = text.split()
    # Remove any leftover punctuation
    words = [re.sub(r'[^\w\s]', '', w) for w in words]
    # Filter out empty words
    return [w for w in words if w]

def flesch_reading_ease(num_words: int, num_sentences: int, num_syllables: int) -> float:
    """
    Computes the Flesch Reading Ease Score using the standard formula:
    
    Flesch Reading Ease = 206.835 – (1.015 × (words/sentences)) – (84.6 × (syllables/words))
    """
    if num_sentences == 0 or num_words == 0:
        return 0.0
    return 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)

def flesch_kincaid_grade(num_words: int, num_sentences: int, num_syllables: int) -> float:
    """
    Computes the Flesch–Kincaid Grade Level using the standard formula:
    
    Flesch–Kincaid Grade Level = 0.39 × (words/sentences) + 11.8 × (syllables/words) – 15.59
    """
    if num_sentences == 0 or num_words == 0:
        return 0.0
    return 0.39 * (num_words / num_sentences) + 11.8 * (num_syllables / num_words) - 15.59

def calculate_flesch_kincaid_scores(text: str) -> dict:
    """
    Given any text, calculate:
      - Number of sentences
      - Number of words
      - Total syllables (approx.)
      - Flesch Reading Ease
      - Flesch–Kincaid Grade Level
    
    Returns a dictionary of these measures.
    """
    sentences = tokenize_sentences(text)
    words = tokenize_words(text)
    
    num_sentences = len(sentences)
    num_words = len(words)
    
    # Count syllables across all words
    total_syllables = sum(count_syllables(word) for word in words)
    
    fre = flesch_reading_ease(num_words, num_sentences, total_syllables)
    fk_grade = flesch_kincaid_grade(num_words, num_sentences, total_syllables)
    
    return {
        "sentences": num_sentences,
        "words": num_words,
        "total_syllables": total_syllables,
        "flesch_reading_ease": round(fre, 2),
        "flesch_kincaid_grade": round(fk_grade, 2)
    }

# -----------------------------
# Example usage with a .docx file
if __name__ == "__main__":
    # Replace this path with your own file location:
    file_path = r"C:\Users\rjone\OneDrive - Waterworksai.com\PharmaDS\NewShortCourseMaterials\SampleStudyDocuments\icdv1.docx"
    
    # Extract text from the .docx file
    icd_text = docx2txt.process(file_path)
    
    # Calculate Flesch–Kincaid scores
    scores = calculate_flesch_kincaid_scores(icd_text)
    
    print("Flesch-Kincaid Scores:")
    print(f"  Number of Sentences:  {scores['sentences']}")
    print(f"  Number of Words:      {scores['words']}")
    print(f"  Total Syllables:      {scores['total_syllables']}")
    print(f"  Flesch Reading Ease:  {scores['flesch_reading_ease']}")
    print(f"  Flesch-Kincaid Grade: {scores['flesch_kincaid_grade']}")


Flesch-Kincaid Scores:
  Number of Sentences:  12
  Number of Words:      256
  Total Syllables:      484
  Flesch Reading Ease:  25.23
  Flesch-Kincaid Grade: 15.04
