In [14]:
import docx
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

######################################
# 1. Utility: Read .docx content
######################################
def read_docx(filepath: str) -> str:
    """
    Loads the .docx file at `filepath` and returns its text as one string.

    The text of every entry in the document's `paragraphs` collection is
    joined with newline characters, in document order.
    """
    document = docx.Document(filepath)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)

######################################
# 2. Sentiment Analysis Functions
######################################
def _load_with_nltk_data(loader, packages):
    """
    Calls `loader()` and returns its result. If NLTK reports a missing data
    resource (LookupError), downloads `packages` and retries exactly once.
    """
    try:
        return loader()
    except LookupError:
        # Only hit the network when the data is actually missing locally,
        # instead of contacting the NLTK download server on every call.
        for package in packages:
            nltk.download(package, quiet=True)
        return loader()


def analyze_sentiment(text: str):
    """
    Splits the text into sentences and computes sentiment scores for each sentence.
    Returns a list of tuples: (sentence, sentiment_scores), where
    sentiment_scores is the dict returned by the VADER analyzer's
    `polarity_scores` for that sentence.

    Required NLTK data is downloaded lazily, only when a lookup fails.
    """
    # Newer NLTK releases look up the 'punkt_tab' package for sentence
    # tokenization, older ones use 'punkt'; fetch both when the lookup fails.
    sentences = _load_with_nltk_data(
        lambda: nltk.sent_tokenize(text), ("punkt", "punkt_tab")
    )
    # Constructing the analyzer loads the VADER lexicon from NLTK data.
    analyzer = _load_with_nltk_data(SentimentIntensityAnalyzer, ("vader_lexicon",))

    return [(sentence, analyzer.polarity_scores(sentence)) for sentence in sentences]

def filter_sentences_by_keywords(sentences_with_scores, keywords):
    """
    Filters a list of (sentence, scores) tuples to only those sentences that
    contain at least one of the provided keywords.

    Matching is a case-insensitive substring test, so "risk" also matches
    "risks". `keywords` may be any iterable of strings, including a one-shot
    iterable such as a generator. Returns a new list of (sentence, scores)
    tuples in their original order.
    """
    # Normalize the keywords once up front: this avoids re-lowercasing them for
    # every sentence and keeps a generator from being exhausted after the
    # first sentence is checked.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    filtered = []
    for sentence, scores in sentences_with_scores:
        lowered_sentence = sentence.lower()
        if any(keyword in lowered_sentence for keyword in lowered_keywords):
            filtered.append((sentence, scores))
    return filtered

def average_sentiment(sentences_with_scores):
    """
    Returns the mean of the "neg", "neu", "pos" and "compound" scores over a
    list of (sentence, scores) tuples.

    For an empty list, a dict with every score set to 0 is returned.
    """
    metrics = ("neg", "neu", "pos", "compound")
    totals = dict.fromkeys(metrics, 0)

    n_sentences = len(sentences_with_scores)
    if n_sentences == 0:
        return totals

    # Accumulate each metric sentence by sentence, in input order.
    for _sentence, sentence_scores in sentences_with_scores:
        for metric in metrics:
            totals[metric] = totals[metric] + sentence_scores[metric]

    averaged = {}
    for metric, total in totals.items():
        averaged[metric] = total / n_sentences
    return averaged

######################################
# 3. Main Comparison Logic
######################################
def _print_filtered_report(header, label, filtered, average):
    """
    Prints `header`, then every (sentence, scores) pair in `filtered`, then the
    averaged scores for the document identified by `label`.
    """
    print(header)
    for sentence, scores in filtered:
        print("Sentence:", sentence)
        print("Sentiment Scores:", scores)
        print()
    print(f"Average Sentiment Scores for {label} (filtered):", average)


def main(reference_icd_path=None, new_icd_path=None):
    """
    Compares the sentiment of risk/benefit-related sentences in two ICD
    documents and prints per-sentence scores, filtered averages and overall
    averages for each.

    Parameters:
        reference_icd_path: path to the reference ICD .docx file; when None,
            the original hard-coded location is used.
        new_icd_path: path to the new sponsor ICD .docx file; when None, the
            original hard-coded location is used.

    Returns:
        (reference_filtered, new_filtered): the lists of (sentence, scores)
        tuples that matched the risk/benefit keywords, e.g. for plotting.
    """
    # Fall back to the original document locations when no path is supplied
    if new_icd_path is None:
        new_icd_path = r"C:\Users\rjone\OneDrive - Waterworksai.com\PharmaDS\NewShortCourseMaterials\SampleStudyDocuments\icdv1.docx"
    if reference_icd_path is None:
        reference_icd_path = r"C:\Users\rjone\OneDrive - Waterworksai.com\PharmaDS\NewShortCourseMaterials\VanderbiltICD_SLE_Cognitive.docx"

    # Read the documents
    reference_icd_text = read_docx(reference_icd_path)
    new_icd_text = read_docx(new_icd_path)

    # Perform sentence-level sentiment analysis
    reference_sentences = analyze_sentiment(reference_icd_text)
    new_sentences = analyze_sentiment(new_icd_text)

    # Define keywords related to risk and benefit language
    risk_benefit_keywords = ["risk", "benefit", "adverse", "side effect", "safety", "hazard"]

    # Filter sentences for risk/benefit-related language
    reference_filtered = filter_sentences_by_keywords(reference_sentences, risk_benefit_keywords)
    new_filtered = filter_sentences_by_keywords(new_sentences, risk_benefit_keywords)

    # Compute average sentiment scores for the filtered sentences (focus on risk/benefit content)
    reference_avg = average_sentiment(reference_filtered)
    new_avg = average_sentiment(new_filtered)

    _print_filtered_report(
        "=== Reference ICD (Risk/Benefit-related sentences) ===",
        "Reference ICD",
        reference_filtered,
        reference_avg,
    )
    _print_filtered_report(
        "\n=== New Sponsor ICD (Risk/Benefit-related sentences) ===",
        "New Sponsor ICD",
        new_filtered,
        new_avg,
    )

    # Also compute overall sentiment scores (across all sentences)
    reference_overall_avg = average_sentiment(reference_sentences)
    new_overall_avg = average_sentiment(new_sentences)

    print("\n=== Overall Sentiment Scores ===")
    print("Reference ICD Overall:", reference_overall_avg)
    print("New Sponsor ICD Overall:", new_overall_avg)

    # Return results needed for plotting
    return reference_filtered, new_filtered

# Run the comparison only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()





=== Reference ICD (Risk/Benefit-related sentences) ===
Sentence: If we learn something new that may affect the risks or benefits of this study, you will be told so that you can decide whether or not you still want to be in this study.
Sentiment Scores: {'neg': 0.088, 'neu': 0.844, 'pos': 0.069, 'compound': 0.0716}

Sentence: The purpose of this study is to evaluate the safety and effectiveness of the drug memantine in treating the symptoms of NPSLE over the course of three months.
Sentiment Scores: {'neg': 0.0, 'neu': 0.906, 'pos': 0.094, 'compound': 0.4215}

Sentence: This information will help to better identify patients who may benefit from memantine treatment.
Sentiment Scores: {'neg': 0.0, 'neu': 0.561, 'pos': 0.439, 'compound': 0.8225}

Sentence: At each of the three study visits, blood samples will be taken to measure laboratory values that are important for monitoring your safety.
Sentiment Scores: {'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'compound': 0.743}

Sentence: Urine sam