In [7]:
# 📦 Install required packages
import sys
!{sys.executable} -m pip install --quiet python-docx nltk matplotlib

# 📚 Imports
import docx
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("punkt", quiet=True)
nltk.download("vader_lexicon", quiet=True)

# 📂 File paths (update as needed)
# Both values are absolute Windows paths written as raw strings so the
# backslashes are taken literally; point them at your own copies of the
# documents before running the cell.
# reference_icd_path: the DOCX used as the comparison baseline.
reference_icd_path = r"C:\Users\rjone\OneDrive - Waterworksai.com\PharmaDS\NewShortCourseMaterials\VanderbiltICD_SLE_Cognitive.docx"
# new_icd_path: the DOCX whose sentiment is compared against the baseline.
new_icd_path = r"C:\Users\rjone\OneDrive - Waterworksai.com\PharmaDS\NewShortCourseMaterials\SampleStudyDocuments\icdv1.docx"

# 📘 Load DOCX
def read_docx(filepath):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        filepath: Location of the document, passed straight to
            ``docx.Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = docx.Document(filepath)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)

# 🧠 Sentiment analysis
def analyze_sentiment(text):
    """Split ``text`` into sentences and score each one with VADER.

    Args:
        text: The document text to analyse.

    Returns:
        A list of ``(sentence, scores)`` tuples in tokenizer order, where
        ``scores`` is whatever ``SentimentIntensityAnalyzer.polarity_scores``
        returns for that sentence.
    """
    scorer = SentimentIntensityAnalyzer()
    scored = []
    for sentence in nltk.sent_tokenize(text):
        scored.append((sentence, scorer.polarity_scores(sentence)))
    return scored

# 🔍 Filter by keywords
def filter_sentences(sentences, keywords):
    """Keep the scored sentences that mention at least one keyword.

    Matching is a case-insensitive substring test, so a keyword such as
    "risk" also matches "risks" and any longer word that contains it.

    Args:
        sentences: Iterable of ``(sentence, score)`` pairs.
        keywords: Iterable of keyword strings. It is consumed once up front,
            so one-shot iterables (generators, iterators) are safe to pass.

    Returns:
        A list of the ``(sentence, score)`` pairs, in input order, whose
        sentence contains any keyword.
    """
    # Normalise the keywords once instead of once per sentence; this also
    # keeps a one-shot iterable from being exhausted by the first sentences.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    matches = []
    for sentence, score in sentences:
        lowered_sentence = sentence.lower()
        if any(keyword in lowered_sentence for keyword in lowered_keywords):
            matches.append((sentence, score))
    return matches

# 📊 Average sentiment
def average_sentiment(sentences):
    """Average each sentiment component over a list of scored sentences.

    Args:
        sentences: Sequence of ``(sentence, scores)`` pairs where ``scores``
            has the keys "neg", "neu", "pos" and "compound".

    Returns:
        A dict with those four keys holding the per-key mean. When
        ``sentences`` is empty every value is 0.
    """
    components = ("neg", "neu", "pos", "compound")
    if not sentences:
        return {name: 0 for name in components}

    count = len(sentences)
    means = {}
    for name in components:
        # Plain left-to-right accumulation, one component at a time.
        running_total = 0
        for _, scores in sentences:
            running_total += scores[name]
        means[name] = running_total / count
    return means

# 🎯 Bias/Fairness scoring
def compute_bias_fairness(compound_ref, compound_new, max_deviation=0.75):
    """Turn the gap between two compound scores into a 0-100 score.

    The score falls linearly from 100 (identical compounds) to 0 (the
    compounds differ by ``max_deviation`` or more).

    Args:
        compound_ref: Average compound score of the reference document.
        compound_new: Average compound score of the document under review.
        max_deviation: Absolute difference that maps to a score of 0.
            Must be greater than zero.

    Returns:
        A ``(score, deviation)`` tuple: ``score`` is a float rounded to one
        decimal place and ``deviation`` is the unrounded absolute difference.

    Raises:
        ValueError: If ``max_deviation`` is not greater than zero.
    """
    # Written as "not >" so that NaN is rejected as well as zero/negatives;
    # zero would divide by zero and a negative value would push scores
    # above 100.
    if not max_deviation > 0:
        raise ValueError("max_deviation must be greater than zero")

    deviation = abs(compound_new - compound_ref)
    # Clamp with a float so the score has the same type on every path.
    score = max(0.0, 100 - (deviation / max_deviation * 100))
    return round(score, 1), deviation

# 📈 Plotting
def plot_sentiment_comparison(reference_filtered, new_filtered):
    """Show a grouped bar chart of compound scores for two sentence sets.

    Sentences are paired by position (first with first, and so on); the
    x-axis labels them S1, S2, ...

    Args:
        reference_filtered: ``(sentence, scores)`` pairs for the reference
            document; only ``scores['compound']`` is read.
        new_filtered: The same structure for the new document.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    bar_width = 0.35

    reference_compounds = [entry[1]['compound'] for entry in reference_filtered]
    new_compounds = [entry[1]['compound'] for entry in new_filtered]

    # Pad the shorter series with NaN so both have one value per slot
    # (presumably so that no bar is drawn for the missing sentences).
    slot_count = max(len(reference_compounds), len(new_compounds))
    reference_compounds = reference_compounds + [np.nan] * (slot_count - len(reference_compounds))
    new_compounds = new_compounds + [np.nan] * (slot_count - len(new_compounds))

    slots = range(slot_count)
    tick_labels = [f"S{number}" for number in range(1, slot_count + 1)]
    # Each slot holds two bars, shifted half a bar width to either side.
    left_positions = [slot - bar_width / 2 for slot in slots]
    right_positions = [slot + bar_width / 2 for slot in slots]

    plt.figure(figsize=(14, 6))
    plt.bar(left_positions, reference_compounds, bar_width, label='Reference ICD', color='skyblue')
    plt.bar(right_positions, new_compounds, bar_width, label='New ICD', color='salmon')

    # Dashed zero line separating positive from negative compound scores.
    plt.axhline(0, color='gray', linestyle='--')
    plt.xticks(slots, tick_labels, rotation=45)
    plt.ylabel("Compound Sentiment Score")
    plt.title("Risk/Benefit Sentence Sentiment Comparison")
    plt.legend()
    plt.tight_layout()
    plt.show()

# 🚀 Main execution
# Load both documents as plain text.
reference_text = read_docx(reference_icd_path)
new_text = read_docx(new_icd_path)

# Score every sentence of each document.
reference_sentences = analyze_sentiment(reference_text)
new_sentences = analyze_sentiment(new_text)

# Restrict the comparison to sentences that mention risk/benefit language.
keywords = ["risk", "benefit", "adverse", "side effect", "safety", "hazard"]
reference_filtered = filter_sentences(reference_sentences, keywords)
new_filtered = filter_sentences(new_sentences, keywords)

# average_sentiment() returns all zeros for an empty list, which would look
# like a neutral document and could yield a misleadingly high score below.
# Flag that case instead of reporting it silently.
if not reference_filtered:
    print("⚠️ No keyword sentences found in the reference ICD; its compound defaults to 0.")
if not new_filtered:
    print("⚠️ No keyword sentences found in the new ICD; its compound defaults to 0.")

reference_avg = average_sentiment(reference_filtered)
new_avg = average_sentiment(new_filtered)

print("📊 Reference Compound:", reference_avg['compound'])
print("📊 New ICD Compound:", new_avg['compound'])

# Convert the gap between the two average compounds into a 0-100 score.
fairness_score, deviation = compute_bias_fairness(reference_avg['compound'], new_avg['compound'])

print(f"\n🎯 Bias/Fairness Score: {fairness_score}/100")
print(f"🔍 Sentiment Deviation: {deviation:.3f}")




📊 Reference Compound: 0.055699999999999986
📊 New ICD Compound: -0.26715

🎯 Bias/Fairness Score: 57.0/100
🔍 Sentiment Deviation: 0.323
