# SOAP NOTE NLP Analysis
1. Form Dataset of the three different Doctor Notes
2. Analyze for ...
   1. Readability (Flesch Reading Ease)
   2. Structure (POS-Ratio, Length)
   3. Emotion (Sentiment Analysis)
   4. Domain (Medical Term Density)

### #1 Data Preparation

In [None]:
import pandas as pd
# Example structure: soap_merged = pd.read_json('path_to_your_JSON.json')

# Empty placeholder frame with one (initially empty) column per field;
# swap in the real notes before running the analysis cells.
soap_merged = pd.DataFrame({
    column: []
    for column in (
        'ID',
        'SOAP_Text',
        'Group',  # 1 - neutral, 2 - good, 3 - bad
        'SOAP_Subjective',
        'SOAP_Objective',
        'SOAP_Assessment',
        'SOAP_Plan',
    )
})

### #2 Analysis

In [1]:
# READABILITY
!pip install py-readability-metrics
!python -m nltk.downloader punkt

from readability import Readability

def compute_readability(text):
    """Compute Flesch-Kincaid and SMOG readability metrics for one note.

    Args:
        text: The note text; it is converted with ``str()`` before scoring.

    Returns:
        pd.Series with the entries ``fk_score``, ``fk_grade``, ``smog_score``
        and ``smog_grade``. A metric the library cannot compute is reported
        as NaN for both its score and its grade, so one short note does not
        abort a whole ``.apply`` run.
    """
    # Local import keeps this function usable without touching the cell's
    # import lines.
    from readability.exceptions import ReadabilityException

    r = Readability(str(text))
    nan = float('nan')

    def _score_and_grade(metric):
        # py-readability-metrics raises ReadabilityException when the text is
        # below the metric's minimum size (documented as 100 words, and 30
        # sentences for SMOG). Treat that as "not available".
        try:
            result = metric()
        except ReadabilityException:
            return nan, nan
        return result.score, result.grade_level

    fk_score, fk_grade = _score_and_grade(r.flesch_kincaid)
    smog_score, smog_grade = _score_and_grade(r.smog)

    return pd.Series({
        'fk_score': fk_score,
        'fk_grade': fk_grade,
        'smog_score': smog_score,
        'smog_grade': smog_grade
    })

# Score every note's full text, then attach the resulting columns to the
# main frame side by side.
readability_df = soap_merged.loc[:, 'SOAP_Text'].apply(compute_readability)
soap_merged = pd.concat(objs=[soap_merged, readability_df], axis='columns')

In [None]:
# LENGTH, POS ANALYSIS & SENTIMENT
!pip install nltk
import nltk
from nltk import word_tokenize, pos_tag
from textblob import TextBlob

# Fetch the tokenizer and POS-tagger data that word_tokenize / pos_tag need.
# Newer NLTK releases look these up under the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' names, older ones under the original
# names, so request both sets to work across versions.
for _resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

def analyze_text_features(text):
    """Extract token count, sentiment polarity and POS-tag counts from a text.

    Args:
        text: The note text; it is converted with ``str()`` before analysis.

    Returns:
        pd.Series with ``length`` (number of tokens), ``sentiment`` (TextBlob
        polarity) and one entry per POS tag found, holding that tag's count.
    """
    raw = str(text)
    words = word_tokenize(raw)

    # Tally how often each POS tag occurs among the tagged tokens.
    tag_counts = nltk.FreqDist(pos for _, pos in pos_tag(words))

    features = {
        'length': len(words),
        'sentiment': TextBlob(raw).sentiment.polarity,
    }
    # Each POS tag becomes its own entry (and later its own column).
    features.update(tag_counts)
    return pd.Series(features)

# Extract the per-note features. A POS tag that never occurs in a note has no
# entry in that note's result, so pandas leaves NaN in that tag's column; a
# missing tag means a count of zero, so fill those gaps before merging.
pos_sentiment_df = soap_merged['SOAP_Text'].apply(analyze_text_features).fillna(0)
soap_merged = pd.concat([soap_merged, pos_sentiment_df], axis=1)

In [None]:
# MEDICAL TERM DENSITY

!pip install pymetamap

from nltk.stem import WordNetLemmatizer
from pymetamap import MetaMap

# WordNetLemmatizer reads the WordNet corpus when it lemmatizes; fetch it up
# front so lemmatize() does not fail with a LookupError on a fresh setup.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize *text* and return the lemma of each lower-cased token.

    Args:
        text: Input text; it is converted with ``str()`` before tokenizing.

    Returns:
        list: One lemma per token, in token order.
    """
    lowered = (token.lower() for token in nltk.word_tokenize(str(text)))
    return list(map(lemmatizer.lemmatize, lowered))

# Provide path to your MetaMap Lite installation
# mm = MetaMap.get_instance('/path/to/metamap/lite')

def compute_medical_density(
    text,
    allowed_semtypes=('Disease or Syndrome', 'Finding', 'Procedure'),
):
    """Return the share of tokens in *text* that map to allowed medical concepts.

    The text is lemmatized with ``lemmatize_text``, passed to the module-level
    MetaMap instance ``mm`` (which must be created before calling this
    function), and the concepts carrying at least one allowed semantic type
    are counted.

    Args:
        text: Note text to analyze.
        allowed_semtypes: Semantic types a concept must carry to be counted.
            The default is a tuple so it cannot be mutated between calls.

    Returns:
        float: Number of matching concepts divided by the number of lemma
        tokens, or 0.0 when the text yields no tokens.
    """
    lemmas = " ".join(lemmatize_text(text))
    num_tokens = len(lemmas.split())
    if num_tokens == 0:
        # Nothing to annotate, so skip the MetaMap call entirely.
        return 0.0

    concepts = mm.extract_concepts([lemmas])[0]  # list of concept objects

    def _semtypes(concept):
        # Concepts without a `semtypes` attribute are treated as carrying none.
        raw = getattr(concept, 'semtypes', ())
        if isinstance(raw, str):
            # A bracketed string such as '[dsyn,fndg]' must be split into its
            # entries; iterating it directly would compare single characters.
            return [part.strip() for part in raw.strip('[]').split(',') if part.strip()]
        return raw

    # NOTE(review): pymetamap appears to report semantic types as abbreviations
    # (e.g. 'dsyn'), so the full-name defaults may match nothing - confirm
    # against real output and pass abbreviations if so.
    filtered = [
        c for c in concepts
        if any(st in allowed_semtypes for st in _semtypes(c))
    ]
    return len(filtered) / num_tokens

# soap_merged['medical_density'] = soap_merged['SOAP_Text'].apply(compute_medical_density)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One box plot per feature, each split by SOAP group and shown on its own.
for _feature, _title in (
    ('sentiment', "Sentiment by SOAP Group"),
    ('length', "Length by SOAP Group"),
):
    sns.boxplot(data=soap_merged, x='Group', y=_feature)
    plt.title(_title)
    plt.show()

In [None]:
# CLASSIFICATION (3 classes)
# Can a classifier tell, from the features extracted from a SOAP note, which doctor group the note came from?
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Feature selection - include POS counts, readability, sentiment, length, and medical density
feature_cols = [
    'length',
    'sentiment',
    'fk_score',
    'smog_score',
]  # extend with POS counts and medical_density when available

X = soap_merged[feature_cols]
y = soap_merged['Group']  # or 'Doctor_Type'

# Hold out 20% of the notes for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Train the forest on the training part, then predict the held-out notes.
clf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))