In [2]:
import sys
import os

# Make the parent directory importable so that `from src import ...` resolves.
# The membership guard keeps this cell idempotent: re-running it does not keep
# appending duplicate entries to sys.path.
project_root = os.path.abspath('../')
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from src import load_data, InsomniaClassifier, convert_output_to_json

# Location of the validation split on the machine this notebook was run on.
val_path = "/home/ubuntu/SSM/validation/combined_insomnia_data_validation.csv"

# Read the validation records and pull the note text out as a plain list,
# which is what the classification loop iterates over.
df = load_data(val_path)
notes_column = df['text']
clinical_notes = notes_column.tolist()

# Instantiate the classifier after the data is loaded (same order as before,
# so the load message is printed ahead of the model-loading output).
classifier = InsomniaClassifier()

# Accumulators for per-note predictions and their supporting text.
classification_results, extracted_texts = [], []

Successfully loaded 20 records from /home/ubuntu/SSM/validation/combined_insomnia_data_validation.csv


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on cuda


In [4]:
# Show the first five validation rows (five is the default, spelled out here).
df.head(n=5)

Unnamed: 0,note_id,text,Definition 1,Definition 2,Rule A,Rule B,Rule C,Insomnia
0,1280230,female patient in sixties prescribed no drugs\...,yes,no,no,no,no,no
1,1286616,male patient in fifties prescribed no drugs\n\...,yes,yes,yes,no,yes,yes
2,1291281,female patient in seventies prescribed no drug...,no,no,no,no,no,no
3,1293851,female patient in seventies prescribed Nitrogl...,yes,yes,yes,yes,yes,yes
4,1295870,male patient in seventies prescribed Pantopraz...,yes,yes,yes,no,yes,yes


In [6]:

# Process each clinical note with exception handling.
#
# The accumulators are (re)created in this cell on purpose: they used to be
# defined only in the data-loading cell, so running this cell a second time
# appended another full copy of the predictions and the results no longer
# lined up row-for-row with `df`.
classification_results = []
extracted_texts = []
failed_indices = []  # positions of notes that fell back to the defaults below

# Fallback values used when the classifier raises RuntimeError for a note:
# every label is "no" and every evidence field is empty.
DEFAULT_CLASSIFICATION = {
    "Definition 1 (Sleep Difficulty)": "no",
    "Definition 2 (Daytime Impairment)": "no",
    "Rule A (Insomnia Diagnosis)": "no",
    "Rule B (Primary Medications)": "no",
    "Rule C (Secondary Medications)": "no",
    "Final Insomnia Status": "no"
}
DEFAULT_EXTRACTED = {
    "Definition 1 Extracted": "",
    "Definition 2 Extracted": "",
    "Rule A Extracted": "",
    "Rule B Extracted": "",
    "Rule C Extracted": ""
}

for idx, clinical_note in enumerate(clinical_notes):
    try:
        classification, extracted = classifier.classify(clinical_note)
    except RuntimeError as e:
        print(f"RuntimeError for text at index {idx}: {e}")
        failed_indices.append(idx)
        # Copy the defaults so each row owns its own dict.
        classification = dict(DEFAULT_CLASSIFICATION)
        extracted = dict(DEFAULT_EXTRACTED)
    classification_results.append(classification)
    extracted_texts.append(extracted)

# Exactly one prediction and one evidence record per note, in note order.
assert len(classification_results) == len(clinical_notes)
assert len(extracted_texts) == len(clinical_notes)

In [7]:
import pandas as pd

# Convert results to DataFrames (one row per classified note).
df_classification = pd.DataFrame(classification_results)
df_extracted = pd.DataFrame(extracted_texts)

# pd.concat(axis=1) aligns on index labels and pads missing labels with NaN,
# so a row-count mismatch (e.g. the classification cell having been run twice)
# would silently produce rows without a note_id. Fail loudly instead.
if not (len(df_classification) == len(df_extracted) == len(df)):
    raise ValueError(
        "Row count mismatch: "
        f"{len(df)} notes, {len(df_classification)} classifications, "
        f"{len(df_extracted)} evidence records. "
        "Re-run the classification cell from a clean state."
    )

# Combine all DataFrames. The index of the `df` slice is reset so the join is
# purely positional, matching the order in which the notes were classified.
df_final = pd.concat(
    [df[['text', 'note_id']].reset_index(drop=True), df_classification, df_extracted],
    axis=1,
)

# Rename columns to match expected names for JSON conversion.
# NOTE(review): "Rule A Extracted" is not renamed here and keeps its original
# name in the output - confirm that convert_output_to_json does not expect a
# "Rule A Evidence" column.
df_final = df_final.rename(columns={
    "Definition 1 (Sleep Difficulty)": "Definition 1 Pred",
    "Definition 2 (Daytime Impairment)": "Definition 2 Pred",
    "Rule A (Insomnia Diagnosis)": "Rule A Pred",
    "Rule B (Primary Medications)": "Rule B Pred",
    "Rule C (Secondary Medications)": "Rule C Pred",
    "Final Insomnia Status": "Insomnia Pred",
    "Definition 1 Extracted": "Definition 1 Evidence",
    "Definition 2 Extracted": "Definition 2 Evidence",
    "Rule B Extracted": "Rule B Evidence",
    "Rule C Extracted": "Rule C Evidence"
})

In [8]:
# Re-display the first five rows of the input frame (five is the default).
df.head(n=5)

Unnamed: 0,note_id,text,Definition 1,Definition 2,Rule A,Rule B,Rule C,Insomnia
0,1280230,female patient in sixties prescribed no drugs\...,yes,no,no,no,no,no
1,1286616,male patient in fifties prescribed no drugs\n\...,yes,yes,yes,no,yes,yes
2,1291281,female patient in seventies prescribed no drug...,no,no,no,no,no,no
3,1293851,female patient in seventies prescribed Nitrogl...,yes,yes,yes,yes,yes,yes
4,1295870,male patient in seventies prescribed Pantopraz...,yes,yes,yes,no,yes,yes


In [9]:
output_csv_dir = "/home/ubuntu/Desktop/SMM4H-2025/notebooks/results/validation"

# to_csv does not create missing parent directories, so make sure the results
# folder exists before writing (no-op when it is already there).
os.makedirs(output_csv_dir, exist_ok=True)

# Save CSV to results folder (index omitted so the file has only data columns).
csv_output_path = os.path.join(output_csv_dir, "output.csv")
df_final.to_csv(csv_output_path, index=False)

In [10]:
# Generate JSON outputs from the CSV written in the previous cell.
# NOTE(review): in the recorded run this call reported saving
# results/subtask_1.json, results/subtask_2a.json and results/subtask_2b.json,
# whereas the inspection cell further down reads from results/validation/ -
# confirm which directory holds the files produced by this run.
convert_output_to_json(csv_output_path)

JSON file saved at results/subtask_1.json
JSON file saved at results/subtask_2a.json
JSON file saved at results/subtask_2b.json


In [15]:
import json

# JSON files to inspect, one per subtask.
paths = [
    "results/validation/subtask_1.json",
    "results/validation/subtask_2a.json",
    "results/validation/subtask_2b.json"
]

# For each file, print a header followed by at most five entries. A top-level
# list is shown item by item; a top-level dict is shown as single-key dicts in
# insertion order; anything else is reported as unsupported.
for path in paths:
    print(f"\nContents of {path} (first 5 entries):")
    with open(path, "r") as f:
        data = json.load(f)

    if isinstance(data, list):
        preview = data[:5]
    elif isinstance(data, dict):
        preview = [{k: v} for k, v in list(data.items())[:5]]
    else:
        print("Unsupported JSON structure.")
        continue

    for entry in preview:
        print(entry)



Contents of results/validation/subtask_1.json (first 5 entries):
{'1280230.0': {'Insomnia': 'yes'}}
{'1286616.0': {'Insomnia': 'yes'}}
{'1291281.0': {'Insomnia': 'no'}}
{'1293851.0': {'Insomnia': 'yes'}}
{'1295870.0': {'Insomnia': 'yes'}}

Contents of results/validation/subtask_2a.json (first 5 entries):
{'1280230.0': {'Definition 1': 'yes', 'Definition 2': 'yes', 'Rule A': 'yes', 'Rule B': 'no', 'Rule C': 'no'}}
{'1286616.0': {'Definition 1': 'no', 'Definition 2': 'yes', 'Rule A': 'no', 'Rule B': 'no', 'Rule C': 'yes'}}
{'1291281.0': {'Definition 1': 'no', 'Definition 2': 'yes', 'Rule A': 'no', 'Rule B': 'no', 'Rule C': 'no'}}
{'1293851.0': {'Definition 1': 'yes', 'Definition 2': 'yes', 'Rule A': 'yes', 'Rule B': 'yes', 'Rule C': 'yes'}}
{'1295870.0': {'Definition 1': 'yes', 'Definition 2': 'yes', 'Rule A': 'yes', 'Rule B': 'no', 'Rule C': 'no'}}

Contents of results/validation/subtask_2b.json (first 5 entries):
{'1280230.0': {'Definition 1': {'label': 'yes', 'text': ['unable to slee