# Start Here

In [None]:
!pip install openai tiktoken

import json
import os
from google.colab import auth, userdata, files
from google.auth import default
from openai import OpenAI
import tiktoken

# OpenAI setup
# The API key is fetched through google.colab's `userdata.get('OPEN_AI')` instead of being
# written into the notebook. `client` is a module-level handle: the analyze_post cells
# further down call `client.chat.completions.create(...)` on it.
client = OpenAI(api_key=userdata.get('OPEN_AI'))

# Function to count tokens
def count_tokens(text):
    """Return the number of tokens `text` encodes to.

    The encoding is the one tiktoken resolves for the model name
    "gpt-4o-mini-2024-07-18".
    """
    token_ids = tiktoken.encoding_for_model("gpt-4o-mini-2024-07-18").encode(text)
    return len(token_ids)



In [None]:
# step1_prepare_data.py
import pandas as pd
import json

def load_and_prepare_data(csv_path='Adjudicated_100_dataset.csv',
                          output_path='prepared_dataset.json'):
    """Convert the posts CSV into a JSON list of ``{'post_id', 'body'}`` records.

    Every CSV row yields one record whose ``post_id`` is the row's DataFrame
    index rendered as a string and whose ``body`` is the text of the ``body``
    column. Missing ``body`` cells are stored as empty strings and all bodies
    are coerced to ``str`` so that the JSON file contains only valid string
    values (a NaN would otherwise be written as the non-standard literal
    ``NaN`` and later break string slicing of the body).

    Args:
        csv_path: CSV file to read; must contain a ``body`` column.
        output_path: Destination of the JSON dump (written with ``indent=2``).

    Returns:
        The list of record dicts, or ``None`` if reading, converting or
        writing failed (the error message is printed, not raised).
    """
    try:
        df = pd.read_csv(csv_path)
        print(f"Successfully loaded CSV with {len(df)} rows")

        # Normalise the text column once, vectorised, instead of per row.
        bodies = df['body'].fillna('').astype(str)
        dataset = [
            {'post_id': str(index), 'body': body}
            for index, body in bodies.items()
        ]

        # Save to JSON for inspection
        with open(output_path, 'w') as f:
            json.dump(dataset, f, indent=2)

        print(f"Saved {len(dataset)} posts to {output_path}")
        return dataset

    except Exception as e:
        # Deliberate best-effort: callers receive None instead of an exception.
        print(f"Error preparing data: {str(e)}")
        return None

# Run the preparation step only when this cell/script is executed directly
# (i.e. __name__ is "__main__"), not when the code is imported as a module.
if __name__ == "__main__":
    load_and_prepare_data()

Successfully loaded CSV with 100 rows
Saved 100 posts to prepared_dataset.json


# Baseline (annotation-guideline prompt with structured JSON output)

In [None]:
import json
from openai import OpenAI
from tqdm.notebook import tqdm

def analyze_post(post_id, post_body):
    """Ask the chat model to annotate one Reddit post and return the parsed reply.

    Sends a system message containing "Annotation Guideline 2.0" plus a user
    message with the post id and body to ``client.chat.completions.create``
    (``client`` is the module-level OpenAI client), requesting a reply that
    conforms to the strict JSON schema ``cancer_post_analysis_limited``.

    The guideline text describes 19 variables, while the schema requests
    ``post_id`` plus 16 of them: Tone, Context and Misinformation appear in the
    prompt only and are not part of the returned object.

    Args:
        post_id: Identifier of the post; interpolated into the log lines and
            the user message.
        post_body: Text of the post. It is sliced with ``[:100]`` for logging,
            so it is presumably a ``str`` — NOTE(review): that slice happens
            before the ``try`` block, so a non-sliceable value raises here
            instead of returning ``None``.

    Returns:
        The dict produced by ``json.loads`` on the first choice's message
        content, or ``None`` if the API call or the JSON parsing raised (the
        error is printed).
    """
    print(f"Analyzing post_id: {post_id}")
    print(f"body (first 100 chars): {post_body[:100]}...")

    try:
        response = client.chat.completions.create(
    model="gpt-4o-mini-2024-07-18",
    messages=[{
        "role": "system",
        "content": """You are analyzing Reddit posts about GLP1 medications and cancer discussions. Follow these detailed guidelines for each variable:
                Annotation Guideline 2.0

                For all the variables, we are shifting to an actual 'mention of' approach, so please evaluate the text specifically rather than the broader context or post.
                Please do not change the variables without discussing with me first.

                1. Inclusion

                Definition: Include instances discussing GLP1 medication in the context of cancer. (Unchanged from before)
                Labeling: Yes or No, depending on whether the post clearly discusses GLP1 medications in relation to cancer.

                2. Exclusion_reason

                Definition: Reason for excluding an instance. (Unchanged from before)
                Labeling: Examples in drop down.

                3. Is_survivor

                Definition: Indicates if the post mentions a cancer survivor.
                Labeling: Yes or No, depending on whether the individual mentions surviving cancer.
                Example: "I've had cancer treatment" -> Yes.

                4. Is_survivor_and_taking_med

                Definition: Indicates if a cancer survivor is also taking GLP1 medication.
                Labeling: Yes or No.

                5. Family_cancer_history

                Definition: Mentions of family history of cancer.
                Labeling: Yes or No.

                6. Cancer_type
                Definition: Type of cancer mentioned.
                Labeling: Pancreatic, Thyroid, Breast, Gyn, Other, No type mentioned.

                7. Other_cancer_type
                Definition: Mentions of cancer types not otherwise listed or unusual cancers.
                *Labeling: *if an unusual or unlisted type of cancer is mentioned, type in string.

                8. Is_survivor_weight_loss

                Definition: Indicates if the survivor experienced weight loss.
                Labeling: Yes or No, depending on whether weight loss is explicitly mentioned in connection to being a survivor.

                9. Cancer_diagnosis_after_medication
                Definition: Mentions of cancer diagnosis occurring AFTER starting GLP1 medication.
                Labeling: Yes or No.

                10. Mentions_cancer_risk

                Definition: Mentions related to increased cancer risk and GLP1 medication. Does not need to be related to humans; if they mention rats or other animals, label as Yes. If they mention a decrease in cancer risk, label as No (note: another variable exists for this).
                Labeling: Yes or No.

                11. Concerned_about_cancer_risk
                Definition: Mentions concern regarding cancer risk (e.g., 'I'm not concerned,' 'I am concerned').

                Labeling: Yes or No.
                12. Seeking_cancer_risk_data

                Definition: The text mentions the author seeking information or data on cancer risk.
                Labeling: Yes or No.

                13. Can_assess_misinformation
                Definition: If the content is a personal anecdote, label as No. If making generalizable comments (e.g., 'GLP1 never causes cancer,' 'Obesity causes cancer'), label as Yes.
                Labeling: Yes or No.

                14. Sentiment_score

                Definition: Emotional sentiment of the post, not the general sentiment towards the medication.
                Labeling: Assign a sentiment score of -1 (negative), 0 (neutral), or 1 (positive).

                15. Discussed_risk_with_physician
                Definition: Mentions discussing cancer risk with a healthcare professional, even if the post is recommending someone talks to their own physician about it.
                Labeling: Yes or No

                16. Tone
                Definition: Overall tone of the discussion.
                Labeling: Select tone from a dropdown menu (e.g., extremely concerned, somewhat concerned, neutral, dismissive of concern, reassuring/positive, angry/frustrated).

                17. Context

                Definition: Provides context for the discussion (e.g., personal story, research, news).
                Labeling: Select context from a dropdown menu (e.g., General Inquiry, Personal Experience, Family/Friend Experience, Information from Health Organization (FDA, CDC, etc.), Information from Scientific Publication, Information from Social Media, Information from Television, Information from Other Internet Source).

                18. Misinformation

                Definition: Presence of misinformation regarding GLP1 and cancer.
                Labeling: Select from dropdown menu:
                ·   	N/A
                ·   	0 - Factually correct information
                ·   	1 - Mostly correct
                ·   	2 - Mixed information
                ·   	3 - Mostly incorrect
                ·   	4 - Completely incorrect
                ·   	5 - Deliberate misinformation

                19. discussion_GLP1_decreasing_cancer_risk

                Definition: Mentions that GLP1 may reduce cancer risk.
                Labeling: Yes or No.
                """
                }, {
        "role": "user",
        "content": f"Analyze this post:\n\nPost ID: {post_id}\n\nPost Body: {post_body}"
    }],
    # Sampling settings: low temperature, no frequency/presence penalties.
    temperature=0.1,
    # NOTE(review): a reply cut off at this cap would presumably not be complete JSON
    # and would then end up in the except branch below — confirm 512 is sufficient.
    max_tokens=512,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    # Structured output: strict schema, every property required, no extra keys.
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "cancer_post_analysis_limited",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "post_id": {
                        "type": "string",
                        "description": "Unique identifier for the Reddit post"
                    },
                    "inclusion": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Whether to include the post based on cancer mentions"
                    },
                    "exclusion_reason": {
                        "type": "string",
                        "description": "Reason for exclusion if applicable"
                    },
                    "is_survivor": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "If the post indicates a cancer survivor"
                    },
                    "is_survivor_and_taking_med": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Indicates if a cancer survivor is also taking GLP1 medication"
                    },
                    "family_cancer_history": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Mentions of family cancer history"
                    },
                    # NOTE(review): the enum spells "Gyn cancer" with a lowercase "c" while the
                    # other entries use "Cancer" — confirm this matches the human label values.
                    "cancer_type": {
                        "type": "string",
                        "enum": ["Pancreatic Cancer", "Thyroid Cancer", "Breast Cancer", "Gyn cancer", "Other", "No type mentioned"],
                        "description": "Type of cancer mentioned"
                    },
                    "other_cancer_type": {
                        "type": "string",
                        "description": "If Other cancer type, specify here"
                    },
                    "is_survivor_weight_loss": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Weight loss after cancer diagnosis"
                    },
                    "cancer_diagnosis_after_medication": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Cancer diagnosed after starting medication"
                    },
                    "mentions_cancer_risk": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Mentions related to increased cancer risk and GLP1 medication"
                    },
                    "concerned_about_cancer_risk": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Expresses concern about cancer risk"
                    },
                    "seeking_cancer_risk_data": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Seeking information about cancer risk"
                    },
                    "can_assess_misinformation": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Can assess for misinformation"
                    },
                    "sentiment_score": {
                        "type": "integer",
                        "enum": [-1, 0, 1],
                        "description": "Sentiment score (-1 to 1)"
                    },
                    "discussed_risk_with_physician": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Mentions discussing cancer risk with healthcare professional"
                    },
                    "discussion_GLP1_decreasing_cancer_risk": {
                        "type": "string",
                        "enum": ["Yes", "No"],
                        "description": "Mentions GLP1 reducing cancer risk"
                    }
                },
                "required": [
                    "post_id",
                    "inclusion",
                    "exclusion_reason",
                    "is_survivor",
                    "is_survivor_and_taking_med",
                    "family_cancer_history",
                    "cancer_type",
                    "other_cancer_type",
                    "is_survivor_weight_loss",
                    "cancer_diagnosis_after_medication",
                    "mentions_cancer_risk",
                    "concerned_about_cancer_risk",
                    "seeking_cancer_risk_data",
                    "can_assess_misinformation",
                    "sentiment_score",
                    "discussed_risk_with_physician",
                    "discussion_GLP1_decreasing_cancer_risk"
                ],
                "additionalProperties": False
            }
        }
    }
)

        # Log the raw reply, then parse it; a parse failure is handled by the except below.
        print(f"Raw API response: {response.choices[0].message.content}")
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Error processing post {post_id}: {str(e)}")
        return None

def process_posts(batch_size=10, input_path='prepared_dataset.json',
                  output_path='analysis_results.json'):
    """Run ``analyze_post`` over every prepared post and persist the analyses.

    Posts are handled in consecutive batches of ``batch_size``; after each
    batch the accumulated results are rewritten to ``output_path`` so that an
    interrupted run still leaves the analyses gathered so far on disk.

    Args:
        batch_size: Number of posts per batch; must be at least 1.
        input_path: JSON file holding a list of ``{'post_id', 'body'}`` dicts.
        output_path: JSON file the results list is written to.

    Returns:
        List of analysis dicts, each extended with an ``original_post`` key
        containing the post body. Posts for which ``analyze_post`` returned a
        falsy value are left out and their ids are listed in the final summary.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate before touching any file: a zero step would make range() raise an
    # unrelated-looking error and a negative one would silently process nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Load the JSON data
    with open(input_path, 'r') as file:
        data = json.load(file)

    results = []
    failed_post_ids = []
    total_batches = (len(data) + batch_size - 1) // batch_size

    def save_results():
        # Rewrite the whole file each time; it always mirrors `results`.
        with open(output_path, 'w') as file:
            json.dump(results, file, indent=2)

    for batch_number, start in enumerate(range(0, len(data), batch_size), start=1):
        print(f"\nProcessing batch {batch_number} of {total_batches}")

        for post in data[start:start + batch_size]:
            analysis = analyze_post(post['post_id'], post['body'])
            if analysis:
                analysis['original_post'] = post['body']
                results.append(analysis)
            else:
                failed_post_ids.append(post['post_id'])

        # Save intermediate results after each batch
        save_results()

    if not data:
        # No batch ran, so nothing was written yet; still produce the output file
        # so downstream steps do not pick up a stale or missing results file.
        save_results()

    print(f"\nAnalysis complete. Processed {len(results)} out of {len(data)} posts")
    if failed_post_ids:
        print(f"Posts without a usable analysis: {failed_post_ids}")
    print(f"Results saved to {output_path}")
    return results

# Start the full annotation run (default batch size) only when this cell/script
# is executed directly, not when the code is imported as a module.
if __name__ == "__main__":
    process_posts()

import pandas as pd
import json
import numpy as np

# Merge step: attach the columns of the adjudicated CSV to each LLM analysis,
# matching rows on the lower-cased, stripped post text.

# File paths
csv_file_path = 'Adjudicated_100_dataset_clean.csv'
json_file_path = 'analysis_results.json'
merged_json_path = 'merged_analysis_results.json'

# Load the CSV file, replacing NaN with empty strings
csv_data = pd.read_csv(csv_file_path).replace(np.nan, "", regex=True)

# Load the JSON file
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Add .llm suffix to keys except 'original_post' and 'post_id'
modified_json_data = [
    {
        (key if key in ('original_post', 'post_id') else f"{key}.llm"): value
        for key, value in post.items()
    }
    for post in json_data
]

# Standardize text for matching
csv_data['body_cleaned'] = csv_data['body'].str.lower().str.strip()

# Create a dictionary from the CSV for quick lookup.
# to_dict(orient='index') raises on a non-unique index, so rows whose cleaned body
# repeats an earlier row are left out of the lookup (first occurrence wins).
duplicate_bodies = csv_data['body_cleaned'].duplicated(keep='first')
if duplicate_bodies.any():
    print(f"Warning: {int(duplicate_bodies.sum())} CSV rows repeat an earlier post body; "
          "only the first occurrence of each is used for matching")
csv_lookup = (
    csv_data[~duplicate_bodies]
    .set_index('body_cleaned')
    .to_dict(orient='index')
)

# Merge CSV data into JSON based on matching text.
# Note: update() lets CSV columns overwrite same-named keys already in the post.
unmatched_post_ids = []
for post in modified_json_data:
    match_data = csv_lookup.get(post['original_post'].lower().strip())
    if match_data:
        post.update(match_data)
    else:
        # Keep track of posts that will carry no CSV columns downstream.
        unmatched_post_ids.append(post.get('post_id'))

# Clean up temporary fields
csv_data = csv_data.drop(columns=['body_cleaned'])

# Save the merged data
with open(merged_json_path, 'w') as file:
    json.dump(modified_json_data, file, indent=2)

print(f"Merged data saved to {merged_json_path}")
if unmatched_post_ids:
    print(f"Warning: {len(unmatched_post_ids)} of {len(modified_json_data)} posts had no "
          f"matching CSV row (post_ids: {unmatched_post_ids})")

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

def calculate_metrics(y_true, y_pred):
    """Calculate metrics for categorical variables.

    Args:
        y_true: Reference (human) labels.
        y_pred: Predicted (LLM) labels, aligned with ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        use ``average='weighted'`` with ``zero_division=0``. If any metric
        cannot be computed, the reason is printed and
        ``(None, None, None, None)`` is returned.
    """
    weighted = {'average': 'weighted', 'zero_division': 0}
    try:
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, **weighted)
        recall = recall_score(y_true, y_pred, **weighted)
        f1 = f1_score(y_true, y_pred, **weighted)
        return accuracy, precision, recall, f1
    except Exception as e:
        # Catch Exception rather than everything, so Ctrl-C / SystemExit still
        # propagate, and report why the metrics are missing instead of hiding it.
        print(f"Could not compute metrics: {e}")
        return None, None, None, None

# Evaluation step: compare every '<variable>.llm' column with its '<variable>.human'
# counterpart, store per-variable metrics and list each disagreement.

# Load the JSON data
with open('merged_analysis_results.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Initialize results dictionary
metrics_results = {}
mismatches = []
mismatch_counts = {}

LLM_SUFFIX = '.llm'
HUMAN_SUFFIX = '.human'


def _mismatch_records(frame, variable, llm_col, human_col):
    """Build one mismatch record per row of `frame` for the given variable."""
    return [
        {
            'post_id': row['post_id'],
            'variable': variable,
            'llm_value': row[llm_col],
            'human_value': row[human_col],
            'original_text': row['original_post'],
        }
        for _, row in frame.iterrows()
    ]


# Get all pairs of corresponding columns (llm and human).
# Only the trailing suffix is swapped, so a '.llm' elsewhere in a name is left alone.
llm_cols = [col for col in df.columns if col.endswith(LLM_SUFFIX)]
human_cols = [col[:-len(LLM_SUFFIX)] + HUMAN_SUFFIX for col in llm_cols]

# First calculate metrics for inclusion, using the same "present in both columns"
# filter that is applied to every other variable below.
inclusion_valid = df['inclusion.llm'].notna() & df['inclusion.human'].notna()
inclusion_metrics = calculate_metrics(
    df.loc[inclusion_valid, 'inclusion.human'],
    df.loc[inclusion_valid, 'inclusion.llm']
)
metrics_results['inclusion'] = {
    'Accuracy': inclusion_metrics[0],
    'Precision': inclusion_metrics[1],
    'Recall': inclusion_metrics[2],
    'F1': inclusion_metrics[3]
}

# Find inclusion mismatches
inclusion_mismatches = df[inclusion_valid & (df['inclusion.llm'] != df['inclusion.human'])]
mismatch_counts['inclusion'] = len(inclusion_mismatches)
mismatches.extend(
    _mismatch_records(inclusion_mismatches, 'inclusion', 'inclusion.llm', 'inclusion.human')
)

# Filter for posts where inclusion.human is "Yes"
included_df = df[df['inclusion.human'] == 'Yes']

# Calculate metrics for other variables only on included posts
for llm_col, human_col in zip(llm_cols, human_cols):
    # Skip variables without a human column, the inclusion columns (handled above)
    # and the sentiment score, which is numerical.
    if human_col not in df.columns or 'inclusion' in llm_col or 'sentiment_score' in llm_col:
        continue

    # Get valid rows (not null in both columns)
    valid_mask = included_df[llm_col].notna() & included_df[human_col].notna()
    y_true = included_df.loc[valid_mask, human_col]
    y_pred = included_df.loc[valid_mask, llm_col]

    # Calculate metrics
    accuracy, precision, recall, f1 = calculate_metrics(y_true, y_pred)

    # Store results
    base_col = llm_col[:-len(LLM_SUFFIX)]
    metrics_results[base_col] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }

    # Find mismatches
    mismatch_mask = (included_df[llm_col] != included_df[human_col]) & valid_mask
    mismatch_df = included_df[mismatch_mask][['post_id', llm_col, human_col, 'original_post']]

    # Count mismatches for this variable
    mismatch_counts[base_col] = len(mismatch_df)
    mismatches.extend(_mismatch_records(mismatch_df, base_col, llm_col, human_col))

# Create results DataFrame
metrics_df = pd.DataFrame(metrics_results).T
metrics_df['Mismatches'] = pd.Series(mismatch_counts)
metrics_df = metrics_df.round(3)

# Create mismatches DataFrame
mismatches_df = pd.DataFrame(mismatches)

# Save results
metrics_df.to_csv('metrics_summary.csv')
mismatches_df.to_csv('mismatches.csv', index=False)

# Print summary
print("\nMetrics Summary:")
print(metrics_df)

print("\nMismatch Counts per Variable:")
for var, count in mismatch_counts.items():
    print(f"{var}: {count} mismatches")

print("\nTotal number of mismatches:", len(mismatches))
print("Included posts:", len(included_df))
print("Total posts:", len(df))


Processing batch 1 of 10
Analyzing post_id: 0
body (first 100 chars): My husband had papillary thyroid cancer and he takes Wegovy....
Raw API response: {"post_id":"0","inclusion":"Yes","exclusion_reason":"","is_survivor":"Yes","is_survivor_and_taking_med":"Yes","family_cancer_history":"No","cancer_type":"Thyroid Cancer","other_cancer_type":"","is_survivor_weight_loss":"No","cancer_diagnosis_after_medication":"No","mentions_cancer_risk":"No","concerned_about_cancer_risk":"No","seeking_cancer_risk_data":"No","can_assess_misinformation":"No","sentiment_score":0,"discussed_risk_with_physician":"No","discussion_GLP1_decreasing_cancer_risk":"No"}
Analyzing post_id: 1
body (first 100 chars): How many drugs have noted dangerous side effects. From giving you cancer to heart attacks. I am pret...
Raw API response: {"post_id":"1","inclusion":"No","exclusion_reason":"Does not discuss GLP1 medications in relation to cancer","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_histor

# **Fine-tuned prompt** (step-by-step instructions and worked examples, same base model)

In [None]:
import json
from openai import OpenAI
from tqdm.notebook import tqdm

def analyze_post(post_id, post_body):
    print(f"Analyzing post_id: {post_id}")
    print(f"body (first 100 chars): {post_body[:100]}...")

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
               {
                    "role": "system",
                    "content": """You are analyzing Reddit posts about GLP1 medications and cancer discussions. Follow these detailed guidelines for each variable:
When analyzing each post, follow these sequential reasoning steps:

1. INCLUSION CHECK (Primary Gateway):
   - First, scan for ANY mention of cancer (treatment, risk, research, personal story, etc.)
   - Does the post discuss cancer in any context? (personal, family, research, risk, screening)
   - Exclude ONLY if cancer is purely metaphorical or completely absent
   - If uncertain, default to inclusion
   - Remember: Even general statements about cancer risks count for inclusion

2. CANCER TYPE IDENTIFICATION:CANCER TYPE IDENTIFICATION:
  - Scan for cancer mentions and categorize:
      Thyroid mentions → "Thyroid Cancer" (use low threshold)
      Breast → "Breast Cancer"
      Pancreatic/Pancreas → "Pancreatic Cancer"
      Gynecologic (uterine, ovarian, cervical) → "Gyn Cancer"
      Any other specific cancer → "Other" (specify in other_cancer_type)
      No specific type found → "No Type Mentioned"
  - Key hints:
      Surgery mentions can indicate type (thyroidectomy → "Thyroid Cancer")
      Histology terms help (papillary, medullary → "Thyroid Cancer")
      If thyroid is mentioned and post is included, default to "Thyroid Cancer"
      If specific disease site think of the general cancer type (anal cancer → "Other" → "GI Cancer", leukemia → "Other" → "Blood Cancer")

3. RISK ASSESSMENT LAYERS:
  3.1)CANCER RISK ASSESSMENT:
        a) Direct Risk Mentions:

        Look for explicit discussions (e.g., "risk", "chance", "likelihood")
        Check for warning labels and side effect discussions
        Remember: Research/studies/statistics all count

        b) Implied Risk Language:

        Weight/obesity related cancer discussions
        Prevention and screening discussions
        Family history mentioned with future concerns
        Remember: Words like "worried about getting" or "afraid of developing" signal risk

        c) Non-Risk Cancer Talk:

        Current/past cancer experiences aren't risk
        Treatment discussions without future implications
        General cancer support/emotional content
        Remember: Focus on FUTURE possibility of cancer

        d) Common Risk Contexts:

        Medication warnings (especially thyroid)
        Obesity as risk factor
        Family history implications
        Remember: Risk needs future context to be assessed

   3.2) Concern About Risk:
      - Look for emotional language (worried, scared, anxious)
      - Check for information-seeking behavior
      - Consider family history context
      - Default to NO if purely informational

   3.3) Risk Information Seeking:
      - Look for questions about studies/research
      - Check for requests for others' experiences
      - Note requests for medical guidance

4.1 SURVIVOR STATUS:
   - Look for mentions of:
     * Past/current treatment
     * Surgery history
     * Cancer recovery
     * Current monitoring
   - Remember: Current treatment counts as survivor

4.2 SURVIVOR STATUS AND TAKING MEDICATION:
   - If the survivor status is yes then look at any indication in the post that indicated being on the medication
   - If mention of weight loss then they probably are on the medication so if you find that and yes for survivor label as yes
   - Look for verbs that indicate the poster is taking the medication
   - Remember: Current treatment counts as survivor


4.3 SURVIVOR AND WEIGHT LOSS:
  - If survivor status is yes, then look at indication of wanting to lose weight
  - Tense doesnt matter, meaning if interested in the future in taking the med to lose weight then this counts
  - Remember: Needs to be a survivor based on 4.1

5. PHYSICIAN DISCUSSION:
   - Must specifically discuss cancer risk with healthcare provider
   - Look for physician input about cancer risks
   - General medical mentions don't count
   - Future plans to discuss do count
   - Think about it that way: Did the post mention they are going to talk to or talked to a physician? If Yes, then what are they talking about? cancer risk or general risks imply a discussion about cancer risk


6. MISINFORMATION ASSESSMENT:
   - Look for absolute claims about causation
   - Check for overgeneralization of research findings
   - Personal experiences aren't misinformation
   - Consider scientific accuracy of risk statements
   - Remember: Sharing factual research isn't misinformation

7. TEMPORAL RELATIONSHIPS:
   - Note any cancer diagnoses after starting medication
   - Look for suggested connections between medication and cancer
   - Consider timing mentions even if causation isn't claimed

8.1 MENTION OF RISK:
  - Note any mention of risk explicit or implicit related to cancer should be included
  - If mention of cancer, focus on any indication in the text related to risk.
  - This is likely to be positive if other cancer related factors are positive so exercise caution

8.2 CONCERNED ABOUT RISK:
  - if the mention of risk is positive, then you should note if they are concerned or worried or synonyms about that
  - Remember being concerned can be implicit in the wording and phrasing

9. DISCUSSION RELATED TO DECREASING CANCER RISK
  - Note if the post makes any claim regarding being on the medication decreases the risk of cancer, direcly or through weight loss

This systematic analysis should be applied to EACH post before making final classifications.

# EXAMPLE ANALYSES TO GUIDE YOUR EVALUATION

Example 1: "I was diagnosed with thyroid cancer 3 months after starting Ozempic. My doctor isn't sure if there's any connection, but I'm working with my oncologist to understand if I can continue the medication. My sister also had breast cancer, so I'm extra cautious about any risks."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "Yes",
    "is_survivor_and_taking_med": "Yes",
    "family_cancer_history": "Yes",
    "cancer_type": "Thyroid Cancer",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "No",
    "cancer_diagnosis_after_medication": "Yes",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "seeking_cancer_risk_data": "Yes",
    "can_assess_misinformation": "No",
    "sentiment_score": -1,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Comprehensive example showing temporal relationship with medication, family history, medical consultation, and personal experience without making broader claims.

Example 2: "Studies prove GLP1s definitely cause cancer in everyone who takes them - just look at the rat studies! Don't trust doctors who say otherwise. These drugs are killing people and Big Pharma is hiding the truth."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "No",
    "is_survivor_and_taking_med": "No",
    "family_cancer_history": "No",
    "cancer_type": "No Type Mentioned",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "No",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "seeking_cancer_risk_data": "No",
    "can_assess_misinformation": "Yes",
    "sentiment_score": -1,
    "discussed_with_physician": "No",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Classic misinformation example making absolute claims without evidence, over-generalizing animal studies.

Example 3: "I put on so much weigh after my history with breast cancer, but I've lost 50 pounds on Wegovy. My oncologist actually recommended it since obesity increases cancer recurrence risk. The weight loss has helped reduce my inflammation markers."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "Yes",
    "family_cancer_history": "No",
    "cancer_type": "Breast Cancer",
    "other_cancer_type": "",
    "is_survivor_and_taking_med": "Yes",
    "is_survivor_weight_loss": "Yes",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "No",
    "seeking_cancer_risk_data": "No",
    "can_assess_misinformation": "No",
    "sentiment_score": 1,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "Yes"
}
Reasoning: Shows positive medical guidance, personal experience with specific cancer type, and therapeutic benefit without making general claims.

Example 4: "Anyone else worried about the thyroid cancer warning? My dad had thyroid cancer and I'm scared to start, but my doctor says that was papillary type and the warning is about medullary which is different."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "No",
    "is_survivor_and_taking_med": "No",
    "family_cancer_history": "Yes",
    "cancer_type": "Thyroid Cancer",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "No",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "seeking_cancer_risk_data": "Yes",
    "can_assess_misinformation": "Yes",
    "sentiment_score": -1,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Shows concern about specific cancer types, family history influence, and medical consultation for risk assessment.

Example 5: "Fighting cancer and gained weight from the treatments. Looking forward to beating this and then trying Mounjaro to get healthy again. My medical team says I need to wait until treatment is done."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "Yes",
    "is_survivor_and_taking_med": "No"
    "family_cancer_history": "No",
    "cancer_type": "No Type Mentioned",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "Yes",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "No",
    "concerned_about_cancer_risk": "No",
    "seeking_cancer_risk_data": "No",
    "can_assess_misinformation": "No",
    "sentiment_score": 0,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Current cancer patient discussing medication timing, showing medical guidance without risk discussion.

# Extra examples to help

# EXAMPLE ANALYSES TO GUIDE YOUR EVALUATION

Example 6: "My doctor mentioned the thyroid cancer warning but said it was only in mice. I have papillary thyroid nodules that get monitored regularly, but he said that's different from the medullary thyroid cancer in the studies."
{
    "inclusion": "Yes",  # Any mention of cancer, including study findings
    "cancer_type": "Thyroid Cancer",  # Specific cancer type with histology
    "mentions_cancer_risk": "Yes",  # Discusses research findings
    "concerned_about_cancer_risk": "No",  # Informational, not expressing worry
    "discussed_with_physician": "Yes",
    "is_survivor": "No"
}
Reasoning: Shows proper inclusion for any cancer mention, even when discussing research/animal studies without personal concern.

Example 7: "Everything seems to cause cancer these days, but I'm especially worried about starting Wegovy with my family history of thyroid problems. Going to ask my doctor about screening first."
{
    "inclusion": "Yes",  # General cancer discussion plus medication context
    "cancer_type": "Thyroid Cancer",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",  # Shows anxiety and seeking screening
    "discussed_with_physician": "No",  # Plans to discuss but hasn't yet
    "is_survivor": "No"
}
Reasoning: Captures inclusion for general cancer discussion and concerned_about_cancer_risk for screening questions.

Example 8: "After my thyroidectomy for cancer, I talked to my physician about GLP1s. Looking at research papers about cancer risk in patients with history - anyone else find good studies?"
{
    "inclusion": "Yes",  # Cancer history + research discussion
    "cancer_type": "Thyroid Cancer",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",  # Actively seeking risk information
    "discussed_with_physician": "Yes",
    "is_survivor": "Yes"
}
Reasoning: Shows inclusion for research discussion and concern indicated by information seeking.

Example 9: "My GP discussed with me the risk of pancereatitis. In regard to cancer, animal studies show cancer risks but human data is different. Looking at population studies, obesity probably increases cancer risk more than these medications."
{
    "inclusion": "Yes",  # Discussion of cancer research findings
    "cancer_type": "No Type Mentioned",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "No",  # Purely informational discussion
    "discussed_with_physician": "No", # Discussed with physician but not cancer related risk discussion
    "is_survivor": "No"
}
Reasoning: Shows inclusion for research discussion without personal concern.

Example 10: "My husband had papillary cancer and takes the medication, he discussed with his physician that cancer risk is low be could occur"
{
    "inclusion": "Yes",  # Multiple cancer mentions and concerns
    "cancer_type": "Thyroid Cancer",  # Mentions both breast and thyroid
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "discussed_with_physician": "Yes",
    "is_survivor": "Yes"
}
Reasoning: Shows multiple cancer types, clear anxiety language indicating concern, and proper inclusion for risk discussion. """


                },
                {
                    "role": "user",
                    "content": f"Analyze this post:\n\nPost ID: {post_id}\n\nPost Body: {post_body}"
                }
            ],
            temperature=0.1,
            max_tokens=512,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "cancer_post_analysis_limited",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "post_id": {"type": "string"},
                            "inclusion": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Include if ANY cancer mention (personal/family/risk/research/screening/side effects). Only exclude if cancer is metaphorical or absent."
                            },
                            "exclusion_reason": {
                                "type": "string",
                                "description": "If excluded, specify: 'No cancer mention', 'Cancer used metaphorically', or detailed reason"
                            },
                            "is_survivor": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: past cancer treatment, remission, recovery, post-cancer experience. No if: current diagnosis, risk discussion, family history"
                            },
                            "is_survivor_and_taking_med": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: confirmed cancer survivor AND currently taking GLP1 medication. Must have evidence of both survival and current medication use. No if: missing either criterion"
                                },
                            "family_cancer_history": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions ANY family member (immediate or extended) having cancer currently or in past. No if: no family cancer mentions"
                            },
                            "cancer_type": {
                                "type": "string",
                                "enum": ["Thyroid Cancer", "Breast Cancer", "Pancreatic Cancer", "Gyn cancer", "Other", "No Type Mentioned"],
                                "description": "Select first specific cancer type mentioned. Use 'Other' only for clearly specified non-main types, only use No Type Mentioned if no type can be inferred"
                            },
                            "other_cancer_type": {
                                "type": "string",
                                "description": "If 'Other' selected above, specify the exact cancer type mentioned"
                            },
                            "is_survivor_weight_loss": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: weight changes after cancer treatment/survival mentioned. No if: general weight discussion or unclear timing"
                            },
                            "cancer_diagnosis_after_medication": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: clear temporal sequence shows cancer diagnosis after starting GLP1. No if: unclear timing or just risk discussion"
                            },
                             "mentions_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: any mention of cancer risk or increased risk, includes family history concerns. Synonyms of cancer risk"
                             },
                            "concerned_about_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: expresses worry/fear about cancer risk, includes family history concerns. No if: purely informational"
                            },
                            "seeking_cancer_risk_data": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: actively asking for studies/research/experiences about cancer risk. No if: sharing info or general discussion"
                            },
                            "can_assess_misinformation": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: makes generalizable claims about medication/cancer. No if: personal experience only or questions"
                            },
                            "sentiment_score": {
                                "type": "integer",
                                "enum": [-1, 0, 1],
                                "description": "-1: fear/worry/grief, 0: factual/balanced, 1: hope/success/gratitude"
                            },
                            "discussed_risk_with_physician": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions discussing cancer risk with ANY healthcare provider or recommending such discussion. No if: no provider discussion mentioned"
                            },
                            "discussion_GLP1_decreasing_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: explicitly mentions GLP1/weight loss reducing cancer risk. No if: no mention of risk reduction"
                            }
                        },
                        "required": [
                            "post_id",
                            "inclusion",
                            "exclusion_reason",
                            "is_survivor",
                            "is_survivor_and_taking_med",
                            "family_cancer_history",
                            "cancer_type",
                            "other_cancer_type",
                            "is_survivor_weight_loss",
                            "cancer_diagnosis_after_medication",
                            "mentions_cancer_risk",
                            "concerned_about_cancer_risk",
                            "seeking_cancer_risk_data",
                            "can_assess_misinformation",
                            "sentiment_score",
                            "discussed_risk_with_physician",
                            "discussion_GLP1_decreasing_cancer_risk"
                        ],
                        "additionalProperties": False
                    }
                }
            }
        )

        print(f"Raw API response: {response.choices[0].message.content}")
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        print(f"Error processing post {post_id}: {str(e)}")
        return None

def process_posts(batch_size=10):
    """Run `analyze_post` over every prepared post and persist the results.

    Reads `prepared_dataset.json` (a list of dicts with `post_id` and `body`
    keys), analyses the posts in consecutive batches, and rewrites
    `analysis_results.json` after every batch so partial progress survives an
    interrupted run. The results file is always written, even when the
    dataset is empty.

    Args:
        batch_size: Number of posts analysed between checkpoint saves.
            Must be positive.

    Returns:
        list[dict]: One analysis dict per successfully analysed post, each
        extended with an `original_post` key holding the post body. Posts for
        which `analyze_post` returned a falsy value (it returns None on
        error) are left out.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    # A negative step would make range() yield nothing and the run would
    # "succeed" without analysing a single post, so reject it up front.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Load the JSON data
    with open('prepared_dataset.json', 'r') as file:
        data = json.load(file)

    results = []
    failed_post_ids = []
    total_batches = (len(data) + batch_size - 1) // batch_size

    def save_results():
        # Overwrite the checkpoint with everything collected so far.
        with open('analysis_results.json', 'w') as out_file:
            json.dump(results, out_file, indent=2)

    for batch_number, start in enumerate(range(0, len(data), batch_size), start=1):
        print(f"\nProcessing batch {batch_number} of {total_batches}")

        for post in data[start:start + batch_size]:
            analysis = analyze_post(post['post_id'], post['body'])
            if analysis:
                analysis['original_post'] = post['body']
                results.append(analysis)
            else:
                # Keep track of what was dropped so it can be retried.
                failed_post_ids.append(post['post_id'])

        # Save intermediate results after each batch
        save_results()

    # Final write guarantees the file exists even for an empty dataset.
    save_results()

    print(f"\nAnalysis complete. Processed {len(results)} out of {len(data)} posts")
    if failed_post_ids:
        print(f"Posts that could not be analyzed: {failed_post_ids}")
    print("Results saved to analysis_results.json")
    return results

# Run the full analysis when this cell/script is executed directly;
# the call is skipped if the code is imported as a module.
if __name__ == "__main__":
    process_posts()

import pandas as pd
import json
import numpy as np

# Merge step: attach the human-adjudicated CSV columns to each LLM analysis
# record by matching on the normalised (lower-cased, stripped) post text.

# File paths
csv_file_path = 'Adjudicated_100_dataset_clean.csv'
json_file_path = 'analysis_results.json'
merged_json_path = 'merged_analysis_results.json'

# Load the CSV file, replacing NaN with empty strings
csv_data = pd.read_csv(csv_file_path).fillna("")

# Load the JSON file
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Add .llm suffix to keys except 'original_post' and 'post_id'
modified_json_data = [
    {
        (key if key in ('original_post', 'post_id') else f"{key}.llm"): value
        for key, value in post.items()
    }
    for post in json_data
]

# Standardize text for matching. The normalised text lives in a temporary
# copy so csv_data itself is left without any helper column.
keyed_csv = csv_data.assign(body_cleaned=csv_data['body'].str.lower().str.strip())

# to_dict(orient='index') raises on a non-unique index, so repeated post
# bodies are collapsed to their first occurrence (with a warning).
duplicate_mask = keyed_csv.duplicated('body_cleaned', keep='first')
if duplicate_mask.any():
    print(
        f"Warning: {int(duplicate_mask.sum())} CSV rows repeat an earlier post body; "
        "keeping the first occurrence of each"
    )

# Create a dictionary from the CSV for quick lookup
csv_lookup = (
    keyed_csv[~duplicate_mask]
    .set_index('body_cleaned')
    .to_dict(orient='index')
)

# Merge CSV data into JSON based on matching text
unmatched_post_ids = []
for post in modified_json_data:
    match_data = csv_lookup.get(post['original_post'].lower().strip())
    if match_data is None:
        # Without a match the record carries no human columns downstream.
        unmatched_post_ids.append(post.get('post_id'))
    else:
        post.update(match_data)

if unmatched_post_ids:
    print(
        f"Warning: {len(unmatched_post_ids)} posts had no matching CSV row "
        f"(post_id: {unmatched_post_ids})"
    )

# Save the merged data
with open(merged_json_path, 'w') as file:
    json.dump(modified_json_data, file, indent=2)

print(f"Merged data saved to {merged_json_path}")

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

def calculate_metrics(y_true, y_pred):
    """Calculate metrics for categorical variables.

    Args:
        y_true: Reference (human) labels.
        y_pred: Predicted (LLM) labels, aligned with `y_true`.

    Returns:
        tuple: `(accuracy, precision, recall, f1)`. Precision, recall and F1
        are computed with `average='weighted'` and `zero_division=0`. If any
        scorer raises an ordinary exception, a warning is printed and
        `(None, None, None, None)` is returned so one bad variable does not
        abort the whole evaluation.
    """
    try:
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1 = (
            scorer(y_true, y_pred, average='weighted', zero_division=0)
            for scorer in (precision_score, recall_score, f1_score)
        )
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate, and say why the
        # metrics are missing instead of failing silently.
        print(f"Warning: could not compute metrics: {exc}")
        return None, None, None, None
    return accuracy, precision, recall, f1

# Evaluation step: compare every `<variable>.llm` column with its
# `<variable>.human` counterpart and report agreement metrics and mismatches.

# Load the JSON data
with open('merged_analysis_results.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Accumulators: per-variable metrics, row-level disagreements, and counts
metrics_results = {}
mismatches = []
mismatch_counts = {}


def _score_variable(frame, variable, llm_col, human_col):
    """Score one variable on `frame` and record its metrics and mismatches.

    Only rows that are non-NaN in both columns are scored (blank strings
    still count as values). Results go into the module-level accumulators
    `metrics_results`, `mismatch_counts` and `mismatches`.

    Returns:
        A boolean Series over `frame` marking the scored rows where the LLM
        and human values differ.
    """
    valid_mask = frame[llm_col].notna() & frame[human_col].notna()
    accuracy, precision, recall, f1 = calculate_metrics(
        frame.loc[valid_mask, human_col],
        frame.loc[valid_mask, llm_col],
    )
    metrics_results[variable] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }

    mismatch_mask = (frame[llm_col] != frame[human_col]) & valid_mask
    mismatch_counts[variable] = int(mismatch_mask.sum())
    for _, row in frame[mismatch_mask].iterrows():
        mismatches.append({
            'post_id': row['post_id'],
            'variable': variable,
            'llm_value': row[llm_col],
            'human_value': row[human_col],
            'original_text': row['original_post']
        })
    return mismatch_mask


# Get all pairs of corresponding columns (llm and human)
llm_suffix = '.llm'
llm_cols = [col for col in df.columns if col.endswith(llm_suffix)]
human_cols = [col[:-len(llm_suffix)] + '.human' for col in llm_cols]

# Everything below depends on the inclusion labels, so fail with a clear message
for required_col in ('inclusion.llm', 'inclusion.human'):
    if required_col not in df.columns:
        raise KeyError(
            f"Column '{required_col}' not found in merged_analysis_results.json; "
            "check that the merge step matched the posts to the CSV"
        )

# Posts without a human label (no CSV match) cannot be scored; say so
unlabelled_count = int(df['inclusion.human'].isna().sum())
if unlabelled_count:
    print(f"Note: {unlabelled_count} posts have no human inclusion label and are not scored")

# First calculate metrics and mismatches for inclusion, on all labelled posts
inclusion_mismatches = df[_score_variable(df, 'inclusion', 'inclusion.llm', 'inclusion.human')]
inclusion_metrics = tuple(metrics_results['inclusion'].values())

# Filter for posts where inclusion.human is "Yes"
included_df = df[df['inclusion.human'] == 'Yes']

# Calculate metrics for other variables only on included posts
for llm_col, human_col in zip(llm_cols, human_cols):
    if human_col not in df.columns or 'inclusion' in llm_col:
        continue
    # Skip sentiment score as it's numerical
    if 'sentiment_score' in llm_col:
        continue
    _score_variable(included_df, llm_col[:-len(llm_suffix)], llm_col, human_col)

# Create results DataFrame
metrics_df = pd.DataFrame(metrics_results).T
metrics_df['Mismatches'] = pd.Series(mismatch_counts)
metrics_df = metrics_df.round(3)

# Create mismatches DataFrame
mismatches_df = pd.DataFrame(mismatches)

# Save results
metrics_df.to_csv('metrics_summary.csv')
mismatches_df.to_csv('mismatches.csv', index=False)

# Print summary
print("\nMetrics Summary:")
print(metrics_df)

print("\nMismatch Counts per Variable:")
for var, count in mismatch_counts.items():
    print(f"{var}: {count} mismatches")

print("\nTotal number of mismatches:", len(mismatches))
print("Included posts:", len(included_df))
print("Total posts:", len(df))


Processing batch 1 of 10
Analyzing post_id: 0
body (first 100 chars): My husband had papillary thyroid cancer and he takes Wegovy....
Raw API response: {"post_id":"0","inclusion":"Yes","exclusion_reason":"","is_survivor":"Yes","is_survivor_and_taking_med":"Yes","family_cancer_history":"No","cancer_type":"Thyroid Cancer","other_cancer_type":"","is_survivor_weight_loss":"No","cancer_diagnosis_after_medication":"No","mentions_cancer_risk":"No","concerned_about_cancer_risk":"No","seeking_cancer_risk_data":"No","can_assess_misinformation":"No","sentiment_score":0,"discussed_risk_with_physician":"No","discussion_GLP1_decreasing_cancer_risk":"No"}
Analyzing post_id: 1
body (first 100 chars): How many drugs have noted dangerous side effects. From giving you cancer to heart attacks. I am pret...
Raw API response: {"post_id":"1","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"No","cancer_type":"No Type Mentioned","other_cance

# Few-shot prompting

In [None]:
import json
from openai import OpenAI
from tqdm.notebook import tqdm

def analyze_post(post_id, post_body):
    print(f"Analyzing post_id: {post_id}")
    print(f"body (first 100 chars): {post_body[:100]}...")

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
               {
                    "role": "system",
                    "content": """You are analyzing Reddit posts about GLP1 medications and cancer discussions. Follow these detailed guidelines for each variable:


# EXAMPLE ANALYSES TO GUIDE YOUR EVALUATION

        Example Post 1: "I read somewhere that Ozempic definitely causes thyroid cancer in everyone who takes it. Don't let big pharma fool you."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "Yes",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: Makes absolute claims without evidence about cancer causation. Valid cancer mention but spreads misinformation. Not expressing personal concern.

        Example Post 2: "My doctors told me the thyroid cancer risk was only seen in rats and based on careful discussion of my personal medical history, we decided the benefits outweighed the potential risks."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "No",
            "discussed_risk_with_physician": "Yes"
        }
        Reasoning: Personal medical discussion with evidence of physician consultation about cancer risks. Shows consideration of personal risk without making general claims.

        Example Post 3: "Started Mounjaro and lost 50 pounds! I feel like I'm beating cancer before it starts."
        {
            "inclusion": "No",
            "exclusion_reason": "Cancer used metaphorically",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "No"
        }
        Reasoning: Cancer mention is purely metaphorical, not discussing actual cancer risk or experience. Personal experience only.

        Example Post 4: "Just sharing research findings: our hospital database shows zero cancer cases from GLP1s, proving they're completely safe."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "Yes",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: Makes broad safety claims and generalizations beyond the data. Not expressing personal concern but making definitive claims about safety.

        Example Post 5: "My aunt got cancer six months after starting Ozempic. Anyone else experience this? I'm terrified to start."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "family_cancer_history": "Yes",
            "cancer_diagnosis_after_medication": "Yes"
        }
        Reasoning: Clear temporal relationship between medication and cancer diagnosis in family member. Shows personal concern and seeking information.

        Example Post 6: "Looking at cancer rates in the general population vs GLP1 users according to latest studies [link]"
        {
            "inclusion": "Yes",
            "exclusion_reason": "General cancer discussion without specific connection to medication effects",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: General discussion of statistical information without personal context or claims. Sharing research without assessment.

        Example Post 1: "I filled my prescription but I'm kind of freaked out about the pancreatitis/gall bladder, and thyroid cancer warnings. Are these things I would need to worry about for the rest of my life?!"
        {
            "cancer_type": "Thyroid Cancer",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "sentiment_score": -1
        }
        Reasoning: Explicitly mentions thyroid cancer warning and shows anxiety about long-term risks. Asking for information about duration of risk.

        Example Post 2: "Man you look awesome. Congrats to losing all that weight and beating cancer"
        {
            "is_survivor": "Yes",
            "cancer_type": "No Type Mentioned",
            "sentiment_score": 1,
            "is_survivor_weight_loss": "Yes"
        }
        Reasoning: Clear reference to having beaten cancer (past tense), with positive sentiment about weight loss post-cancer.

        Example Post 3: "I've had consistent diarrhea for years after having colon cancer and a couple of bowel surgeries... Anyway, the problem with Ozempic diarrhea, for me, is the surprise factor."
        {
            "is_survivor": "Yes",
            "is_survivor_and_taking_med": "Yes",
            "cancer_type": "Other",
            "other_cancer_type": "GI cancer",
            "sentiment_score": 0
        }
        Reasoning: Clearly indicates past colon cancer and current Ozempic use, providing detailed personal experience.

        Example Post 4: "Breast cancer risk is common, unfortunately. But even here, being obese increases your risk more than the pill... I try not to stress about the ones I have no control over and focus on what I realistically can impact."
        {
            "cancer_type": "Breast Cancer",
            "is_survivor_weight_loss": "Yes",
            "concerned_about_cancer_risk": "No",
            "can_assess_misinformation": "No",
            "sentiment_score": 0
        }
        Reasoning: Educational discussion about risk factors, not expressing personal concern but sharing factual information about risk management.

        Example Post 5: "And for those people who told you they got cancer 'shortly after' starting Ozempic, what does 'shortly after' means?"
        {
            "cancer_diagnosis_after_medication": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "can_assess_misinformation": "No",
            "sentiment_score": 0
        }
        Reasoning: Discussing temporal relationship between medication and cancer diagnosis, seeking clarification about timing.

        Example Post 6: "That's wonderful, my hystersister! I had a full hysterectomy with bilateral saplingo-oopherectomy for my 1A endometrial cancer... your weight loss will actually help lower your estrogen-related cancer risk!"
        {
            "is_survivor": "Yes",
            "cancer_type": "Gyn Cancer",
            "discussion_GLP1_decreasing_cancer_risk": "Yes",
            "sentiment_score": 1,
            "concerned_about_cancer_risk": "No"
        }
        Reasoning: Past gynecological cancer mentioned, discussing how weight loss reduces cancer risk, positive sentiment.

        Example Post 7: "So a father having papillary cancer wouldn't prohibit me from being on a GPL1?"
        {
            "family_cancer_history": "Yes",
            "cancer_type": "Thyroid Cancer",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "sentiment_score": -1
        }
        Reasoning: Mentions family history of papillary thyroid cancer, seeking information about medication safety in this context, showing concern.
        Example Post 1: "GLP1s definitely cause cancer in everyone who takes them, just like they did in the rat studies. This is proven fact."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Makes absolute claims about causation without evidence, extends animal studies to humans inappropriately.

        Example Post 2: "When I took Ozempic, I didn't get cancer. My doctor said the risk was low for me."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Personal experience only, not making generalizable claims. Referencing medical advice.

        Example Post 3: "According to recent studies, thyroid cancer risk from GLP1s is completely nonexistent in humans since we lack the specific receptors found in rats."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Makes definitive claims about biological mechanisms and risk, going beyond available evidence.

        Example Post 4: "Got diagnosed with pancreatic cancer 3 months after starting. Doctor isn't sure if there's any connection."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Pancreatic Cancer"
        }
        Reasoning: Sharing personal experience without making broader claims about causation.

        Example Post 5: "The scientific consensus is clear - these drugs have zero cancer risk and anyone who says otherwise is spreading fear."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Makes sweeping claims about scientific consensus and risk, dismissing valid concerns.

        Example Post 6: "My endocrinologist monitors my thyroid closely because of the medication's warning label, but she said medullary thyroid cancer risk is only in rats."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Reporting medical professional's guidance without making generalizable claims.

        Example Post 7: "These medications actually prevent cancer by reducing obesity. The thyroid warnings are just legal protection."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Makes unsupported claims about cancer prevention and dismisses established warnings.

        Example Post 8: "I read the research papers on breast cancer risk. Still discussing with my doctor whether it applies to my situation."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Breast Cancer"
        }
        Reasoning: Discussing personal risk assessment without making broader claims.

        Example Post 9: "Real talk - colon cancer runs in my family, but my doctor says obesity is a bigger risk factor than GLP1s."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Other",
            "other_cancer_type": "GI cancer"
        }
        Reasoning: Sharing medical guidance and personal context without generalizing.

        Example Post 10: "All studies prove that endometrial cancer risk goes down with weight loss from these meds. It's 100% safe for everyone."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Gyn Cancer"
        }
        Reasoning: Overgeneralizes research findings and makes absolute safety claims.
        """


                },
                {
                    "role": "user",
                    "content": f"Analyze this post:\n\nPost ID: {post_id}\n\nPost Body: {post_body}"
                }
            ],
            temperature=0.1,
            max_tokens=512,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "cancer_post_analysis_limited",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "post_id": {"type": "string"},
                            "inclusion": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Include if ANY cancer mention (personal/family/risk/research/screening/side effects). Only exclude if cancer is metaphorical or absent."
                            },
                            "exclusion_reason": {
                                "type": "string",
                                "description": "If excluded, specify: 'No cancer mention', 'Cancer used metaphorically', or detailed reason"
                            },
                            "is_survivor": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: past cancer treatment, remission, recovery, post-cancer experience. No if: current diagnosis, risk discussion, family history"
                            },
                            "is_survivor_and_taking_med": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: confirmed cancer survivor AND currently taking GLP1 medication. Must have evidence of both survival and current medication use. No if: missing either criterion"
                                },
                            "family_cancer_history": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions ANY family member (immediate or extended) having cancer currently or in past. No if: no family cancer mentions"
                            },
                            "cancer_type": {
                                "type": "string",
                                "enum": ["Thyroid Cancer", "Breast Cancer", "Pancreatic Cancer", "Gyn cancer", "Other", "No Type Mentioned"],
                                "description": "Select first specific cancer type mentioned. Use 'Other' only for clearly specified non-main types"
                            },
                            "other_cancer_type": {
                                "type": "string",
                                "description": "If 'Other' selected above, specify the exact cancer type mentioned"
                            },
                            "is_survivor_weight_loss": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: weight changes after cancer treatment/survival mentioned. No if: general weight discussion or unclear timing"
                            },
                            "cancer_diagnosis_after_medication": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: clear temporal sequence shows cancer diagnosis after starting GLP1. No if: unclear timing or just risk discussion"
                            },
                             "mentions_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: any mention of cancer risk or increased risk, includes family history concerns. Synonyms of cancer risk"
                             },
                            "concerned_about_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: expresses worry/fear about cancer risk, includes family history concerns. No if: purely informational"
                            },
                            "seeking_cancer_risk_data": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: actively asking for studies/research/experiences about cancer risk. No if: sharing info or general discussion"
                            },
                            "can_assess_misinformation": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: makes generalizable claims about medication/cancer. No if: personal experience only or questions"
                            },
                            "sentiment_score": {
                                "type": "integer",
                                "enum": [-1, 0, 1],
                                "description": "-1: fear/worry/grief, 0: factual/balanced, 1: hope/success/gratitude"
                            },
                            "discussed_risk_with_physician": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions discussing cancer risk with ANY healthcare provider or recommending such discussion. No if: no provider discussion mentioned"
                            },
                            "discussion_GLP1_decreasing_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: explicitly mentions GLP1/weight loss reducing cancer risk. No if: no mention of risk reduction"
                            }
                        },
                        "required": [
                            "post_id",
                            "inclusion",
                            "exclusion_reason",
                            "is_survivor",
                            "is_survivor_and_taking_med",
                            "family_cancer_history",
                            "cancer_type",
                            "other_cancer_type",
                            "is_survivor_weight_loss",
                            "cancer_diagnosis_after_medication",
                            "mentions_cancer_risk",
                            "concerned_about_cancer_risk",
                            "seeking_cancer_risk_data",
                            "can_assess_misinformation",
                            "sentiment_score",
                            "discussed_risk_with_physician",
                            "discussion_GLP1_decreasing_cancer_risk"
                        ],
                        "additionalProperties": False
                    }
                }
            }
        )

        print(f"Raw API response: {response.choices[0].message.content}")
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        print(f"Error processing post {post_id}: {str(e)}")
        return None

def process_posts(batch_size=10, input_path='prepared_dataset.json',
                  output_path='analysis_results.json'):
    """Run `analyze_post` over every prepared post and checkpoint the results.

    Posts are read from `input_path` (a JSON list of dicts with 'post_id' and
    'body' keys), analysed one at a time, and the accumulated results are
    rewritten to `output_path` after every batch.

    Args:
        batch_size: Number of posts handled between two checkpoint writes.
            Must be at least 1.
        input_path: JSON file holding the prepared posts.
        output_path: JSON file that receives the accumulated analyses.

    Returns:
        List of analysis dicts, each extended with the post text under
        'original_post', for the posts whose analysis succeeded.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently skip every
    # post and report "Processed 0"; reject it up front instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    # Load the JSON data
    with open(input_path, 'r') as file:
        data = json.load(file)

    results = []
    failed_ids = []
    total_batches = (len(data) + batch_size - 1) // batch_size

    # Process posts batch by batch
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        print(f"\nProcessing batch {i//batch_size + 1} of {total_batches}")

        for post in batch:
            analysis = analyze_post(post['post_id'], post['body'])
            if analysis:
                analysis['original_post'] = post['body']
                results.append(analysis)
            else:
                # analyze_post returns None on error; remember which posts were lost
                failed_ids.append(post['post_id'])

        # Save intermediate results after each batch
        with open(output_path, 'w') as file:
            json.dump(results, file, indent=2)

    print(f"\nAnalysis complete. Processed {len(results)} out of {len(data)} posts")
    if failed_ids:
        print(f"Posts without a usable analysis: {failed_ids}")
    print(f"Results saved to {output_path}")
    return results

# Entry point: run the full analysis with the default batch size when this
# code is executed directly rather than imported.
if __name__ == "__main__":
    process_posts()

import pandas as pd
import json
import numpy as np

# File paths
csv_file_path = 'Adjudicated_100_dataset_clean.csv'
json_file_path = 'analysis_results.json'
merged_json_path = 'merged_analysis_results.json'

# Load the CSV file, replacing NaN with empty strings
csv_data = pd.read_csv(csv_file_path).replace(np.nan, "", regex=True)

# Load the JSON file
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Add .llm suffix to keys except 'original_post' and 'post_id'
modified_json_data = [
    {
        (key if key in ('original_post', 'post_id') else f"{key}.llm"): value
        for key, value in post.items()
    }
    for post in json_data
]

# Standardize text for matching (lower-cased, surrounding whitespace removed)
body_cleaned = csv_data['body'].str.lower().str.strip()

# to_dict(orient='index') refuses a non-unique index, so two CSV rows with the
# same normalized body would abort the merge. Keep the first row of each body.
duplicate_bodies = body_cleaned.duplicated(keep='first')
if duplicate_bodies.any():
    print(f"Warning: {int(duplicate_bodies.sum())} CSV rows repeat an earlier body; keeping the first occurrence of each")

# Create a dictionary from the CSV for quick lookup
csv_lookup = (
    csv_data[~duplicate_bodies]
    .assign(body_cleaned=body_cleaned[~duplicate_bodies])
    .set_index('body_cleaned')
    .to_dict(orient='index')
)

# Merge CSV data into JSON based on matching text
unmatched_post_ids = []
for post in modified_json_data:
    match_data = csv_lookup.get(post['original_post'].lower().strip())
    if match_data:
        post.update(match_data)
    else:
        # No CSV row has this text, so no CSV columns are merged into the post
        unmatched_post_ids.append(post.get('post_id'))

if unmatched_post_ids:
    print(f"Warning: no CSV match for {len(unmatched_post_ids)} posts (post_id: {unmatched_post_ids})")

# Save the merged data
with open(merged_json_path, 'w') as file:
    json.dump(modified_json_data, file, indent=2)

print(f"Merged data saved to {merged_json_path}")

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

def calculate_metrics(y_true, y_pred):
    """Calculate accuracy and weighted precision/recall/F1 for categorical labels.

    Args:
        y_true: Reference labels (the human annotations at the call sites).
        y_pred: Predicted labels (the LLM annotations), aligned with `y_true`.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`. If any score cannot be
        computed the tuple is `(None, None, None, None)`.
    """
    try:
        accuracy = accuracy_score(y_true, y_pred)
        # zero_division=0 scores a class with no predicted/true samples as 0
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
        return accuracy, precision, recall, f1
    except Exception as exc:
        # Stay best-effort so one bad variable does not stop the report, but
        # say why; KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Could not compute metrics: {exc}")
        return None, None, None, None

# Load the merged JSON data (LLM answers in '*.llm' columns, human answers in '*.human')
with open('merged_analysis_results.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Initialize results containers
metrics_results = {}
mismatches = []
mismatch_counts = {}


def _mismatch_records(frame, variable, llm_col, human_col):
    """Build one mismatch dict per row of `frame` for the given variable."""
    return [
        {
            'post_id': row['post_id'],
            'variable': variable,
            'llm_value': row[llm_col],
            'human_value': row[human_col],
            'original_text': row['original_post']
        }
        for _, row in frame.iterrows()
    ]


# Get all pairs of corresponding columns (llm and human)
llm_cols = [col for col in df.columns if col.endswith('.llm')]
human_cols = [col.replace('.llm', '.human') for col in llm_cols]

# First calculate metrics for inclusion, using the same "present in both
# columns" rule as the other variables so that posts lacking a human label
# neither break the metric computation nor count as disagreements.
inclusion_valid = df['inclusion.llm'].notna() & df['inclusion.human'].notna()
skipped_inclusion = int((~inclusion_valid).sum())
if skipped_inclusion:
    print(f"Skipping {skipped_inclusion} posts with a missing inclusion label")

inclusion_metrics = calculate_metrics(
    df.loc[inclusion_valid, 'inclusion.human'],
    df.loc[inclusion_valid, 'inclusion.llm']
)
metrics_results['inclusion'] = {
    'Accuracy': inclusion_metrics[0],
    'Precision': inclusion_metrics[1],
    'Recall': inclusion_metrics[2],
    'F1': inclusion_metrics[3]
}

# Find inclusion mismatches
inclusion_mismatches = df[(df['inclusion.llm'] != df['inclusion.human']) & inclusion_valid]
mismatch_counts['inclusion'] = len(inclusion_mismatches)
mismatches.extend(
    _mismatch_records(inclusion_mismatches, 'inclusion', 'inclusion.llm', 'inclusion.human')
)

# Filter for posts where inclusion.human is "Yes"
included_df = df[df['inclusion.human'] == 'Yes']

# Calculate metrics for other variables only on included posts
for llm_col, human_col in zip(llm_cols, human_cols):
    # Inclusion was handled above; variables without a human column cannot be scored
    if human_col not in df.columns or 'inclusion' in llm_col:
        continue

    # Skip sentiment score as it's numerical
    if 'sentiment_score' in llm_col:
        continue

    # Get valid rows (not NaN in both columns). Empty strings are NOT NaN and
    # therefore still take part in the comparison.
    valid_mask = included_df[llm_col].notna() & included_df[human_col].notna()
    y_true = included_df.loc[valid_mask, human_col]
    y_pred = included_df.loc[valid_mask, llm_col]

    # Calculate metrics
    accuracy, precision, recall, f1 = calculate_metrics(y_true, y_pred)

    # Store results
    base_col = llm_col.replace('.llm', '')
    metrics_results[base_col] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }

    # Find mismatches
    mismatch_mask = (included_df[llm_col] != included_df[human_col]) & valid_mask
    mismatch_df = included_df.loc[mismatch_mask, ['post_id', llm_col, human_col, 'original_post']]

    # Count mismatches for this variable
    mismatch_counts[base_col] = len(mismatch_df)
    mismatches.extend(_mismatch_records(mismatch_df, base_col, llm_col, human_col))

# Create results DataFrame (one row per variable)
metrics_df = pd.DataFrame(metrics_results).T
metrics_df['Mismatches'] = pd.Series(mismatch_counts)
metrics_df = metrics_df.round(3)

# Create mismatches DataFrame
mismatches_df = pd.DataFrame(mismatches)

# Save results
metrics_df.to_csv('metrics_summary.csv')
mismatches_df.to_csv('mismatches.csv', index=False)

# Print summary
print("\nMetrics Summary:")
print(metrics_df)

print("\nMismatch Counts per Variable:")
for var, count in mismatch_counts.items():
    print(f"{var}: {count} mismatches")

print("\nTotal number of mismatches:", len(mismatches))
print("Included posts:", len(included_df))
print("Total posts:", len(df))


Processing batch 1 of 10
Analyzing post_id: 0
body (first 100 chars): My husband had papillary thyroid cancer and he takes Wegovy....
Raw API response: {"post_id":"0","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"Yes","cancer_type":"Thyroid Cancer","other_cancer_type":"","is_survivor_weight_loss":"No","cancer_diagnosis_after_medication":"No","mentions_cancer_risk":"No","concerned_about_cancer_risk":"No","seeking_cancer_risk_data":"No","can_assess_misinformation":"No","sentiment_score":0,"discussed_risk_with_physician":"No","discussion_GLP1_decreasing_cancer_risk":"No"}
Analyzing post_id: 1
body (first 100 chars): How many drugs have noted dangerous side effects. From giving you cancer to heart attacks. I am pret...
Raw API response: {"post_id":"1","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"No","cancer_type":"No Type Mentioned","other_cancer

In [None]:
import json
from openai import OpenAI
from tqdm.notebook import tqdm

def analyze_post(post_id, post_body):
    print(f"Analyzing post_id: {post_id}")
    print(f"body (first 100 chars): {post_body[:100]}...")

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
               {
                    "role": "system",
                    "content": """You are analyzing Reddit posts about GLP1 medications and cancer discussions. Follow these detailed guidelines for each variable:

INCLUSION CRITERIA:
- Include ANY mention of cancer (personal, family, risk, research, screening, side effects)
- Include even general cancer discussions or animal studies
- Only exclude if cancer is purely metaphorical or completely absent
- Default to inclusion if uncertain
- If a post is a link, exclude it
Example YES: "There is a risk of cancer, be aware", "everything around us increases our risk for cancer, talk to a physician", "My doctor told me ozempic does not cause cancer"
Example NO: "Congrats, cancer queen", "Best wishes for getting treating for cancer"

SURVIVOR IDENTIFICATION:
- Look for past tense cancer references
- Consider treatment completion indicators
- Check for remission/recovery mentions
- Default to NO unless clear evidence
- If the post hints at a surgery that happened secondary to cancer, include it for example "thyroidectomy due to cancer"
- If the post hints at a cancer history include it for example "Cancer, Diabetes here and im trying to lose weight"
Example YES: "after my cancer treatment", "been cancer-free"
Example NO: "worried about cancer risk", "family history"

SURVIVOR MEDICATION STATUS:
- Check both survivor status AND current medication use
- Must confirm both survival and active medication use
- Consider indirect mentions of current medication use
- Default to NO if either criterion unclear
Example YES: "I'm a cancer survivor now taking Wegovy", "After beating cancer, I started Ozempic", "taking it after going through cancer treatment", "started the meds after cancer", "they put me on it after the weight gain due to my treatment for cancer"
Example NO: "cancer survivor but haven't started yet", "taking meds but no cancer history",

FAMILY CANCER HISTORY:
- Look for mentions of cancer in family members
- Consider all types of family references (parents, siblings, aunts/uncles, etc.)
- Include both current and past family cancer cases
- Default to NO if unclear
Example YES: "cancer runs in my family", "my mom had cancer", "sister's breast cancer"
Example NO: "general cancer discussion", "friend had cancer" "spouse has cancer"

CANCER TYPE CATEGORIZATION:
- Choose first specific cancer mentioned
- Map clinical terms to main categories
- Only use "Other" for clearly specified non-main types
- Use "No Type Mentioned" for general discussions
- If risk mentioned that is associated with a cancer then select the cancer type, Example: "risk of breast cancer" -> "Breast Cancer"
Example mapping examples:
- Thyroid: thyroid carcinoma, thyroid tumors, "could lead to thyroid cancer", medullary thyroid, papillary thyroid
- Breast: mammary carcinoma
- Gyn: endometrial, ovarian, cervical
- Other: Blood cancer, anal cancer

OTHER CANCER TYPE:
- for if cancer_type is other then map this to a disease site instead of the actual histology
Example: "Osteosarcoma -> bone cancer", "leukemia -> blood cancer", "colon cancer -> GI cancer"

SURVIVOR WEIGHT PATTERNS:
- Look for temporal relationship (cancer then weight change)
- If trying to lose weight after a cancer diagnosis, include it
- If they had cancer in the past and now losing weight, include it
- Consider treatment-related changes
- Default to NO if timing unclear
Example YES: "losing treatment weight", "now trying to lose all this weight after cancer treatment"
Example NO: "general weight issues", "family member's weight"

POST-MEDICATION CANCER:
- Require clear temporal sequence
- Look for specific timeframes
- Consider diagnostic language
- Default to NO if timing unclear
Example YES: "diagnosed 6 months after starting", "developed while on"
Example NO: "had cancer before", "worried about risk"

MENTIONS OF CANCER RISK:
- Include if post has any mention of cancer risk with GLP1 meds
- Look for words synonmous with risk relating to cancer or malignancy
- Include synonyms for risk (chance, possibility, likelihood, probability)
- Include monitoring for potential development
- Include of discussion is not in humans like rats
- Default to NO if unclear
Examples YES: "risk of developing cancer", "chance of malignancy"

RISK CONCERNS:
- Include both direct and indirect worry
- Consider family history context
- Look for anxiety/fear language
- Default to NO if purely informational or not seeking information or communicating risk of cancer
- If asking for cancer history being excluding then they are concerned
Example YES: "scared of cancer risk", "worried about family history"
Example NO: "sharing research", "general discussion", "Cancer from ozempic is unproven"

SEEKING INFORMATION RELATED TO CANCER RISKS:
- Include if the post mentions more information related to cancer risk
- Example Yes: "Does anyone know if the risk is significant?", "I was told the only risk is in rats"
- Example No: "sharing research", "general discussion", "Cancer from ozempic is unproven"

SENTIMENT SCORING:
-1: Fear, worry, anger, grief
 0: Facts, balanced views, mixed feelings
+1: Hope, success, gratitude, support
Example -1: "terrified of cancer risk"
Example 0: "discussing research findings"
Example +1: "grateful for successful treatment"

PHYSICIAN DISCUSSION:
- Include any discusion with healthcare provider related to cancer risk and treatment
- Consider both direct and recommended discussions
- Look for clinical assessment mentions
- Default to NO if unclear, or just mentions talking to a physician without cancer risk discussion (can be implied too)
- If the post is recommending someone speaks with their physician, include it
Example YES: "doctor discussed risks", "oncologist advised"
Example NO: "should ask doctor", "general medical info"

DISCUSSION OF DECREASED RISK:
- If a post hints that due to glp1 medication there is a DECREASED risk of cancer, include it
- If a post hints that glp1 decreases weight, weight decreases cancer risk, include it
- If a post hints that glp1 increases cancer risk, exclude it
- If no clear logical link between Decrease of cancer risk and the glp1 medication or weight loss, exclude it
- If mentions decreased recurrence rate, include it


When uncertain between options, use these defaults:
- Inclusion: Include if any cancer mention
- Survivor Status: No if unclear
- Survivor Medication Status: No if unclear
- Family Cancer History: No if unclear
- Family History: No if unclear
- Cancer Type: No Type Mentioned if unclear
- Weight Changes: No if timing unclear
- Post-Med Cancer: No if sequence unclear
- Risk Concerns: No if not explicit
- Sentiment: 0 if mixed/unclear
- Physician Discussion: No if not explicit

# EXAMPLE AND EDGE CASES TO GUIDE YOUR EVALUATION

        Example Post 1: "I read somewhere that Ozempic definitely causes thyroid cancer in everyone who takes it. Don't let big pharma fool you."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "Yes",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: Makes absolute claims without evidence about cancer causation. Valid cancer mention but spreads misinformation. Not expressing personal concern.

        Example Post 2: "My doctors told me the thyroid cancer risk was only seen in rats and based on careful discussion of my personal medical history, we decided the benefits outweighed the potential risks."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "No",
            "discussed_risk_with_physician": "Yes"
        }
        Reasoning: Personal medical discussion with evidence of physician consultation about cancer risks. Shows consideration of personal risk without making general claims.

        Example Post 3: "Started Mounjaro and lost 50 pounds! I feel like I'm beating cancer before it starts."
        {
            "inclusion": "No",
            "exclusion_reason": "Cancer used metaphorically",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "No"
        }
        Reasoning: Cancer mention is purely metaphorical, not discussing actual cancer risk or experience. Personal experience only.

        Example Post 4: "Just sharing research findings: our hospital database shows zero cancer cases from GLP1s, proving they're completely safe."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "Yes",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: Makes broad safety claims and generalizations beyond the data. Not expressing personal concern but making definitive claims about safety.

        Example Post 5: "My aunt got cancer six months after starting Ozempic. Anyone else experience this? I'm terrified to start."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "family_cancer_history": "Yes",
            "cancer_diagnosis_after_medication": "Yes"
        }
        Reasoning: Clear temporal relationship between medication and cancer diagnosis in family member. Shows personal concern and seeking information.

        Example Post 6: "Looking at cancer rates in the general population vs GLP1 users according to latest studies [link]"
        {
            "inclusion": "Yes",
            "exclusion_reason": "General cancer discussion without specific connection to medication effects",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: General discussion of statistical information without personal context or claims. Sharing research without assessment.

        Example Post 7: "I filled my prescription but I'm kind of freaked out about the pancreatitis/gall bladder, and thyroid cancer warnings. Are these things I would need to worry about for the rest of my life?!"
        {
            "cancer_type": "Thyroid Cancer",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "sentiment_score": -1
        }
        Reasoning: Explicitly mentions thyroid cancer warning and shows anxiety about long-term risks. Asking for information about duration of risk.

        Example Post 8: "Man you look awesome. Congrats to losing all that weight and beating cancer"
        {
            "is_survivor": "Yes",
            "cancer_type": "No Type Mentioned",
            "sentiment_score": 1,
            "is_survivor_weight_loss": "Yes"
        }
        Reasoning: Clear reference to having beaten cancer (past tense), with positive sentiment about weight loss post-cancer.

        Example Post 9: "I've had consistent diarrhea for years after having colon cancer and a couple of bowel surgeries... Anyway, the problem with Ozempic diarrhea, for me, is the surprise factor."
        {
            "is_survivor": "Yes",
            "is_survivor_and_taking_med": "Yes",
            "cancer_type": "Other",
            "other_cancer_type": "GI cancer",
            "sentiment_score": 0
        }
        Reasoning: Clearly indicates past colon cancer and current Ozempic use, providing detailed personal experience.

        Example Post 10: "Breast cancer risk is common, unfortunately. But even here, being obese increases your risk more than the pill... I try not to stress about the ones I have no control over and focus on what I realistically can impact."
        {
            "cancer_type": "Breast Cancer",
            "is_survivor_weight_loss": "Yes",
            "concerned_about_cancer_risk": "No",
            "can_assess_misinformation": "No",
            "sentiment_score": 0
        }
        Reasoning: Educational discussion about risk factors, not expressing personal concern but sharing factual information about risk management.

        Example Post 11: "And for those people who told you they got cancer 'shortly after' starting Ozempic, what does 'shortly after' means?"
        {
            "cancer_diagnosis_after_medication": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "can_assess_misinformation": "No",
            "sentiment_score": 0
        }
        Reasoning: Discussing temporal relationship between medication and cancer diagnosis, seeking clarification about timing.

        Example Post 12: "That's wonderful, my hystersister! I had a full hysterectomy with bilateral saplingo-oopherectomy for my 1A endometrial cancer... your weight loss will actually help lower your estrogen-related cancer risk!"
        {
            "is_survivor": "Yes",
            "cancer_type": "Gyn Cancer",
            "discussion_GLP1_decreasing_cancer_risk": "Yes",
            "sentiment_score": 1,
            "concerned_about_cancer_risk": "No"
        }
        Reasoning: Past gynecological cancer mentioned, discussing how weight loss reduces cancer risk, positive sentiment.

        Example Post 13: "So a father having papillary cancer wouldn't prohibit me from being on a GPL1?"
        {
            "family_cancer_history": "Yes",
            "cancer_type": "Thyroid Cancer",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "sentiment_score": -1
        }
        Reasoning: Mentions family history of papillary thyroid cancer, seeking information about medication safety in this context, showing concern.
        Example Post 14: "GLP1s definitely cause cancer in everyone who takes them, just like they did in the rat studies. This is proven fact."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Makes absolute claims about causation without evidence, extends animal studies to humans inappropriately.

        Example Post 15: "When I took Ozempic, I didn't get cancer. My doctor said the risk was low for me."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Personal experience only, not making generalizable claims. Referencing medical advice.

        Example Post 16: "According to recent studies, thyroid cancer risk from GLP1s is completely nonexistent in humans since we lack the specific receptors found in rats."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Makes definitive claims about biological mechanisms and risk, going beyond available evidence.

        Example Post 17: "Got diagnosed with pancreatic cancer 3 months after starting. Doctor isn't sure if there's any connection."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Pancreatic Cancer"
        }
        Reasoning: Sharing personal experience without making broader claims about causation.

        Example Post 18: "The scientific consensus is clear - these drugs have zero cancer risk and anyone who says otherwise is spreading fear."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Makes sweeping claims about scientific consensus and risk, dismissing valid concerns.

        Example Post 19: "My endocrinologist monitors my thyroid closely because of the medication's warning label, but she said medullary thyroid cancer risk is only in rats."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Reporting medical professional's guidance without making generalizable claims.

        Example Post 20: "These medications actually prevent cancer by reducing obesity. The thyroid warnings are just legal protection."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Makes unsupported claims about cancer prevention and dismisses established warnings.

        Example Post 21: "I read the research papers on breast cancer risk. Still discussing with my doctor whether it applies to my situation."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Breast Cancer"
        }
        Reasoning: Discussing personal risk assessment without making broader claims.

        Example Post 22: "Real talk - colon cancer runs in my family, but my doctor says obesity is a bigger risk factor than GLP1s."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Other",
            "other_cancer_type": "GI cancer"
        }
        Reasoning: Sharing medical guidance and personal context without generalizing.

        Example Post 23: "All studies prove that endometrial cancer risk goes down with weight loss from these meds. It's 100% safe for everyone."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Gyn Cancer"
        }
        Reasoning: Overgeneralizes research findings and makes absolute safety claims.
        """


                },
                {
                    "role": "user",
                    "content": f"Analyze this post:\n\nPost ID: {post_id}\n\nPost Body: {post_body}"
                }
            ],
            temperature=0.0,
            max_tokens=256,
            top_p=0.1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "cancer_post_analysis_limited",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "post_id": {"type": "string"},
                            "inclusion": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Include if ANY cancer mention (personal/family/risk/research/screening/side effects). Only exclude if cancer is metaphorical or absent."
                            },
                            "exclusion_reason": {
                                "type": "string",
                                "description": "If excluded, specify: 'No cancer mention', 'Cancer used metaphorically', or detailed reason"
                            },
                            "is_survivor": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: past cancer treatment, remission, recovery, post-cancer experience. No if: current diagnosis, risk discussion, family history"
                            },
                            "is_survivor_and_taking_med": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: confirmed cancer survivor AND currently taking GLP1 medication. Must have evidence of both survival and current medication use. No if: missing either criterion"
                                },
                            "family_cancer_history": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions ANY family member (immediate or extended) having cancer currently or in past. No if: no family cancer mentions"
                            },
                            "cancer_type": {
                                "type": "string",
                                "enum": ["Thyroid Cancer", "Breast Cancer", "Pancreatic Cancer", "Gyn cancer", "Other", "No Type Mentioned"],
                                "description": "Select first specific cancer type mentioned. Use 'Other' only for clearly specified non-main types"
                            },
                            "other_cancer_type": {
                                "type": "string",
                                "description": "If 'Other' selected above, specify the exact cancer type mentioned"
                            },
                            "is_survivor_weight_loss": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: weight changes after cancer treatment/survival mentioned. No if: general weight discussion or unclear timing"
                            },
                            "cancer_diagnosis_after_medication": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: clear temporal sequence shows cancer diagnosis after starting GLP1. No if: unclear timing or just risk discussion"
                            },
                             "mentions_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: any mention of cancer risk or increased risk, includes family history concerns. Synonyms of cancer risk"
                             },
                            "concerned_about_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: expresses worry/fear about cancer risk, includes family history concerns. No if: purely informational"
                            },
                            "seeking_cancer_risk_data": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: actively asking for studies/research/experiences about cancer risk. No if: sharing info or general discussion"
                            },
                            "can_assess_misinformation": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: makes generalizable claims about medication/cancer. No if: personal experience only or questions"
                            },
                            "sentiment_score": {
                                "type": "integer",
                                "enum": [-1, 0, 1],
                                "description": "-1: fear/worry/grief, 0: factual/balanced, 1: hope/success/gratitude"
                            },
                            "discussed_risk_with_physician": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions discussing cancer risk with ANY healthcare provider or recommending such discussion. No if: no provider discussion mentioned"
                            },
                            "discussion_GLP1_decreasing_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: explicitly mentions GLP1/weight loss reducing cancer risk. No if: no mention of risk reduction"
                            }
                        },
                        "required": [
                            "post_id",
                            "inclusion",
                            "exclusion_reason",
                            "is_survivor",
                            "is_survivor_and_taking_med",
                            "family_cancer_history",
                            "cancer_type",
                            "other_cancer_type",
                            "is_survivor_weight_loss",
                            "cancer_diagnosis_after_medication",
                            "mentions_cancer_risk",
                            "concerned_about_cancer_risk",
                            "seeking_cancer_risk_data",
                            "can_assess_misinformation",
                            "sentiment_score",
                            "discussed_risk_with_physician",
                            "discussion_GLP1_decreasing_cancer_risk"
                        ],
                        "additionalProperties": False
                    }
                }
            }
        )

        print(f"Raw API response: {response.choices[0].message.content}")
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        print(f"Error processing post {post_id}: {str(e)}")
        return None

def process_posts(batch_size=10):
    """Analyse every prepared post with `analyze_post` and persist the results.

    Reads ``prepared_dataset.json`` (a list of dicts with ``'post_id'`` and
    ``'body'`` keys), walks through it in consecutive batches, and rewrites
    ``analysis_results.json`` with everything collected so far after each
    batch, so an interrupted run keeps the batches it already finished.

    Args:
        batch_size: Number of posts handled between two checkpoints of the
            results file. Must be at least 1.

    Returns:
        list[dict]: One analysis dict per post for which `analyze_post`
        returned a truthy value, each extended with an ``'original_post'``
        key holding the post body.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently skip every post.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Load the JSON data
    with open('prepared_dataset.json', 'r') as file:
        data = json.load(file)

    results = []
    failed_post_ids = []
    total_batches = (len(data) + batch_size - 1) // batch_size

    def _save_checkpoint():
        # Overwrite the results file with everything gathered so far.
        with open('analysis_results.json', 'w') as file:
            json.dump(results, file, indent=2)

    # Process posts batch by batch, checkpointing after each batch
    for batch_number, start in enumerate(range(0, len(data), batch_size), start=1):
        print(f"\nProcessing batch {batch_number} of {total_batches}")

        for post in data[start:start + batch_size]:
            analysis = analyze_post(post['post_id'], post['body'])
            if not analysis:
                # analyze_post already printed the error; remember the id for the summary.
                failed_post_ids.append(post['post_id'])
                continue
            analysis['original_post'] = post['body']
            results.append(analysis)

        _save_checkpoint()

    # With an empty dataset the loop never runs; still write the file so a
    # stale analysis_results.json from an earlier run cannot be picked up.
    if not data:
        _save_checkpoint()

    print(f"\nAnalysis complete. Processed {len(results)} out of {len(data)} posts")
    if failed_post_ids:
        print(f"Posts without a usable analysis: {failed_post_ids}")
    print("Results saved to analysis_results.json")
    return results

# Entry point: run the full batch analysis with the default batch size when
# this code is executed directly rather than imported as a module.
if __name__ == "__main__":
    process_posts()

import pandas as pd
import json
import numpy as np

# Merge the LLM analysis results with the human-annotated CSV by matching on
# the normalized (lower-cased, stripped) post text.

# File paths
csv_file_path = 'Adjudicated_100_dataset_clean.csv'
json_file_path = 'analysis_results.json'
merged_json_path = 'merged_analysis_results.json'

# Load the CSV file, replacing NaN with empty strings
csv_data = pd.read_csv(csv_file_path).replace(np.nan, "", regex=True)

# Load the JSON file
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Add .llm suffix to keys except 'original_post' and 'post_id', so the LLM
# fields stay distinguishable from the CSV columns merged in below.
passthrough_keys = ('original_post', 'post_id')
modified_json_data = [
    {
        (key if key in passthrough_keys else f"{key}.llm"): value
        for key, value in post.items()
    }
    for post in json_data
]

# Standardize text for matching
csv_data['body_cleaned'] = csv_data['body'].str.lower().str.strip()

# to_dict(orient='index') raises ValueError on a non-unique index, which
# happens as soon as two rows normalize to the same text (e.g. several blank
# bodies). Keep the first occurrence and report how many rows were ignored.
duplicate_bodies = csv_data['body_cleaned'].duplicated(keep='first')
if duplicate_bodies.any():
    print(
        f"Warning: {int(duplicate_bodies.sum())} CSV rows repeat the text of an "
        "earlier row; only the first occurrence is used for matching"
    )

# Create a dictionary from the CSV for quick lookup
csv_lookup = (
    csv_data[~duplicate_bodies]
    .set_index('body_cleaned')
    .to_dict(orient='index')
)

# Merge CSV data into JSON based on matching text
unmatched_post_ids = []
for post in modified_json_data:
    match_data = csv_lookup.get(post['original_post'].lower().strip())
    if match_data:
        post.update(match_data)
    else:
        unmatched_post_ids.append(post.get('post_id'))

# Clean up the temporary matching column
csv_data = csv_data.drop(columns=['body_cleaned'])

# Save the merged data
with open(merged_json_path, 'w') as file:
    json.dump(modified_json_data, file, indent=2)

print(f"Merged data saved to {merged_json_path}")
if unmatched_post_ids:
    # These posts carry no CSV columns in the merged file.
    print(
        f"Warning: {len(unmatched_post_ids)} posts had no matching CSV row: "
        f"{unmatched_post_ids}"
    )

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

def calculate_metrics(y_true, y_pred, average='weighted'):
    """Calculate accuracy, precision, recall and F1 for categorical labels.

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        average: Averaging mode forwarded to the precision, recall and F1
            scorers. Defaults to ``'weighted'``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``. If any scorer raises,
        the error is printed and ``(None, None, None, None)`` is returned so
        the caller can continue with the remaining variables.
    """
    try:
        accuracy = accuracy_score(y_true, y_pred)
        # zero_division=0 is passed through to the scorers unchanged.
        precision = precision_score(y_true, y_pred, average=average, zero_division=0)
        recall = recall_score(y_true, y_pred, average=average, zero_division=0)
        f1 = f1_score(y_true, y_pred, average=average, zero_division=0)
    except Exception as exc:
        # Best-effort by design, but say why the metrics are missing and let
        # KeyboardInterrupt / SystemExit propagate instead of swallowing them.
        print(f"Could not compute metrics: {type(exc).__name__}: {exc}")
        return None, None, None, None
    return accuracy, precision, recall, f1

# Compare the LLM labels ("<name>.llm" columns) with the human labels
# ("<name>.human" columns) from the merged file, then write a per-variable
# metrics table and a list of every disagreement.

# Load the JSON data
with open('merged_analysis_results.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Initialize results containers
metrics_results = {}
mismatches = []
mismatch_counts = {}


def _mismatch_records(rows, variable, llm_col, human_col):
    """Turn each row of `rows` into one mismatch record for `variable`."""
    return [
        {
            'post_id': row['post_id'],
            'variable': variable,
            'llm_value': row[llm_col],
            'human_value': row[human_col],
            'original_text': row['original_post']
        }
        for _, row in rows.iterrows()
    ]


# Get all pairs of corresponding columns (llm and human)
llm_cols = [col for col in df.columns if col.endswith('.llm')]
human_cols = [col.replace('.llm', '.human') for col in llm_cols]

# Inclusion is scored on every post that has both labels. Rows lacking a
# human label hold NaN here; scoring them makes the label set mixed-type and
# counts them as disagreements, so they are masked out the same way the
# per-variable loop below masks its columns.
inclusion_valid = df['inclusion.llm'].notna() & df['inclusion.human'].notna()
inclusion_df = df[inclusion_valid]
skipped_rows = len(df) - len(inclusion_df)
if skipped_rows:
    print(f"Skipping {skipped_rows} posts without both an LLM and a human inclusion label")

# First calculate metrics for inclusion
inclusion_metrics = calculate_metrics(
    inclusion_df['inclusion.human'],
    inclusion_df['inclusion.llm']
)
metrics_results['inclusion'] = dict(
    zip(('Accuracy', 'Precision', 'Recall', 'F1'), inclusion_metrics)
)

# Find inclusion mismatches
inclusion_mismatches = inclusion_df[
    inclusion_df['inclusion.llm'] != inclusion_df['inclusion.human']
]
mismatch_counts['inclusion'] = len(inclusion_mismatches)
mismatches.extend(
    _mismatch_records(inclusion_mismatches, 'inclusion', 'inclusion.llm', 'inclusion.human')
)

# Filter for posts where inclusion.human is "Yes"
included_df = df[df['inclusion.human'] == 'Yes']

# Calculate metrics for other variables only on included posts
for llm_col, human_col in zip(llm_cols, human_cols):
    # Only variables that have a human counterpart; inclusion was handled above
    if human_col not in df.columns or 'inclusion' in llm_col:
        continue
    # Skip sentiment score as it's numerical
    if 'sentiment_score' in llm_col:
        continue

    # Keep rows where both columns are non-null
    valid_mask = included_df[llm_col].notna() & included_df[human_col].notna()
    scored_df = included_df[valid_mask]

    # Calculate metrics
    accuracy, precision, recall, f1 = calculate_metrics(
        scored_df[human_col], scored_df[llm_col]
    )

    # Store results
    base_col = llm_col.replace('.llm', '')
    metrics_results[base_col] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }

    # Find and record mismatches for this variable
    mismatch_df = scored_df[scored_df[llm_col] != scored_df[human_col]]
    mismatch_counts[base_col] = len(mismatch_df)
    mismatches.extend(_mismatch_records(mismatch_df, base_col, llm_col, human_col))

# Create results DataFrame (one row per variable)
metrics_df = pd.DataFrame(metrics_results).T
metrics_df['Mismatches'] = pd.Series(mismatch_counts)
metrics_df = metrics_df.round(3)

# Create mismatches DataFrame
mismatches_df = pd.DataFrame(mismatches)

# Save results
metrics_df.to_csv('metrics_summary.csv')
mismatches_df.to_csv('mismatches.csv', index=False)

# Print summary
print("\nMetrics Summary:")
print(metrics_df)

print("\nMismatch Counts per Variable:")
for var, count in mismatch_counts.items():
    print(f"{var}: {count} mismatches")

print("\nTotal number of mismatches:", len(mismatches))
print("Included posts:", len(included_df))
print("Total posts:", len(df))


Processing batch 1 of 10
Analyzing post_id: 0
body (first 100 chars): My husband had papillary thyroid cancer and he takes Wegovy....
Raw API response: {"post_id":"0","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"Yes","cancer_type":"Thyroid Cancer","other_cancer_type":"","is_survivor_weight_loss":"No","cancer_diagnosis_after_medication":"No","mentions_cancer_risk":"No","concerned_about_cancer_risk":"No","seeking_cancer_risk_data":"No","can_assess_misinformation":"No","sentiment_score":0,"discussed_risk_with_physician":"No","discussion_GLP1_decreasing_cancer_risk":"No"}
Analyzing post_id: 1
body (first 100 chars): How many drugs have noted dangerous side effects. From giving you cancer to heart attacks. I am pret...
Raw API response: {"post_id":"1","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"No","cancer_type":"No Type Mentioned","other_cancer

# Chain of thought + 10-shot iterated

In [None]:
import json
from openai import OpenAI
from tqdm.notebook import tqdm

def analyze_post(post_id, post_body):
    print(f"Analyzing post_id: {post_id}")
    print(f"body (first 100 chars): {post_body[:100]}...")

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
               {
                    "role": "system",
                    "content": """You are analyzing Reddit posts about GLP1 medications and cancer discussions. Follow these detailed guidelines for each variable:

INCLUSION CRITERIA:
- Include ANY mention of cancer (personal, family, risk, research, screening, side effects)
- Include even general cancer discussions or animal studies
- Only exclude if cancer is purely metaphorical or completely absent
- Default to inclusion if uncertain
- If a post is a link, exclude it
- Consider any mention of cancer as relevant for inclusion, even if peripheral or in congratulatory context. The threshold for inclusion should be the presence of cancer-related terms rather than the centrality of cancer to the message.
Example YES: "There is a risk of cancer, be aware", "everything around us increases our risk for cancer, talk to a physician", "My doctor told me ozempic does not cause cancer"
Example NO: "Congrats, cancer queen", "Best wishes for getting treating for cancer"

SURVIVOR IDENTIFICATION:
- Look for past tense cancer references
- Consider treatment completion indicators
- Check for remission/recovery mentions
- Default to NO unless clear evidence
- If the post hints at a surgery that happened secondary to cancer, include it for example "thyroidectomy due to cancer"
- If the post hints at a cancer history include it for example "Cancer, Diabetes here and im trying to lose weight"
- Look for both explicit and implicit indicators of survivor status, including context clues like "had cancer", past tense references to treatment, or mentions of being cancer-free. Do not require explicit confirmation of completed treatment or remission.
Example YES: "after my cancer treatment", "been cancer-free"
Example NO: "worried about cancer risk", "family history"

SURVIVOR AND MEDICATION STATUS:
Current Evidence Requirements:
1. Survivor Evidence (ANY of these):
   - Direct statements about past cancer
   - Treatment completion references
   - Current monitoring/checkups related to cancer
   - References to cancer as part of medical history

2. Current Medication Evidence (ANY of these):
   - Direct mentions of currently taking GLP1s
   - References to current dose/dosage
   - Discussion of current side effects
   - Mentions of recent start or continuation

3. Connection Rules:
   - Both pieces of evidence can appear anywhere in the post
   - Time references like "now", "currently", "since starting" suggest active medication use
   - Past cancer + present tense medication discussion = YES
   - If either piece is unclear or missing = NO

Example YES: "Had thyroid cancer last year. Started Ozempic 3 months ago"
Example YES: "Cancer survivor here. My current Wegovy dose is 0.5mg"
Example NO: "Had cancer but thinking about starting GLP1s"

FAMILY CANCER HISTORY:
- Look for mentions of cancer in family members
- Consider all types of family references (parents, siblings, aunts/uncles, etc.)
- Include both current and past family cancer cases
- Default to NO if unclear
Example YES: "cancer runs in my family", "my mom had cancer", "sister's breast cancer"
Example NO: "general cancer discussion", "friend had cancer" "spouse has cancer"

**CANCER TYPE IDENTIFICATION** (IMPORTANT)
- Choose first specific cancer mentioned
- Map clinical terms to main categories
- Only use "Other" for clearly specified non-main types
- Use "No Type Mentioned" for general discussions
- Historically you made mistakes labelling "Thyroid Cancer" as "No Type Mentioned", so think about that well
- If negation is used, still use the disease type mentioned
- If risk mentioned that is associated with a cancer then select the cancer type, Example: "risk of breast cancer" -> "Breast Cancer"
- When a text mentions:
-   **Monitoring or Testing**: If a doctor is monitoring a specific organ due to medication risks, associate it with the cancer type of that organ.
-   **Medication Risks**: If certain medications have known risks for specific cancers, link accordingly.
-   **Family History or Genetic Risks**: Consider these as indicators for the associated cancer type.

Always classify the cancer type if there's a reasonable association, even if not explicitly stated.

Example mapping examples:
- Thyroid: thyroid carcinoma, thyroid tumors, "could lead to thyroid cancer", medullary thyroid, papillary thyroid
- Breast: mammary carcinoma
- Gyn: endometrial, ovarian, cervical
- Other: Blood cancer, anal cancer

Text: "My doctor monitors my thyroid due to medication risks"
Analysis: Monitoring the thyroid due to medication risks implies a concern for thyroid cancer.
Classification: Thyroid Cancer

Text: "Had a hysterectomy due to cancer"
Analysis: "Hysterectomy" is a surgical procedure to remove the uterus, associated with GYN cancers.
Classification: GYN Cancer

Text: "GLP1 do not cause thyroid cancer"
Analysis: "The type of cancer being mentioned here is thyroid cancer"
Classification: Thyroid Cancer

- for if cancer_type is other then map this to a disease site instead of the actual histology
Example:
Text: "I have Osteosarcoma"
Classification: cancer_type": "Other",
    "other_cancer_type": "Bone Cancer"

Text: "I have leukeumia or lymphoma"
Classification: cancer_type": "Other",
    "other_cancer_type": "Blood Cancer"

Text: "I have anal cancer", "I have colon cancer"
Classification: cancer_type": "Other",
    "other_cancer_type": "GI Cancer"

SURVIVOR WEIGHT PATTERNS:
- Look for temporal relationship (cancer then weight change)
- If trying to lose weight after a cancer diagnosis, include it
- If they had cancer in the past and now losing weight, include it
- Consider treatment-related changes
- Default to NO if timing unclear
Example YES: "losing treatment weight", "now trying to lose all this weight after cancer treatment"
Example NO: "general weight issues", "family member's weight"

POST-MEDICATION CANCER:
1. Look for Sequence Indicators:
   - Direct timing: "after starting", "since taking", "months into"
   - Narrative order: "started medication... then found"
   - Current investigations: "now being checked", "started having symptoms"

2. Evidence Categories:
   Strong YES:
   - Explicit timing statements
   - Clear sequence of medication start then diagnosis
   - Current medical investigations after medication start

   Classify NO:
   - Pre-existing cancer
   - No clear temporal relationship
   - Cancer history before medication

   Use NA:
   - No cancer diagnosis mentioned
   - No medication timing discussed

Example YES: "developed symptoms after starting", "being checked for cancer since starting"
Example NO: "had cancer before starting", "unrelated timing"
Example NA: "worried about cancer risk"

MENTIONS OF CANCER RISK:
1. Risk Reference Types:
   - Direct risk statements
   - Prevention discussions
   - Safety concerns
   - Family history context
   - Research/study references
   - Warning labels/side effects

2. Risk Context Indicators:
   - Words: "risk", "chance", "possibility", "likelihood"
   - Prevention terms: "avoid", "prevent", "reduce"
   - Concerns: "worried about", "fear of"
   - Medical context: "screening", "monitoring", "check-ups"

Example Inclusions:
- Black box warnings
- Family history discussions
- Prevention strategies
- Risk factors
- Medical monitoring

RISK CONCERNS:
1. Emotional Indicators:
   - Direct worry expressions: "concerned", "worried", "scared"
   - Indirect concern: "don't want to risk", "better safe than sorry"
   - Question patterns: "what if", "could it cause"
   - Hesitation markers: "nervous about", "unsure"

2. Context Clues:
   - Seeking reassurance
   - Asking about experiences
   - Discussing preventive measures
   - Expressing uncertainty
   - Personal risk factors mentioned

3. Distinguish from:
   - General risk discussion without personal concern
   - Factual statements about risk
   - Others' concerns
   - Historical risk discussions

Example YES: "worried about the cancer risk", "scared to start because of cancer history"
Example NO: "cancer risk exists but I'm not concerned", "just stating the facts about risk"

SEEKING INFORMATION RELATED TO CANCER RISKS:
- Include if the post mentions more information related to cancer risk
- Example Yes: "Does anyone know if the risk is significant?", "I was told the only risk is in rats"
- Example No: "sharing research", "general discussion", "Cancer from ozempic is unproven"
1. Direct Information Seeking:
   - Questions about risks/studies
   - Requests for data/research
   - Asking for experiences
   - Looking for statistics

2. Indirect Information Seeking:
   - Expressions of uncertainty needing clarification
   - Research mentions: "been googling", "reading about"
   - Asking for advice about risk
   - Seeking validation of information

Example YES:
- "Does anyone know the cancer risk?"
- "Looking for studies about..."
- "Been researching but can't find..."
- "Anyone else worried about...and know more?"

Example NO:
- Sharing known information
- Stating facts without questions
- Personal experiences without queries

SENTIMENT SCORING:
-1: Fear, worry, anger, grief
 0: Facts, balanced views, mixed feelings
+1: Hope, success, gratitude, support
Example -1: "terrified of cancer risk"
Example 0: "discussing research findings"
Example +1: "grateful for successful treatment"

PHYSICIAN DISCUSSION:
- Include any discusion with healthcare provider related to cancer risk and treatment
- Consider both direct and recommended discussions
- Look for clinical assessment mentions
- Default to NO if unclear, or just mentions talking to a physician without cancer risk discussion (can be implied too)
- If the post is recommending someone speaks with their physician, include it
Example YES: "doctor discussed risks", "oncologist advised"
Example NO: "should ask doctor", "general medical info"


DISCUSSION OF DECREASED RISK:
- If a post hints that due to glp1 medication there is a DECREASED risk of cancer, include it
- If a post hints that glp1 decreases weight, weight decreases cancer risk, include it
- If a post hints that glp1 increases cancer risk, exclude it
- If no clear logical link between Decrease of cancer risk and the glp1 medication or weight loss, exclude it
- If mentions decreased recurrence rate, include it
1. Direct Reduction Mentions:
   - Explicit statements about decreased risk
   - Prevention benefits
   - Protective effects
   - Risk mitigation through medication

2. Indirect Reduction References:
   - Weight loss leading to lower cancer risk
   - Metabolic improvements reducing risk
   - General health benefits including cancer prevention
   - Comparison of risks (obesity vs. medication)

3. Context Clues:
   - Discussion of preventive benefits
   - Long-term health improvements
   - Risk factor reduction
   - Protective mechanisms

Example YES:
- "GLP1s help reduce cancer risk"
- "The medication helps prevent..."
- "Lower risk through weight loss"
- "Protective effects against cancer"

Example NO:
- General risk discussions
- Side effect concerns
- Unrelated cancer mentions
- Risk increases


When uncertain between options, use these defaults:
- Inclusion: Include if any cancer mention
- Survivor Status: No if unclear
- Survivor Medication Status: No if unclear
- Family Cancer History: No if unclear
- Family History: No if unclear
- Cancer Type: No Type Mentioned if unclear
- Weight Changes: No if timing unclear
- Post-Med Cancer: No if sequence unclear
- Risk Concerns: No if not explicit
- Sentiment: 0 if mixed/unclear
- Physician Discussion: No if not explicit
# EXAMPLE ANALYSES TO GUIDE YOUR EVALUATION

Example 1: "I was diagnosed with thyroid cancer 3 months after starting Ozempic. My doctor isn't sure if there's any connection, but I'm working with my oncologist to understand if I can continue the medication. My sister also had breast cancer, so I'm extra cautious about any risks."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "Yes",
    "is_survivor_and_taking_med": "Yes",
    "family_cancer_history": "Yes",
    "cancer_type": "Thyroid Cancer",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "No",
    "cancer_diagnosis_after_medication": "Yes",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "seeking_cancer_risk_data": "Yes",
    "can_assess_misinformation": "No",
    "sentiment_score": -1,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Comprehensive example showing temporal relationship with medication, family history, medical consultation, and personal experience without making broader claims.

Example 2: "Studies prove GLP1s definitely cause cancer in everyone who takes them - just look at the rat studies! Don't trust doctors who say otherwise. These drugs are killing people and Big Pharma is hiding the truth."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "No",
    "is_survivor_and_taking_med": "No",
    "family_cancer_history": "No",
    "cancer_type": "No Type Mentioned",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "No",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "seeking_cancer_risk_data": "No",
    "can_assess_misinformation": "Yes",
    "sentiment_score": -1,
    "discussed_with_physician": "No",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Classic misinformation example making absolute claims without evidence, over-generalizing animal studies.

Example 3: "I put on so much weigh after my history with breast cancer, but I've lost 50 pounds on Wegovy. My oncologist actually recommended it since obesity increases cancer recurrence risk. The weight loss has helped reduce my inflammation markers."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "Yes",
    "family_cancer_history": "No",
    "cancer_type": "Breast Cancer",
    "other_cancer_type": "",
    "is_survivor_and_taking_med": "Yes",
    "is_survivor_weight_loss": "Yes",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "No",
    "seeking_cancer_risk_data": "No",
    "can_assess_misinformation": "No",
    "sentiment_score": 1,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "Yes"
}
Reasoning: Shows positive medical guidance, personal experience with specific cancer type, and therapeutic benefit without making general claims.

Example 4: "Anyone else worried about the thyroid cancer warning? My dad had thyroid cancer and I'm scared to start, but my doctor says that was papillary type and the warning is about medullary which is different."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "No",
    "is_survivor_and_taking_med": "No",
    "family_cancer_history": "Yes",
    "cancer_type": "Thyroid Cancer",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "No",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "seeking_cancer_risk_data": "Yes",
    "can_assess_misinformation": "Yes",
    "sentiment_score": -1,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Shows concern about specific cancer types, family history influence, and medical consultation for risk assessment.

Example 5: "Fighting cancer and gained weight from the treatments. Looking forward to beating this and then trying Mounjaro to get healthy again. My medical team says I need to wait until treatment is done."
{
    "inclusion": "Yes",
    "exclusion_reason": "",
    "is_survivor": "Yes",
    "is_survivor_and_taking_med": "No"
    "family_cancer_history": "No",
    "cancer_type": "No Type Mentioned",
    "other_cancer_type": "",
    "is_survivor_weight_loss": "Yes",
    "cancer_diagnosis_after_medication": "No",
    "mentions_cancer_risk": "No",
    "concerned_about_cancer_risk": "No",
    "seeking_cancer_risk_data": "No",
    "can_assess_misinformation": "No",
    "sentiment_score": 0,
    "discussed_with_physician": "Yes",
    "discussion_GLP1_decreasing_cancer_risk": "No"
}
Reasoning: Current cancer patient discussing medication timing, showing medical guidance without risk discussion.

# Extra examples to help

# EXAMPLE ANALYSES TO GUIDE YOUR EVALUATION

Example 6: "My doctor mentioned the thyroid cancer warning but said it was only in mice. I have papillary nodules that get monitored regularly, but he said that's different from the medullary thyroid cancer in the studies."
{
    "inclusion": "Yes",  # Any mention of cancer, including study findings
    "cancer_type": "Thyroid Cancer",  # Specific cancer type with histology
    "mentions_cancer_risk": "Yes",  # Discusses research findings
    "concerned_about_cancer_risk": "No",  # Informational, not expressing worry
    "discussed_with_physician": "Yes",
    "is_survivor": "No"
}
Reasoning: Shows proper inclusion for any cancer mention, even when discussing research/animal studies without personal concern.

Example 7: "Everything seems to cause cancer these days, but I'm especially worried about starting Wegovy with my family history of thyroid problems. Going to ask my doctor about screening first."
{
    "inclusion": "Yes",  # General cancer discussion plus medication context
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",  # Shows anxiety and seeking screening
    "discussed_with_physician": "No",  # Plans to discuss but hasn't yet
    "is_survivor": "No"
}
Reasoning: Captures inclusion for general cancer discussion and concerned_about_cancer_risk for screening questions.

Example 8: "After my thyroidectomy for cancer, I talked to my physician about GLP1s. Looking at research papers about cancer risk in patients with history - anyone else find good studies?"
{
    "inclusion": "Yes",  # Cancer history + research discussion
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",  # Actively seeking risk information
    "discussed_with_physician": "Yes",
    "is_survivor": "Yes"
}
Reasoning: Shows inclusion for research discussion and concern indicated by information seeking.

Example 9: "My GP discussed with me the risk of pancereatitis. In regard to cancer, animal studies show cancer risks but human data is different. Looking at population studies, obesity probably increases cancer risk more than these medications."
{
    "inclusion": "Yes",  # Discussion of cancer research findings
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "No",  # Purely informational discussion
    "discussed_with_physician": "No", # Discussed with physician but not cancer related risk discussion
    "is_survivor": "No"
}
Reasoning: Shows inclusion for research discussion without personal concern.

Example 10: "My husband had papillary cancer and takes the medication, he discussed with his physician that cancer risk is low be could occur"
{
    "inclusion": "Yes",  # Multiple cancer mentions and concerns
    "mentions_cancer_risk": "Yes",
    "concerned_about_cancer_risk": "Yes",
    "discussed_with_physician": "Yes",
    "is_survivor": "Yes"
}
Reasoning: Shows multiple cancer types, clear anxiety language indicating concern, and proper inclusion for risk discussion. """


                },
                {
                    "role": "user",
                    "content": f"Analyze this post:\n\nPost ID: {post_id}\n\nPost Body: {post_body}"
                }
            ],
            temperature=0.1,
            max_tokens=512,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "cancer_post_analysis_limited",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "post_id": {"type": "string"},
                            "inclusion": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Include if ANY cancer mention (personal/family/risk/research/screening/side effects). Only exclude if cancer is metaphorical or absent."
                            },
                            "exclusion_reason": {
                                "type": "string",
                                "description": "If excluded, specify: 'No cancer mention', 'Cancer used metaphorically', or detailed reason"
                            },
                            "is_survivor": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: past cancer treatment, remission, recovery, post-cancer experience. No if: current diagnosis, risk discussion, family history"
                            },
                            "is_survivor_and_taking_med": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: confirmed cancer survivor AND currently taking GLP1 medication. Must have evidence of both survival and current medication use. No if: missing either criterion"
                                },
                            "family_cancer_history": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions ANY family member (immediate or extended) having cancer currently or in past. No if: no family cancer mentions"
                            },
                            "cancer_type": {
                                "type": "string",
                                "enum": ["Thyroid Cancer", "Breast Cancer", "Pancreatic Cancer", "Gyn cancer", "Other", "No Type Mentioned"],
                                "description": "Select specific cancer type mentioned. This is an important variable. Use 'Other' only for clearly specified non-main types"
                            },
                            "other_cancer_type": {
                                "type": "string",
                                "description": "If 'Other' selected above, specify the exact cancer type mentioned"
                            },
                            "is_survivor_weight_loss": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: weight changes after cancer treatment/survival mentioned. No if: general weight discussion or unclear timing"
                            },
                            "cancer_diagnosis_after_medication": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: clear temporal sequence shows cancer diagnosis after starting GLP1. No if: unclear timing or just risk discussion"
                            },
                             "mentions_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: any mention of cancer risk or increased risk, includes family history concerns. Synonyms of cancer risk"
                             },
                            "concerned_about_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: expresses worry/fear about cancer risk, includes family history concerns. No if: purely informational"
                            },
                            "seeking_cancer_risk_data": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: actively asking for studies/research/experiences about cancer risk. No if: sharing info or general discussion"
                            },
                            "can_assess_misinformation": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: makes generalizable claims about medication/cancer. No if: personal experience only or questions"
                            },
                            "sentiment_score": {
                                "type": "integer",
                                "enum": [-1, 0, 1],
                                "description": "-1: fear/worry/grief, 0: factual/balanced, 1: hope/success/gratitude"
                            },
                            "discussed_risk_with_physician": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions discussing cancer risk with ANY healthcare provider or recommending such discussion. No if: no provider discussion mentioned"
                            },
                            "discussion_GLP1_decreasing_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: explicitly mentions GLP1/weight loss reducing cancer risk. No if: no mention of risk reduction"
                            }
                        },
                        "required": [
                            "post_id",
                            "inclusion",
                            "exclusion_reason",
                            "is_survivor",
                            "is_survivor_and_taking_med",
                            "family_cancer_history",
                            "cancer_type",
                            "other_cancer_type",
                            "is_survivor_weight_loss",
                            "cancer_diagnosis_after_medication",
                            "mentions_cancer_risk",
                            "concerned_about_cancer_risk",
                            "seeking_cancer_risk_data",
                            "can_assess_misinformation",
                            "sentiment_score",
                            "discussed_risk_with_physician",
                            "discussion_GLP1_decreasing_cancer_risk"
                        ],
                        "additionalProperties": False
                    }
                }
            }
        )

        print(f"Raw API response: {response.choices[0].message.content}")
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        print(f"Error processing post {post_id}: {str(e)}")
        return None

def process_posts(batch_size=10):
    """Analyze every post in prepared_dataset.json and checkpoint the results.

    Posts are sent to analyze_post one at a time; after each batch the results
    collected so far are rewritten to analysis_results.json so a crash loses at
    most one batch of work.

    Args:
        batch_size: Number of posts processed between checkpoints. Must be >= 1.

    Returns:
        List of analysis dicts (one per successfully analyzed post), each with
        the post text added under 'original_post'.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    # Validate up front: range() rejects a step of 0 only after the file has been
    # read, and a negative step would silently process nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Load the JSON data
    with open('prepared_dataset.json', 'r') as file:
        data = json.load(file)

    results = []
    skipped_ids = []
    total_batches = (len(data) + batch_size - 1) // batch_size

    for batch_number, start in enumerate(range(0, len(data), batch_size), start=1):
        print(f"\nProcessing batch {batch_number} of {total_batches}")

        for post in data[start:start + batch_size]:
            body = post['body']

            # analyze_post slices the body before entering its own try block, so a
            # non-string body (e.g. NaN from an empty CSV cell) would abort the run.
            if not isinstance(body, str):
                print(f"Skipping post {post['post_id']}: body is not text ({type(body).__name__})")
                skipped_ids.append(post['post_id'])
                continue

            analysis = analyze_post(post['post_id'], body)
            if analysis is None:
                # analyze_post already printed the error; remember which post failed.
                skipped_ids.append(post['post_id'])
                continue

            analysis['original_post'] = body
            results.append(analysis)

        # Save intermediate results after each batch
        with open('analysis_results.json', 'w') as file:
            json.dump(results, file, indent=2)

    print(f"\nAnalysis complete. Processed {len(results)} out of {len(data)} posts")
    if skipped_ids:
        print(f"Posts without a result ({len(skipped_ids)}): {skipped_ids}")
    print("Results saved to analysis_results.json")
    return results

# When this code runs as the main module, analyze the whole prepared dataset
# using process_posts' default batch size.
if __name__ == "__main__":
    process_posts()

import pandas as pd
import json
import numpy as np

# File paths
csv_file_path = 'Adjudicated_100_dataset_clean.csv'
json_file_path = 'analysis_results.json'
merged_json_path = 'merged_analysis_results.json'

# Load the CSV file, replacing NaN with empty strings
csv_data = pd.read_csv(csv_file_path).replace(np.nan, "", regex=True)

# Load the JSON file
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Add .llm suffix to keys except 'original_post' and 'post_id', so the LLM
# values cannot be overwritten by CSV columns merged in below
modified_json_data = []
for post in json_data:
    modified_post = {}
    for key, value in post.items():
        if key not in ['original_post', 'post_id']:
            modified_post[f"{key}.llm"] = value
        else:
            modified_post[key] = value
    modified_json_data.append(modified_post)

# Standardize text for matching (case-insensitive, surrounding whitespace ignored)
csv_data['body_cleaned'] = csv_data['body'].str.lower().str.strip()
for post in modified_json_data:
    post['original_post_cleaned'] = post['original_post'].lower().strip()

# to_dict(orient='index') raises ValueError when the index is not unique, so rows
# whose normalized body repeats an earlier row are left out of the lookup.
duplicate_mask = csv_data['body_cleaned'].duplicated(keep='first')
if duplicate_mask.any():
    print(f"Warning: {int(duplicate_mask.sum())} CSV rows repeat an earlier body; "
          "using the first occurrence of each for matching")

# Create a dictionary from the CSV for quick lookup
csv_lookup = csv_data[~duplicate_mask].set_index('body_cleaned').to_dict(orient='index')

# Merge CSV data into JSON based on matching text
unmatched_post_ids = []
for post in modified_json_data:
    match_data = csv_lookup.get(post['original_post_cleaned'])
    if match_data:
        post.update(match_data)
    else:
        unmatched_post_ids.append(post.get('post_id'))

# Posts without a match carry no human labels; report them instead of letting
# them surface later as missing values.
if unmatched_post_ids:
    print(f"Warning: {len(unmatched_post_ids)} posts had no matching CSV row: {unmatched_post_ids}")

# Clean up temporary fields
for post in modified_json_data:
    post.pop('original_post_cleaned', None)
csv_data = csv_data.drop(columns=['body_cleaned'])

# Save the merged data
with open(merged_json_path, 'w') as file:
    json.dump(modified_json_data, file, indent=2)

print(f"Merged data saved to {merged_json_path}")

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

def calculate_metrics(y_true, y_pred):
    """Calculate metrics for categorical variables.

    Args:
        y_true: Reference (human) labels.
        y_pred: Predicted (LLM) labels, aligned with y_true.

    Returns:
        Tuple (accuracy, precision, recall, f1). Precision, recall and F1 use
        weighted averaging with zero_division=0. If scoring fails, the reason is
        printed and (None, None, None, None) is returned.
    """
    try:
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still propagate, and say why the metrics are missing.
        print(f"Could not calculate metrics: {exc}")
        return None, None, None, None
    return accuracy, precision, recall, f1

# Single-run evaluation: compare the '.llm' columns against the '.human' columns
# stored in merged_analysis_results.json, then write a metrics table and a list of
# disagreements to CSV.

# Load the JSON data
with open('merged_analysis_results.json', 'r') as file:
    data = json.load(file)

# Convert to DataFrame (one row per post; keys missing from a post become NaN)
df = pd.DataFrame(data)

# Initialize results dictionary
metrics_results = {}   # variable name -> {'Accuracy', 'Precision', 'Recall', 'F1'}
mismatches = []        # one dict per (post, variable) disagreement
mismatch_counts = {}   # variable name -> number of disagreements

# Get all pairs of corresponding columns (llm and human)
llm_cols = [col for col in df.columns if col.endswith('.llm')]
# The human column name is derived from the LLM one; it may not exist in df,
# which is checked inside the loop below.
human_cols = [col.replace('.llm', '.human') for col in llm_cols]

# First calculate metrics for inclusion (computed over all posts)
# calculate_metrics returns four None values if scoring raised.
inclusion_metrics = calculate_metrics(
    df['inclusion.human'],
    df['inclusion.llm']
)
metrics_results['inclusion'] = {
    'Accuracy': inclusion_metrics[0],
    'Precision': inclusion_metrics[1],
    'Recall': inclusion_metrics[2],
    'F1': inclusion_metrics[3]
}

# Find inclusion mismatches
inclusion_mismatches = df[df['inclusion.llm'] != df['inclusion.human']]
mismatch_counts['inclusion'] = len(inclusion_mismatches)
for _, row in inclusion_mismatches.iterrows():
    mismatches.append({
        'post_id': row['post_id'],
        'variable': 'inclusion',
        'llm_value': row['inclusion.llm'],
        'human_value': row['inclusion.human'],
        'original_text': row['original_post']
    })

# Filter for posts where inclusion.human is "Yes"
included_df = df[df['inclusion.human'] == 'Yes']

# Calculate metrics for other variables only on included posts
for llm_col, human_col in zip(llm_cols, human_cols):
    # Substring test: every column whose name contains 'inclusion' is skipped here.
    if human_col in df.columns and 'inclusion' not in llm_col:
        # Skip sentiment score as it's numerical
        if 'sentiment_score' in llm_col:
            continue

        # Get valid rows: not NaN in both columns. Empty strings are not NaN, so
        # they are kept and compared like any other value.
        valid_mask = (included_df[llm_col].notna()) & (included_df[human_col].notna())
        y_true = included_df[human_col][valid_mask]
        y_pred = included_df[llm_col][valid_mask]

        # Calculate metrics
        accuracy, precision, recall, f1 = calculate_metrics(y_true, y_pred)

        # Store results
        base_col = llm_col.replace('.llm', '')
        metrics_results[base_col] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1
        }

        # Find mismatches (restricted to the same valid rows used for the metrics)
        mismatch_mask = (included_df[llm_col] != included_df[human_col]) & valid_mask
        mismatch_df = included_df[mismatch_mask][['post_id', llm_col, human_col, 'original_post']]

        # Count mismatches for this variable
        mismatch_counts[base_col] = len(mismatch_df)

        for _, row in mismatch_df.iterrows():
            mismatches.append({
                'post_id': row['post_id'],
                'variable': base_col,
                'llm_value': row[llm_col],
                'human_value': row[human_col],
                'original_text': row['original_post']
            })

# Create results DataFrame (variables as rows, metrics as columns)
metrics_df = pd.DataFrame(metrics_results).T
# The Series is aligned to metrics_df by variable name.
metrics_df['Mismatches'] = pd.Series(mismatch_counts)
metrics_df = metrics_df.round(3)

# Create mismatches DataFrame
mismatches_df = pd.DataFrame(mismatches)

# Save results
metrics_df.to_csv('metrics_summary.csv')
mismatches_df.to_csv('mismatches.csv', index=False)

# Print summary
print("\nMetrics Summary:")
print(metrics_df)

print("\nMismatch Counts per Variable:")
for var, count in mismatch_counts.items():
    print(f"{var}: {count} mismatches")

print("\nTotal number of mismatches:", len(mismatches))
print("Included posts:", len(included_df))
print("Total posts:", len(df))


Processing batch 1 of 10
Analyzing post_id: 0
body (first 100 chars): My husband had papillary thyroid cancer and he takes Wegovy....
Raw API response: {"post_id":"0","inclusion":"Yes","exclusion_reason":"","is_survivor":"Yes","is_survivor_and_taking_med":"Yes","family_cancer_history":"No","cancer_type":"Thyroid Cancer","other_cancer_type":"","is_survivor_weight_loss":"No","cancer_diagnosis_after_medication":"No","mentions_cancer_risk":"No","concerned_about_cancer_risk":"No","seeking_cancer_risk_data":"No","can_assess_misinformation":"No","sentiment_score":0,"discussed_risk_with_physician":"No","discussion_GLP1_decreasing_cancer_risk":"No"}
Analyzing post_id: 1
body (first 100 chars): How many drugs have noted dangerous side effects. From giving you cancer to heart attacks. I am pret...
Raw API response: {"post_id":"1","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"No","cancer_type":"No Type Mentioned","other_cance

# **Final Code**

---



In [None]:
import json
import pandas as pd
import numpy as np
from datetime import datetime
import os
from tqdm.notebook import tqdm

def run_multiple_analyses(num_runs=5):
    """
    Run the analysis multiple times and store results

    Each run calls process_posts(batch_size=10), saves that run's results to
    analysis_batch_<timestamp>/run_<n>_results.json, and computes per-variable
    agreement with calculate_metrics_for_run. The runs are then compared with
    compare_runs.

    Args:
        num_runs: Number of repeated runs; must be at least 1.

    Returns:
        The report dict produced by compare_runs.

    Raises:
        ValueError: If num_runs is smaller than 1.
    """
    # Fail before creating any directory: with no runs there is nothing to compare,
    # and the later pd.concat would raise a much less helpful error.
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs!r}")

    # Create directory for this batch of runs
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    batch_dir = f'analysis_batch_{timestamp}'
    os.makedirs(batch_dir, exist_ok=True)

    all_results = []
    all_metrics = []

    # Run the analysis multiple times
    for run_number in range(1, num_runs + 1):
        print(f"\nStarting Run {run_number}/{num_runs}")

        # Run the original analysis
        results = process_posts(batch_size=10)

        # Save this run's results
        run_file = f'{batch_dir}/run_{run_number}_results.json'
        with open(run_file, 'w') as f:
            json.dump(results, f, indent=2)

        # Calculate metrics for this run
        # NOTE(review): calculate_metrics_for_run only scores columns ending in
        # '.llm' that have a human counterpart; confirm the dicts returned by
        # process_posts carry such keys, otherwise only the 'run' column is filled.
        metrics_df = calculate_metrics_for_run(results)
        metrics_df['run'] = run_number
        all_metrics.append(metrics_df)
        all_results.append(results)

        print(f"Run {run_number} complete")

    return compare_runs(all_results, all_metrics, batch_dir)

def calculate_metrics_for_run(results):
    """
    Calculate metrics for a single run

    For every column ending in '.llm' (except sentiment_score) that has a
    matching human column, compute the fraction of rows where the two values
    are equal.

    Args:
        results: List of per-post dicts; converted to a DataFrame.

    Returns:
        One-row DataFrame with one column per scored variable. The frame has
        no columns when no LLM/human pair is present.
    """
    # Convert results to DataFrame
    df = pd.DataFrame(results)

    llm_suffix = '.llm'
    # '.human1' is the original convention of this function; '.human' is the
    # suffix used by the other evaluation code in this file, so accept both.
    human_suffixes = ('.human1', '.human')

    metrics_dict = {}

    # Calculate metrics for each variable
    for col in df.columns:
        if not isinstance(col, str) or not col.endswith(llm_suffix):
            continue

        # Skip sentiment score as it's numerical
        if 'sentiment_score' in col:
            continue

        # Strip only the trailing suffix, not every '.llm' inside the name.
        base_col = col[:-len(llm_suffix)]

        human_col = next(
            (f"{base_col}{suffix}" for suffix in human_suffixes
             if f"{base_col}{suffix}" in df.columns),
            None
        )
        if human_col is None:
            continue

        # Calculate agreement percentage
        metrics_dict[base_col] = (df[col] == df[human_col]).mean()

    return pd.DataFrame([metrics_dict])

def compare_runs(all_results, all_metrics, batch_dir):
    """
    Compare results across different runs

    Args:
        all_results: Per-run result lists. Not used by the comparison itself.
        all_metrics: List of per-run metric DataFrames (one row each, with a
            'run' column identifying the run).
        batch_dir: Directory receiving metric_statistics.csv and
            comparison_report.json; created if it does not exist.

    Returns:
        Report dict with keys 'metric_statistics', 'run_agreement',
        'timestamp' and 'number_of_runs'.
    """
    # Allow calling this function directly, without a directory prepared by the caller.
    if batch_dir:
        os.makedirs(batch_dir, exist_ok=True)

    # Combine all metrics
    combined_metrics = pd.concat(all_metrics, ignore_index=True)

    # Calculate statistics for each variable
    stats = combined_metrics.agg(['mean', 'std', 'min', 'max'])

    # Calculate agreement between runs: the share of run pairs whose metric
    # values are exactly equal.
    run_agreement = {}
    for col in combined_metrics.columns:
        if col == 'run':
            continue
        values = combined_metrics[col].values
        pair_matches = [
            values[i] == values[j]
            for i in range(len(values))
            for j in range(i + 1, len(values))
        ]
        # With fewer than two runs there are no pairs; report NaN explicitly
        # instead of letting np.mean warn about an empty list.
        run_agreement[col] = float(np.mean(pair_matches)) if pair_matches else float('nan')

    # Save comparison results
    stats.to_csv(f'{batch_dir}/metric_statistics.csv')

    # Create detailed report
    report = {
        'metric_statistics': stats.to_dict(),
        'run_agreement': run_agreement,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'number_of_runs': len(all_metrics)
    }

    with open(f'{batch_dir}/comparison_report.json', 'w') as f:
        json.dump(report, f, indent=2)

    # Print summary
    print("\nAnalysis Complete!")
    print("\nMetric Statistics:")
    print(stats)
    print("\nRun Agreement:")
    for var, agreement in run_agreement.items():
        print(f"{var}: {agreement:.3f}")

    return report

# When this code runs as the main module, perform five repeated runs and keep
# the comparison report returned by run_multiple_analyses in `report`.
if __name__ == "__main__":
    report = run_multiple_analyses(num_runs=5)


Starting Run 1/5

Processing batch 1 of 10
Analyzing post_id: 0
body (first 100 chars): My husband had papillary thyroid cancer and he takes Wegovy....
Raw API response: {"post_id":"0","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"Yes","cancer_type":"Thyroid Cancer","other_cancer_type":"","is_survivor_weight_loss":"No","cancer_diagnosis_after_medication":"No","mentions_cancer_risk":"No","concerned_about_cancer_risk":"No","seeking_cancer_risk_data":"No","can_assess_misinformation":"No","sentiment_score":0,"discussed_risk_with_physician":"No","discussion_GLP1_decreasing_cancer_risk":"No"}
Analyzing post_id: 1
body (first 100 chars): How many drugs have noted dangerous side effects. From giving you cancer to heart attacks. I am pret...
Raw API response: {"post_id":"1","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"No","cancer_type":"No Type Mentio

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json
import re

def calculate_metrics(y_true, y_pred):
    """Calculate metrics for categorical variables.

    Args:
        y_true: Reference (human) labels.
        y_pred: Predicted (LLM) labels, aligned with y_true.

    Returns:
        Tuple (accuracy, precision, recall, f1), with precision/recall/F1
        weighted across classes and zero_division=0. When scoring raises, the
        error is printed and a tuple of four None values is returned.
    """
    try:
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    except Exception as exc:
        # A bare except would also trap KeyboardInterrupt/SystemExit and hide
        # the cause; report it so None entries in the summary can be explained.
        print(f"Could not calculate metrics: {exc}")
        return None, None, None, None
    return accuracy, precision, recall, f1

# File paths
csv_file_path = 'Adjudicated_100_dataset_clean.csv'  # Human-labeled data

# List of run JSON files, one per run, resolved relative to the working directory.
# NOTE(review): run_multiple_analyses writes its run files to
# analysis_batch_<timestamp>/run_<n>_results.json — confirm these files have been
# copied next to the notebook, or adjust the paths.
run_json_files = [
    'run_1_results.json',
    'run_2_results.json',
    'run_3_results.json',
    'run_4_results.json',
    'run_5_results.json'
]

# Accumulators filled by the per-run loop below:
# one metrics DataFrame per run, and a flat list of mismatch records.
all_runs_metrics = []
all_runs_mismatches = []

# Load the CSV file, replacing NaN with empty strings
csv_data = pd.read_csv(csv_file_path).replace(np.nan, "", regex=True)
# Function to clean text
def clean_text(text):
    """Normalize post text so the same post matches across files.

    Lower-cases the text, removes punctuation, collapses every run of
    whitespace to a single space and trims the ends.

    Args:
        text: Raw text. Missing values (None/NaN) yield "". Other non-string
            values are converted with str().

    Returns:
        The normalized string.
    """
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    # Remove punctuation first: doing it after collapsing whitespace leaves
    # double or trailing spaces behind ("a , b" -> "a  b"), which breaks matching.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Replace whitespace runs with a single space
    return text.strip()

# Apply the cleaning function to 'body' in CSV data
csv_data['body_cleaned'] = csv_data['body'].apply(clean_text)

# Rows whose cleaned body repeats an earlier row would make the lookup below raise
# (orient='index' needs a unique index) and would multiply rows when LLM results
# are merged on 'body_cleaned', so keep only the first occurrence of each.
duplicate_bodies = csv_data['body_cleaned'].duplicated(keep='first')
if duplicate_bodies.any():
    print(f"Warning: dropping {int(duplicate_bodies.sum())} CSV rows whose cleaned body "
          "duplicates an earlier row")
    csv_data = csv_data[~duplicate_bodies].reset_index(drop=True)

# Create a lookup dictionary from the cleaned 'body' text
csv_lookup = csv_data.set_index('body_cleaned').to_dict(orient='index')

# Loop over each run: load its LLM results, join them to the human labels on the
# cleaned post text, score every variable, and collect metrics and mismatches.
for run_idx, json_file_path in enumerate(run_json_files, start=1):
    print(f"\nProcessing run {run_idx}")
    # Load the JSON file for the current run
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)

    # Add .llm suffix to keys except 'original_post' and 'post_id'
    modified_json_data = []
    for post in json_data:
        modified_post = {}
        for key, value in post.items():
            if key not in ['original_post', 'post_id']:
                modified_post[f"{key}.llm"] = value
            else:
                modified_post[key] = value
        modified_json_data.append(modified_post)

    # Convert to DataFrame
    df_llm = pd.DataFrame(modified_json_data)

    # Clean 'original_post' in LLM data for matching
    df_llm['original_post_cleaned'] = df_llm['original_post'].apply(clean_text)

    # Merge LLM DataFrame with human-labeled CSV data based on the cleaned text.
    # Inner join: posts without a matching CSV row are dropped from this run.
    df = df_llm.merge(csv_data, left_on='original_post_cleaned', right_on='body_cleaned', how='inner')

    # Check if any matches were found
    if df.empty:
        print(f"No matches found in run {run_idx}.")
        continue  # Skip to the next run if no matches
    else:
        print(f"Number of matched posts in run {run_idx}: {len(df)}")

    # Initialize results dictionary for this run
    metrics_results = {}
    mismatches = []
    mismatch_counts = {}

    # Get all pairs of corresponding columns (llm and human)
    llm_cols = [col for col in df.columns if col.endswith('.llm')]
    # NOTE(review): human_cols is not referenced again inside this loop.
    human_cols = [col for col in df.columns if col.endswith('.human')]

    # Map LLM columns to corresponding human columns; LLM columns without a
    # human counterpart are reported and left out of the scoring.
    variable_pairs = []
    for llm_col in llm_cols:
        base_var = llm_col.replace('.llm', '')
        human_col = f"{base_var}.human"
        if human_col in df.columns:
            variable_pairs.append((llm_col, human_col))
        else:
            print(f"Human column {human_col} not found in run {run_idx}")

    # First calculate metrics for inclusion (over all matched posts)
    if ('inclusion.llm', 'inclusion.human') in variable_pairs:
        llm_col, human_col = 'inclusion.llm', 'inclusion.human'
        inclusion_metrics = calculate_metrics(
            df[human_col],
            df[llm_col]
        )
        metrics_results['inclusion'] = {
            'Accuracy': inclusion_metrics[0],
            'Precision': inclusion_metrics[1],
            'Recall': inclusion_metrics[2],
            'F1': inclusion_metrics[3]
        }

        # Find inclusion mismatches
        inclusion_mismatches = df[df[llm_col] != df[human_col]]
        mismatch_counts['inclusion'] = len(inclusion_mismatches)
        for _, row in inclusion_mismatches.iterrows():
            mismatches.append({
                'post_id': row.get('post_id', ''),
                'variable': 'inclusion',
                'llm_value': row[llm_col],
                'human_value': row[human_col],
                'original_text': row['original_post'],
                'run': run_idx
            })
    else:
        print(f"'inclusion' column missing in run {run_idx}")
        # Proceed without calculating inclusion metrics
        # NOTE(review): included_df is reassigned unconditionally just below, so
        # this assignment has no lasting effect.
        included_df = df
        print(f"Proceeding with all posts in run {run_idx}")

    # Filter for posts where inclusion.human is "Yes"
    if 'inclusion.human' in df.columns:
        included_df = df[df['inclusion.human'] == 'Yes']
    else:
        included_df = df  # Use all posts if 'inclusion.human' is missing
        print(f"'inclusion.human' not found in run {run_idx}, using all posts.")

    # Calculate metrics for other variables only on included posts
    for llm_col, human_col in variable_pairs:
        if llm_col == 'inclusion.llm':
            continue  # Already processed inclusion

        # Skip sentiment score as it's numerical
        if 'sentiment_score' in llm_col:
            continue

        # Get valid rows: not NaN in both columns. Empty strings (the CSV's NaN
        # replacement) are not NaN, so they stay in and are compared as values.
        valid_mask = (included_df[llm_col].notna()) & (included_df[human_col].notna())
        y_true = included_df[human_col][valid_mask]
        y_pred = included_df[llm_col][valid_mask]

        if len(y_true) == 0:
            continue  # Skip if no valid data

        # Calculate metrics
        accuracy, precision, recall, f1 = calculate_metrics(y_true, y_pred)

        # Store results
        base_col = llm_col.replace('.llm', '')
        metrics_results[base_col] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1
        }

        # Find mismatches (restricted to the same valid rows used for the metrics)
        mismatch_mask = (included_df[llm_col] != included_df[human_col]) & valid_mask
        mismatch_df = included_df[mismatch_mask][['post_id', llm_col, human_col, 'original_post']]

        # Count mismatches for this variable
        mismatch_counts[base_col] = len(mismatch_df)

        for _, row in mismatch_df.iterrows():
            mismatches.append({
                'post_id': row.get('post_id', ''),
                'variable': base_col,
                'llm_value': row[llm_col],
                'human_value': row[human_col],
                'original_text': row['original_post'],
                'run': run_idx
            })

    # Append metrics and mismatches for this run
    if metrics_results:
        # Variables as rows, metrics as columns, plus the run number
        run_metrics_df = pd.DataFrame(metrics_results).T
        run_metrics_df['Run'] = run_idx
        all_runs_metrics.append(run_metrics_df)
        all_runs_mismatches.extend(mismatches)

        # Optionally, save individual run results
        run_metrics_df.to_csv(f'metrics_summary_run_{run_idx}.csv')
        mismatches_df = pd.DataFrame(mismatches)
        mismatches_df.to_csv(f'mismatches_run_{run_idx}.csv', index=False)

        print(f"Processed run {run_idx}")
    else:
        print(f"No metrics calculated for run {run_idx}")

# Summarize across runs; nothing to aggregate when no run produced metrics
if not all_runs_metrics:
    print("\nNo metrics were calculated across all runs.")
else:
    # Stack the per-run tables and expose the variable name as a regular column
    combined_metrics_df = (
        pd.concat(all_runs_metrics)
        .reset_index()
        .rename(columns={'index': 'Variable'})
    )

    # Per-variable min / max / mean / std of every metric across the runs
    metrics_summary = (
        combined_metrics_df
        .groupby('Variable')
        .agg({
            metric: ['min', 'max', 'mean', 'std']
            for metric in ('Accuracy', 'Precision', 'Recall', 'F1')
        })
        .round(3)
    )

    # Every mismatch record from every run in one table
    all_mismatches_df = pd.DataFrame(all_runs_mismatches)

    # Persist both tables
    metrics_summary.to_csv('metrics_summary_all_runs.csv')
    all_mismatches_df.to_csv('mismatches_all_runs.csv', index=False)

    # Report
    print("\nMetrics Summary Across All Runs:")
    print(metrics_summary)

    print("\nTotal number of mismatches across all runs:", len(all_mismatches_df))



Processing run 1
Number of matched posts in run 1: 100
Human column family_cancer_history.human not found in run 1
Processed run 1

Processing run 2
Number of matched posts in run 2: 100
Human column family_cancer_history.human not found in run 2
Processed run 2

Processing run 3
Number of matched posts in run 3: 100
Human column family_cancer_history.human not found in run 3
Processed run 3

Processing run 4
Number of matched posts in run 4: 100
Human column family_cancer_history.human not found in run 4
Processed run 4

Processing run 5
Number of matched posts in run 5: 100
Human column family_cancer_history.human not found in run 5
Processed run 5

Metrics Summary Across All Runs:
                                       Accuracy                       \
                                            min    max   mean    std   
Variable                                                               
can_assess_misinformation                 0.615  0.656  0.638  0.019   
cancer_diagnosis_a

In [None]:
!pip install openai tiktoken

import json
import os
from google.colab import auth, userdata, files
from google.auth import default
from openai import OpenAI
import tiktoken

# OpenAI setup: one shared client for the notebook, with the API key fetched
# through google.colab's `userdata` under the name 'OPEN_AI'
client = OpenAI(api_key=userdata.get('OPEN_AI'))

# Function to count tokens
def count_tokens(text):
    """Return how many tokens `text` encodes to with the gpt-4o-mini-2024-07-18 tokenizer."""
    tokenizer = tiktoken.encoding_for_model("gpt-4o-mini-2024-07-18")
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [None]:
# step1_prepare_data.py
import pandas as pd
import json

def load_and_prepare_data(csv_path='processed_data.csv', output_path='prepared_dataset.json'):
    """Read the posts CSV and write it out as a JSON list of post records.

    Args:
        csv_path: CSV file to read; it must contain a 'body' column.
        output_path: JSON file that receives the prepared records.

    Returns:
        A list of ``{'post_id': str, 'body': str}`` dicts, where ``post_id`` is
        the row's DataFrame index rendered as a string, or ``None`` if anything
        goes wrong (the error is printed rather than raised).
    """
    try:
        df = pd.read_csv(csv_path)
        print(f"Successfully loaded CSV with {len(df)} rows")

        # An empty CSV cell is read as NaN (a float). Left as-is it would be
        # written to the JSON as NaN and break any later string operation on
        # the body (e.g. slicing for a preview), so coerce every body to text.
        bodies = df['body'].fillna('').astype(str)

        # Build all records in one pass instead of iterating row by row
        dataset = [
            {'post_id': str(index), 'body': body}
            for index, body in bodies.items()
        ]

        # Save to JSON for inspection
        with open(output_path, 'w') as f:
            json.dump(dataset, f, indent=2)

        print(f"Saved {len(dataset)} posts to {output_path}")
        return dataset

    except Exception as e:
        # Best-effort step: report the problem and signal failure with None
        print(f"Error preparing data: {str(e)}")
        return None

# Run the preparation step when this cell/script is executed directly
if __name__ == "__main__":
    load_and_prepare_data()

Successfully loaded CSV with 2058 rows
Saved 2058 posts to prepared_dataset.json


In [None]:
import json
from openai import OpenAI
from tqdm.notebook import tqdm

def analyze_post(post_id, post_body):
    """Classify one Reddit post with the annotation prompt and return the parsed JSON.

    The post is sent to gpt-4o-mini-2024-07-18 through the module-level
    `client`, together with the annotation guidelines (system message) and a
    strict JSON schema that fixes the fields of the reply.

    Args:
        post_id: Identifier of the post; interpolated into the user message.
        post_body: Text of the post. Anything that is not a string is rejected.

    Returns:
        The model's reply parsed into a dict, or ``None`` when the body is not
        text, the reply was cut off by the token limit, or any error occurred
        while calling the API or parsing the reply (the error is printed).
    """
    print(f"Analyzing post_id: {post_id}")

    # A missing body arrives as NaN/None; slicing it for the preview below used
    # to raise outside the try block and abort the whole run.
    if not isinstance(post_body, str):
        print(f"Error processing post {post_id}: body is not text ({type(post_body).__name__})")
        return None

    print(f"body (first 100 chars): {post_body[:100]}...")

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {
                    "role": "system",
                    "content": """You are analyzing Reddit posts about GLP1 medications and cancer discussions. Follow these detailed guidelines for each variable:

INCLUSION CRITERIA:
- Include ANY mention of cancer (personal, family, risk, research, screening, side effects)
- Include even general cancer discussions or animal studies
- Only exclude if cancer is purely metaphorical or completely absent
- Default to inclusion if uncertain
- If a post is a link, exclude it
Example YES: "There is a risk of cancer, be aware", "everything around us increases our risk for cancer, talk to a physician", "My doctor told me ozempic does not cause cancer"
Example NO: "Congrats, cancer queen", "Best wishes for getting treating for cancer"

SURVIVOR IDENTIFICATION:
- Look for past tense cancer references
- Consider treatment completion indicators
- Check for remission/recovery mentions
- Default to NO unless clear evidence
- If the post hints at a surgery that happened secondary to cancer, include it for example "thyroidectomy due to cancer"
- If the post hints at a cancer history include it for example "Cancer, Diabetes here and im trying to lose weight"
Example YES: "after my cancer treatment", "been cancer-free"
Example NO: "worried about cancer risk", "family history"

SURVIVOR MEDICATION STATUS:
- Check both survivor status AND current medication use
- Must confirm both survival and active medication use
- Consider indirect mentions of current medication use
- Default to NO if either criterion unclear
Example YES: "I'm a cancer survivor now taking Wegovy", "After beating cancer, I started Ozempic", "taking it after going through cancer treatment", "started the meds after cancer", "they put me on it after the weight gain due to my treatment for cancer"
Example NO: "cancer survivor but haven't started yet", "taking meds but no cancer history",

FAMILY CANCER HISTORY:
- Look for mentions of cancer in family members
- Consider all types of family references (parents, siblings, aunts/uncles, etc.)
- Include both current and past family cancer cases
- Default to NO if unclear
Example YES: "cancer runs in my family", "my mom had cancer", "sister's breast cancer"
Example NO: "general cancer discussion", "friend had cancer" "spouse has cancer"

CANCER TYPE CATEGORIZATION:
- Choose first specific cancer mentioned
- Map clinical terms to main categories
- Only use "Other" for clearly specified non-main types
- Use "No Type Mentioned" for general discussions
- If risk mentioned that is associated with a cancer then select the cancer type, Example: "risk of breast cancer" -> "Breast Cancer"
Example mapping examples:
- Thyroid: thyroid carcinoma, thyroid tumors, "could lead to thyroid cancer", medullary thyroid, papillary thyroid
- Breast: mammary carcinoma
- Gyn: endometrial, ovarian, cervical
- Other: Blood cancer, anal cancer

OTHER CANCER TYPE:
- for if cancer_type is other then map this to a disease site instead of the actual histology
Example: "Osteosarcoma -> bone cancer", "leukemia -> blood cancer", "colon cancer -> GI cancer"

SURVIVOR WEIGHT PATTERNS:
- Look for temporal relationship (cancer then weight change)
- If trying to lose weight after a cancer diagnosis, include it
- If they had cancer in the past and now losing weight, include it
- Consider treatment-related changes
- Default to NO if timing unclear
Example YES: "losing treatment weight", "now trying to lose all this weight after cancer treatment"
Example NO: "general weight issues", "family member's weight"

POST-MEDICATION CANCER:
- Require clear temporal sequence
- Look for specific timeframes
- Consider diagnostic language
- Default to NO if timing unclear
Example YES: "diagnosed 6 months after starting", "developed while on"
Example NO: "had cancer before", "worried about risk"

MENTIONS OF CANCER RISK:
- Include if post has any mention of cancer risk with GLP1 meds
- Look for words synonmous with risk relating to cancer or malignancy
- Include synonyms for risk (chance, possibility, likelihood, probability)
- Include monitoring for potential development
- Include of discussion is not in humans like rats
- Default to NO if unclear
Examples YES: "risk of developing cancer", "chance of malignancy"

RISK CONCERNS:
- Include both direct and indirect worry
- Consider family history context
- Look for anxiety/fear language
- Default to NO if purely informational or not seeking information or communicating risk of cancer
- If asking for cancer history being excluding then they are concerned
Example YES: "scared of cancer risk", "worried about family history"
Example NO: "sharing research", "general discussion", "Cancer from ozempic is unproven"

SEEKING INFORMATION RELATED TO CANCER RISKS:
- Include if the post mentions more information related to cancer risk
- Example Yes: "Does anyone know if the risk is significant?", "I was told the only risk is in rats"
- Example No: "sharing research", "general discussion", "Cancer from ozempic is unproven"

SENTIMENT SCORING:
-1: Fear, worry, anger, grief
 0: Facts, balanced views, mixed feelings
+1: Hope, success, gratitude, support
Example -1: "terrified of cancer risk"
Example 0: "discussing research findings"
Example +1: "grateful for successful treatment"

PHYSICIAN DISCUSSION:
- Include any discusion with healthcare provider related to cancer risk and treatment
- Consider both direct and recommended discussions
- Look for clinical assessment mentions
- Default to NO if unclear, or just mentions talking to a physician without cancer risk discussion (can be implied too)
- If the post is recommending someone speaks with their physician, include it
Example YES: "doctor discussed risks", "oncologist advised"
Example NO: "should ask doctor", "general medical info"

DISCUSSION OF DECREASED RISK:
- If a post hints that due to glp1 medication there is a DECREASED risk of cancer, include it
- If a post hints that glp1 decreases weight, weight decreases cancer risk, include it
- If a post hints that glp1 increases cancer risk, exclude it
- If no clear logical link between Decrease of cancer risk and the glp1 medication or weight loss, exclude it
- If mentions decreased recurrence rate, include it


When uncertain between options, use these defaults:
- Inclusion: Include if any cancer mention
- Survivor Status: No if unclear
- Survivor Medication Status: No if unclear
- Family Cancer History: No if unclear
- Family History: No if unclear
- Cancer Type: No Type Mentioned if unclear
- Weight Changes: No if timing unclear
- Post-Med Cancer: No if sequence unclear
- Risk Concerns: No if not explicit
- Sentiment: 0 if mixed/unclear
- Physician Discussion: No if not explicit

# EXAMPLE AND EDGE CASES TO GUIDE YOUR EVALUATION

        Example Post 1: "I read somewhere that Ozempic definitely causes thyroid cancer in everyone who takes it. Don't let big pharma fool you."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "Yes",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: Makes absolute claims without evidence about cancer causation. Valid cancer mention but spreads misinformation. Not expressing personal concern.

        Example Post 2: "My doctors told me the thyroid cancer risk was only seen in rats and based on careful discussion of my personal medical history, we decided the benefits outweighed the potential risks."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "No",
            "discussed_risk_with_physician": "Yes"
        }
        Reasoning: Personal medical discussion with evidence of physician consultation about cancer risks. Shows consideration of personal risk without making general claims.

        Example Post 3: "Started Mounjaro and lost 50 pounds! I feel like I'm beating cancer before it starts."
        {
            "inclusion": "No",
            "exclusion_reason": "Cancer used metaphorically",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "No"
        }
        Reasoning: Cancer mention is purely metaphorical, not discussing actual cancer risk or experience. Personal experience only.

        Example Post 4: "Just sharing research findings: our hospital database shows zero cancer cases from GLP1s, proving they're completely safe."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "Yes",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: Makes broad safety claims and generalizations beyond the data. Not expressing personal concern but making definitive claims about safety.

        Example Post 5: "My aunt got cancer six months after starting Ozempic. Anyone else experience this? I'm terrified to start."
        {
            "inclusion": "Yes",
            "exclusion_reason": "",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "family_cancer_history": "Yes",
            "cancer_diagnosis_after_medication": "Yes"
        }
        Reasoning: Clear temporal relationship between medication and cancer diagnosis in family member. Shows personal concern and seeking information.

        Example Post 6: "Looking at cancer rates in the general population vs GLP1 users according to latest studies [link]"
        {
            "inclusion": "Yes",
            "exclusion_reason": "General cancer discussion without specific connection to medication effects",
            "can_assess_misinformation": "No",
            "concerned_about_cancer_risk": "No",
            "seeking_cancer_risk_data": "No"
        }
        Reasoning: General discussion of statistical information without personal context or claims. Sharing research without assessment.

        Example Post 7: "I filled my prescription but I'm kind of freaked out about the pancreatitis/gall bladder, and thyroid cancer warnings. Are these things I would need to worry about for the rest of my life?!"
        {
            "cancer_type": "Thyroid Cancer",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "sentiment_score": -1
        }
        Reasoning: Explicitly mentions thyroid cancer warning and shows anxiety about long-term risks. Asking for information about duration of risk.

        Example Post 8: "Man you look awesome. Congrats to losing all that weight and beating cancer"
        {
            "is_survivor": "Yes",
            "cancer_type": "No Type Mentioned",
            "sentiment_score": 1,
            "is_survivor_weight_loss": "Yes"
        }
        Reasoning: Clear reference to having beaten cancer (past tense), with positive sentiment about weight loss post-cancer.

        Example Post 9: "I've had consistent diarrhea for years after having colon cancer and a couple of bowel surgeries... Anyway, the problem with Ozempic diarrhea, for me, is the surprise factor."
        {
            "is_survivor": "Yes",
            "is_survivor_and_taking_med": "Yes",
            "cancer_type": "Other",
            "other_cancer_type": "GI cancer",
            "sentiment_score": 0
        }
        Reasoning: Clearly indicates past colon cancer and current Ozempic use, providing detailed personal experience.

        Example Post 10: "Breast cancer risk is common, unfortunately. But even here, being obese increases your risk more than the pill... I try not to stress about the ones I have no control over and focus on what I realistically can impact."
        {
            "cancer_type": "Breast Cancer",
            "is_survivor_weight_loss": "Yes",
            "concerned_about_cancer_risk": "No",
            "can_assess_misinformation": "No",
            "sentiment_score": 0
        }
        Reasoning: Educational discussion about risk factors, not expressing personal concern but sharing factual information about risk management.

        Example Post 11: "And for those people who told you they got cancer 'shortly after' starting Ozempic, what does 'shortly after' means?"
        {
            "cancer_diagnosis_after_medication": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "can_assess_misinformation": "No",
            "sentiment_score": 0
        }
        Reasoning: Discussing temporal relationship between medication and cancer diagnosis, seeking clarification about timing.

        Example Post 12: "That's wonderful, my hystersister! I had a full hysterectomy with bilateral saplingo-oopherectomy for my 1A endometrial cancer... your weight loss will actually help lower your estrogen-related cancer risk!"
        {
            "is_survivor": "Yes",
            "cancer_type": "Gyn Cancer",
            "discussion_GLP1_decreasing_cancer_risk": "Yes",
            "sentiment_score": 1,
            "concerned_about_cancer_risk": "No"
        }
        Reasoning: Past gynecological cancer mentioned, discussing how weight loss reduces cancer risk, positive sentiment.

        Example Post 13: "So a father having papillary cancer wouldn't prohibit me from being on a GPL1?"
        {
            "family_cancer_history": "Yes",
            "cancer_type": "Thyroid Cancer",
            "concerned_about_cancer_risk": "Yes",
            "seeking_cancer_risk_data": "Yes",
            "sentiment_score": -1
        }
        Reasoning: Mentions family history of papillary thyroid cancer, seeking information about medication safety in this context, showing concern.
        Example Post 14: "GLP1s definitely cause cancer in everyone who takes them, just like they did in the rat studies. This is proven fact."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Makes absolute claims about causation without evidence, extends animal studies to humans inappropriately.

        Example Post 15: "When I took Ozempic, I didn't get cancer. My doctor said the risk was low for me."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Personal experience only, not making generalizable claims. Referencing medical advice.

        Example Post 16: "According to recent studies, thyroid cancer risk from GLP1s is completely nonexistent in humans since we lack the specific receptors found in rats."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Makes definitive claims about biological mechanisms and risk, going beyond available evidence.

        Example Post 17: "Got diagnosed with pancreatic cancer 3 months after starting. Doctor isn't sure if there's any connection."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Pancreatic Cancer"
        }
        Reasoning: Sharing personal experience without making broader claims about causation.

        Example Post 18: "The scientific consensus is clear - these drugs have zero cancer risk and anyone who says otherwise is spreading fear."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "No Type Mentioned"
        }
        Reasoning: Makes sweeping claims about scientific consensus and risk, dismissing valid concerns.

        Example Post 19: "My endocrinologist monitors my thyroid closely because of the medication's warning label, but she said medullary thyroid cancer risk is only in rats."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Reporting medical professional's guidance without making generalizable claims.

        Example Post 20: "These medications actually prevent cancer by reducing obesity. The thyroid warnings are just legal protection."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Thyroid Cancer"
        }
        Reasoning: Makes unsupported claims about cancer prevention and dismisses established warnings.

        Example Post 21: "I read the research papers on breast cancer risk. Still discussing with my doctor whether it applies to my situation."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Breast Cancer"
        }
        Reasoning: Discussing personal risk assessment without making broader claims.

        Example Post 22: "Real talk - colon cancer runs in my family, but my doctor says obesity is a bigger risk factor than GLP1s."
        {
            "can_assess_misinformation": "No",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Other",
            "other_cancer_type": "GI cancer"
        }
        Reasoning: Sharing medical guidance and personal context without generalizing.

        Example Post 23: "All studies prove that endometrial cancer risk goes down with weight loss from these meds. It's 100% safe for everyone."
        {
            "can_assess_misinformation": "Yes",
            "mentions_cancer_risk": "Yes",
            "cancer_type": "Gyn Cancer"
        }
        Reasoning: Overgeneralizes research findings and makes absolute safety claims.
        """


                },
                {
                    "role": "user",
                    "content": f"Analyze this post:\n\nPost ID: {post_id}\n\nPost Body: {post_body}"
                }
            ],
            temperature=0.0,
            max_tokens=256,
            top_p=0.1,
            frequency_penalty=0,
            presence_penalty=0,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "cancer_post_analysis_limited",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "post_id": {"type": "string"},
                            "inclusion": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Include if ANY cancer mention (personal/family/risk/research/screening/side effects). Only exclude if cancer is metaphorical or absent."
                            },
                            "exclusion_reason": {
                                "type": "string",
                                "description": "If excluded, specify: 'No cancer mention', 'Cancer used metaphorically', or detailed reason"
                            },
                            "is_survivor": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: past cancer treatment, remission, recovery, post-cancer experience. No if: current diagnosis, risk discussion, family history"
                            },
                            "is_survivor_and_taking_med": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: confirmed cancer survivor AND currently taking GLP1 medication. Must have evidence of both survival and current medication use. No if: missing either criterion"
                                },
                            "family_cancer_history": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions ANY family member (immediate or extended) having cancer currently or in past. No if: no family cancer mentions"
                            },
                            "cancer_type": {
                                "type": "string",
                                # NOTE(review): the enum spells "Gyn cancer" while the worked
                                # examples in the system prompt use "Gyn Cancer". The schema is
                                # strict, so replies should carry the enum spelling; anything
                                # comparing against labels must match "Gyn cancer" - confirm
                                # which casing the human annotations use.
                                "enum": ["Thyroid Cancer", "Breast Cancer", "Pancreatic Cancer", "Gyn cancer", "Other", "No Type Mentioned"],
                                "description": "Select first specific cancer type mentioned. Use 'Other' only for clearly specified non-main types"
                            },
                            "other_cancer_type": {
                                "type": "string",
                                "description": "If 'Other' selected above, specify the exact cancer type mentioned"
                            },
                            "is_survivor_weight_loss": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: weight changes after cancer treatment/survival mentioned. No if: general weight discussion or unclear timing"
                            },
                            "cancer_diagnosis_after_medication": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: clear temporal sequence shows cancer diagnosis after starting GLP1. No if: unclear timing or just risk discussion"
                            },
                             "mentions_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: any mention of cancer risk or increased risk, includes family history concerns. Synonyms of cancer risk"
                             },
                            "concerned_about_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: expresses worry/fear about cancer risk, includes family history concerns. No if: purely informational"
                            },
                            "seeking_cancer_risk_data": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: actively asking for studies/research/experiences about cancer risk. No if: sharing info or general discussion"
                            },
                            "can_assess_misinformation": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: makes generalizable claims about medication/cancer. No if: personal experience only or questions"
                            },
                            "sentiment_score": {
                                "type": "integer",
                                "enum": [-1, 0, 1],
                                "description": "-1: fear/worry/grief, 0: factual/balanced, 1: hope/success/gratitude"
                            },
                            "discussed_risk_with_physician": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: mentions discussing cancer risk with ANY healthcare provider or recommending such discussion. No if: no provider discussion mentioned"
                            },
                            "discussion_GLP1_decreasing_cancer_risk": {
                                "type": "string",
                                "enum": ["Yes", "No"],
                                "description": "Yes if: explicitly mentions GLP1/weight loss reducing cancer risk. No if: no mention of risk reduction"
                            }
                        },
                        "required": [
                            "post_id",
                            "inclusion",
                            "exclusion_reason",
                            "is_survivor",
                            "is_survivor_and_taking_med",
                            "family_cancer_history",
                            "cancer_type",
                            "other_cancer_type",
                            "is_survivor_weight_loss",
                            "cancer_diagnosis_after_medication",
                            "mentions_cancer_risk",
                            "concerned_about_cancer_risk",
                            "seeking_cancer_risk_data",
                            "can_assess_misinformation",
                            "sentiment_score",
                            "discussed_risk_with_physician",
                            "discussion_GLP1_decreasing_cancer_risk"
                        ],
                        "additionalProperties": False
                    }
                }
            }
        )

        choice = response.choices[0]
        print(f"Raw API response: {choice.message.content}")

        # max_tokens caps the reply; a "length" finish means the JSON was cut
        # off, so report that directly instead of a confusing parse error.
        if choice.finish_reason == "length":
            print(f"Error processing post {post_id}: response truncated at max_tokens")
            return None

        return json.loads(choice.message.content)

    except Exception as e:
        # Best-effort per post: report and let the caller carry on with the rest
        print(f"Error processing post {post_id}: {str(e)}")
        return None

def process_posts(batch_size=10, input_path='prepared_dataset.json',
                  output_path='analysis_results.json', analyzer=None):
    """Analyze every prepared post in batches and checkpoint the results to JSON.

    Args:
        batch_size: Number of posts handled between two saves of the results
            file; must be at least 1.
        input_path: JSON file holding a list of ``{'post_id', 'body'}`` records.
        output_path: JSON file rewritten with all results after each batch.
        analyzer: Callable ``(post_id, body) -> dict | None`` used to analyze a
            post. Defaults to the module-level ``analyze_post``.

    Returns:
        The list of analysis dicts, each extended with an ``'original_post'``
        key. Posts whose analysis came back empty/None are left out.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() rejects a zero step and silently yields nothing for a
        # negative one; fail loudly in both cases.
        raise ValueError("batch_size must be a positive integer")
    if analyzer is None:
        analyzer = analyze_post

    # Load the JSON data
    with open(input_path, 'r') as file:
        data = json.load(file)

    results = []
    failed_ids = []
    total_batches = (len(data) + batch_size - 1) // batch_size

    for batch_number, start in enumerate(range(0, len(data), batch_size), start=1):
        batch = data[start:start + batch_size]
        print(f"\nProcessing batch {batch_number} of {total_batches}")

        for post in batch:
            analysis = analyzer(post['post_id'], post['body'])
            if analysis:
                analysis['original_post'] = post['body']
                results.append(analysis)
            else:
                # Keep track of what was dropped so it can be retried later
                failed_ids.append(post['post_id'])

        # Save intermediate results after each batch
        with open(output_path, 'w') as file:
            json.dump(results, file, indent=2)

    if not data:
        # No batch ran, so nothing was written yet; still leave a valid file
        with open(output_path, 'w') as file:
            json.dump(results, file, indent=2)

    print(f"\nAnalysis complete. Processed {len(results)} out of {len(data)} posts")
    if failed_ids:
        print(f"Posts without an analysis: {failed_ids}")
    print(f"Results saved to {output_path}")
    return results

# Kick off the full analysis when this cell/script is executed directly
if __name__ == "__main__":
    process_posts()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
body (first 100 chars): Honestly, as someone with massive amounts of experience here (chronic severe illness, terminal since...
Raw API response: {"post_id":"615","inclusion":"Yes","exclusion_reason":"","is_survivor":"No","is_survivor_and_taking_med":"No","family_cancer_history":"No","cancer_type":"Thyroid Cancer","other_cancer_type":"","is_survivor_weight_loss":"No","cancer_diagnosis_after_medication":"No","mentions_cancer_risk":"Yes","concerned_about_cancer_risk":"No","seeking_cancer_risk_data":"No","can_assess_misinformation":"No","sentiment_score":0,"discussed_risk_with_physician":"Yes","discussion_GLP1_decreasing_cancer_risk":"No"}
Analyzing post_id: 616
body (first 100 chars): Thanks for sharing this. I’m an alcoholic too—and relieved that Ozempic/Wegovy has made me more in c...
Raw API response: {"post_id":"616","inclusion":"No","exclusion_reason":"No cancer mention","is_survivor":"No","is_survivor_and_taking_med":

In [None]:
import json
import pandas as pd
import numpy as np

# Result files of the five repeated runs: run_1_results.json ... run_5_results.json
run_files = [f'run_{run_number}_results.json' for run_number in range(1, 6)]

# Fields to evaluate (excluding 'post_id' and 'original_post')
fields = [
    "inclusion", "exclusion_reason",
    "is_survivor", "is_survivor_and_taking_med",
    "family_cancer_history",
    "cancer_type", "other_cancer_type",
    "is_survivor_weight_loss",
    "cancer_diagnosis_after_medication",
    "mentions_cancer_risk", "concerned_about_cancer_risk",
    "seeking_cancer_risk_data",
    "can_assess_misinformation",
    "sentiment_score",
    "discussed_risk_with_physician",
    "discussion_GLP1_decreasing_cancer_risk",
]

# Build one frame per run: indexed by post_id, columns tagged with the run number
data_frames = []
for idx, file in enumerate(run_files):
    with open(file, 'r') as f:
        data = json.load(f)
    df = (
        pd.DataFrame(data)[['post_id'] + fields]
        .set_index('post_id')
        .add_suffix(f"_run{idx+1}")
    )
    data_frames.append(df)

# Line the runs up side by side on 'post_id'
merged_df = pd.concat(data_frames, axis=1)

# Function to compute full agreement rates
def compute_full_agreement_rates(df, fields, num_runs):
    """Compute, per field, the share of posts on which every run gave the same value.

    Args:
        df: Frame with one row per post and columns named ``{field}_run{k}``
            for ``k`` in ``1..num_runs``. The frame is not modified.
        fields: Field names to evaluate.
        num_runs: Number of runs present in ``df``.

    Returns:
        Dict mapping each field to its full-agreement rate in percent.
    """
    full_agreement_results = {}
    for field in fields:
        # Get columns for the field across all runs
        cols = [f"{field}_run{run}" for run in range(1, num_runs + 1)]
        per_run = df[cols]
        # nunique() skips missing values, so a post that is absent from one run
        # would otherwise look like a unanimous answer; require a value from
        # every run before counting the row as agreement.
        has_all_runs = per_run.notna().all(axis=1)
        all_agree = has_all_runs & (per_run.nunique(axis=1) == 1)
        full_agreement_results[field] = all_agree.mean() * 100  # Convert to percentage
    return full_agreement_results

# Per-field agreement across all loaded runs
full_agreement_rates = compute_full_agreement_rates(merged_df, fields, len(run_files))

# Report one line per field
print("Full Agreement Rates (%):\n")
for field in full_agreement_rates:
    rate = full_agreement_rates[field]
    print(f"Field: {field}, Full Agreement Rate: {rate:.2f}%")

# Compute overall full agreement rate across all variables
# This calculates the percentage of instances where all runs agree on all fields simultaneously
def compute_overall_full_agreement(df, fields, num_runs):
    """Return the percentage of rows on which all runs agree on every field at once.

    Each run is compared with run 1, field by field. Two missing values in the
    same position are treated as agreement (a plain ``==`` would call them
    different because NaN never equals NaN).

    Args:
        df: DataFrame with one column per (field, run), named ``{field}_run{n}``.
            It is only read; no helper column is left behind on it.
        fields: Iterable of field names to compare.
        num_runs: Number of runs; columns ``_run1`` .. ``_run{num_runs}`` are read.

    Returns:
        The overall full agreement rate in percent. With fewer than two runs
        there is nothing to compare and every row counts as agreeing.

    Raises:
        KeyError: if an expected ``{field}_run{n}`` column is not in ``df``.
    """
    fields = list(fields)

    def run_frame(run):
        # All fields of a single run, relabelled by bare field name so frames
        # from different runs line up column-for-column.
        frame = df[[f"{field}_run{run}" for field in fields]].copy()
        frame.columns = fields
        return frame

    all_agree = pd.Series(True, index=df.index)
    if num_runs > 1:
        reference = run_frame(1)
        reference_missing = reference.isna()
        for run in range(2, num_runs + 1):
            other = run_frame(run)
            # Equal values, or missing in both runs
            same = reference.eq(other) | (reference_missing & other.isna())
            all_agree &= same.all(axis=1)

    overall_full_agreement_rate = all_agree.mean() * 100
    return overall_full_agreement_rate

# Share of posts on which every run agrees on every field simultaneously
overall_full_agreement_rate = compute_overall_full_agreement(
    merged_df,
    fields,
    num_runs=len(run_files),
)

print(f"\nOverall Full Agreement Rate across all variables: {overall_full_agreement_rate:.2f}%")


Full Agreement Rates (%):

Field: inclusion, Full Agreement Rate: 100.00%
Field: exclusion_reason, Full Agreement Rate: 99.00%
Field: is_survivor, Full Agreement Rate: 100.00%
Field: is_survivor_and_taking_med, Full Agreement Rate: 100.00%
Field: family_cancer_history, Full Agreement Rate: 100.00%
Field: cancer_type, Full Agreement Rate: 99.00%
Field: other_cancer_type, Full Agreement Rate: 99.00%
Field: is_survivor_weight_loss, Full Agreement Rate: 100.00%
Field: cancer_diagnosis_after_medication, Full Agreement Rate: 100.00%
Field: mentions_cancer_risk, Full Agreement Rate: 98.00%
Field: concerned_about_cancer_risk, Full Agreement Rate: 97.00%
Field: seeking_cancer_risk_data, Full Agreement Rate: 100.00%
Field: can_assess_misinformation, Full Agreement Rate: 94.00%
Field: sentiment_score, Full Agreement Rate: 100.00%
Field: discussed_risk_with_physician, Full Agreement Rate: 100.00%
Field: discussion_GLP1_decreasing_cancer_risk, Full Agreement Rate: 98.00%

Overall Full Agreement Rat

In [None]:
import json
import pandas as pd
from itertools import combinations
from statsmodels.stats.inter_rater import fleiss_kappa
import numpy as np

# Result files of the five runs: run_1_results.json .. run_5_results.json
run_files = [f'run_{run_number}_results.json' for run_number in range(1, 6)]

# Fields to evaluate (excluding 'post_id' and 'original_post')
fields = [
    # inclusion decision
    "inclusion", "exclusion_reason",
    # survivorship and history
    "is_survivor", "is_survivor_and_taking_med", "family_cancer_history",
    # cancer type
    "cancer_type", "other_cancer_type",
    # medication-related outcomes
    "is_survivor_weight_loss", "cancer_diagnosis_after_medication",
    # cancer-risk discussion
    "mentions_cancer_risk", "concerned_about_cancer_risk", "seeking_cancer_risk_data",
    # remaining variables
    "can_assess_misinformation", "sentiment_score",
    "discussed_risk_with_physician", "discussion_GLP1_decreasing_cancer_risk",
]

# Build one DataFrame per run: indexed by post_id, columns suffixed with the run number
data_frames = []
for run_number, run_path in enumerate(run_files, start=1):
    with open(run_path, 'r') as handle:
        records = json.load(handle)
    # Keep only the evaluated fields, keyed by post_id
    run_df = pd.DataFrame(records)[['post_id'] + fields].set_index('post_id')
    # Tag every column with its run so the runs stay distinguishable once combined
    run_df.columns = [f"{name}_run{run_number}" for name in run_df.columns]
    data_frames.append(run_df)

# Place all runs side by side, aligned on the post_id index
merged_df = pd.concat(data_frames, axis=1)

# Function to compute pairwise match rates
def compute_pairwise_match_rates(df, fields, runs):
    """Return, per field, the match rate between every pair of runs.

    Two runs match on a row when their values are equal or when both are
    missing. (Element-wise ``==`` alone reports missing-vs-missing as a
    mismatch, which would lower the rate on rows where the runs actually agree.)

    Args:
        df: DataFrame with one column per (field, run), named ``{field}_run{n}``.
        fields: Iterable of field names to evaluate.
        runs: Sized collection with one entry per run; only its length is used,
            runs being numbered 1 .. ``len(runs)``.

    Returns:
        A tuple ``(pairwise_results, run_pairs)``. ``run_pairs`` is the list of
        ``(run_a, run_b)`` number pairs with ``run_a < run_b``;
        ``pairwise_results`` maps each field to a list of match rates in
        percent, ordered like ``run_pairs``.

    Raises:
        KeyError: if an expected ``{field}_run{n}`` column is not in ``df``.
    """
    run_pairs = list(combinations(range(1, len(runs) + 1), 2))
    pairwise_results = {}

    for field in fields:
        match_rates = []
        for run_a, run_b in run_pairs:
            values_a = df[f"{field}_run{run_a}"]
            values_b = df[f"{field}_run{run_b}"]
            # Equal values, or missing in both runs
            matches = (values_a == values_b) | (values_a.isna() & values_b.isna())
            match_rates.append(matches.mean() * 100)  # Convert to percentage
        pairwise_results[field] = match_rates

    return pairwise_results, run_pairs

# Function to compute Fleiss' Kappa
def compute_fleiss_kappa(df, fields, num_runs):
    """Compute Fleiss' kappa per field, treating each run as one rater.

    For every field a (rows x categories) table is built whose entry [i, j] is
    the number of runs that assigned category j to row i; that table is passed
    to ``fleiss_kappa``.

    Category codes are derived from the values of all runs together. Coding
    each run's column separately would number labels by order of first
    appearance within that run, so the same label could receive different codes
    in different runs and the counts would no longer reflect agreement.
    Missing values form one extra category instead of the -1 sentinel, which
    ``np.bincount``-style counting cannot handle.

    Args:
        df: DataFrame with one column per (field, run), named ``{field}_run{n}``.
        fields: Iterable of field names to evaluate.
        num_runs: Number of runs; columns ``_run1`` .. ``_run{num_runs}`` are read.

    Returns:
        dict mapping each field name to the value returned by ``fleiss_kappa``.
        NOTE(review): when every rating of a field falls into a single category
        the statistic is 0/0, so the result is expected to be nan - confirm
        against the statsmodels version in use.

    Raises:
        KeyError: if an expected ``{field}_run{n}`` column is not in ``df``.
    """
    kappa_results = {}
    for field in fields:
        run_cols = [f"{field}_run{run}" for run in range(1, num_runs + 1)]
        ratings = df[run_cols].to_numpy()

        # One shared coding for all runs; pd.factorize marks missing values with -1
        codes, uniques = pd.factorize(ratings.ravel())
        codes = codes.reshape(ratings.shape)
        n_categories = len(uniques)
        if (codes == -1).any():
            # Give missing values a category of their own, after the real ones
            codes = np.where(codes == -1, n_categories, codes)
            n_categories += 1

        # category_counts[i, j] = number of runs that put row i in category j
        category_counts = np.zeros((ratings.shape[0], n_categories), dtype=int)
        for code in range(n_categories):
            category_counts[:, code] = (codes == code).sum(axis=1)

        # Calculate Fleiss' Kappa
        kappa_results[field] = fleiss_kappa(category_counts)
    return kappa_results

# Agreement statistics across all runs: pairwise match rates first, then Fleiss' Kappa
pairwise_match_rates, run_pairs = compute_pairwise_match_rates(merged_df, fields, run_files)
kappa_results = compute_fleiss_kappa(merged_df, fields, num_runs=len(run_files))

# Per-field report: one line per run pair, followed by the field's average
print("Pairwise Match Rates (%):\n")
for field in fields:
    field_rates = pairwise_match_rates[field]
    print(f"Field: {field}")
    # Rates are stored in the same order as run_pairs
    for (run_a, run_b), match_rate in zip(run_pairs, field_rates):
        print(f"  Run {run_a} vs Run {run_b}: {match_rate:.2f}%")
    average_match_rate = np.mean(field_rates)
    print(f"  Average Pairwise Match Rate: {average_match_rate:.2f}%\n")

# Chance-corrected agreement per field
print("Fleiss' Kappa Results:\n")
for field, kappa in kappa_results.items():
    print(f"Field: {field}, Fleiss' Kappa: {kappa:.4f}")



Pairwise Match Rates (%):

Field: inclusion
  Run 1 vs Run 2: 100.00%
  Run 1 vs Run 3: 100.00%
  Run 1 vs Run 4: 100.00%
  Run 1 vs Run 5: 100.00%
  Run 2 vs Run 3: 100.00%
  Run 2 vs Run 4: 100.00%
  Run 2 vs Run 5: 100.00%
  Run 3 vs Run 4: 100.00%
  Run 3 vs Run 5: 100.00%
  Run 4 vs Run 5: 100.00%
  Average Pairwise Match Rate: 100.00%

Field: exclusion_reason
  Run 1 vs Run 2: 100.00%
  Run 1 vs Run 3: 99.00%
  Run 1 vs Run 4: 100.00%
  Run 1 vs Run 5: 99.00%
  Run 2 vs Run 3: 99.00%
  Run 2 vs Run 4: 100.00%
  Run 2 vs Run 5: 99.00%
  Run 3 vs Run 4: 99.00%
  Run 3 vs Run 5: 100.00%
  Run 4 vs Run 5: 99.00%
  Average Pairwise Match Rate: 99.40%

Field: is_survivor
  Run 1 vs Run 2: 100.00%
  Run 1 vs Run 3: 100.00%
  Run 1 vs Run 4: 100.00%
  Run 1 vs Run 5: 100.00%
  Run 2 vs Run 3: 100.00%
  Run 2 vs Run 4: 100.00%
  Run 2 vs Run 5: 100.00%
  Run 3 vs Run 4: 100.00%
  Run 3 vs Run 5: 100.00%
  Run 4 vs Run 5: 100.00%
  Average Pairwise Match Rate: 100.00%

Field: is_survivor_a