In [None]:
import os
import pandas as pd

# Configure the Groq client
from groq import Groq

# Read the Groq API key from the environment instead of embedding it in the
# notebook: a key stored in source is exposed to everyone who can read the file
# or its history. Any key that was previously pasted here should be revoked.
# Set it before running, e.g. `os.environ["GROQ_API_KEY"] = ...` from a secrets
# store, or export GROQ_API_KEY in the shell that starts the kernel.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise ValueError("Please set your Groq API key in the GROQ_API_KEY environment variable.")

client = Groq(api_key=api_key)

# Input CSV of trials and the CSV file name used for the output
input_file_path = "/content/trials - Copy.csv"
output_file_path = "refined.csv"

# Read the table, discard rows in which every cell is empty, and renumber the
# remaining rows from 0
df = (
    pd.read_csv(input_file_path, low_memory=False)
    .dropna(how="all")
    .reset_index(drop=True)
)

# Columns whose text is combined by merge_columns
columns_to_clean = [
    "Study Title",
    "Primary Outcome Measures",
    "Secondary Outcome Measures",
    "criteria",
]

# Merge all columns into a single column for processing
def merge_columns(row):
    """Combine the non-null ``columns_to_clean`` values of ``row`` into one string.

    Columns that are absent from ``row`` or hold a null value are skipped; the
    remaining values are converted with ``str`` and joined with ``" \\n"``.
    Returns an empty string when nothing is left.
    """
    pieces = []
    for name in columns_to_clean:
        if name not in row:
            continue
        value = row[name]
        if pd.isnull(value):
            continue
        pieces.append(str(value))
    return " \n".join(pieces)

# Apply merge_columns to every row (axis=1) and store the combined text in a
# new "Merged_Content" column.
df["Merged_Content"] = df.apply(merge_columns, axis=1)

# Function to clean and extract relationships using Groq
# Relationship keywords, in the order of the slots of the list returned by
# extract_relationships.
_RELATIONSHIP_KEYS = (
    "involves",
    "evaluates",
    "measures_primary",
    "measures_secondary",
    "has_criteria",
)


def _parse_relationship_lines(response):
    """Pick the object text for each relationship keyword out of a model reply."""
    extracted = [None] * len(_RELATIONSHIP_KEYS)
    for line in (response or "").splitlines():
        # A line is attributed to the keyword that occurs EARLIEST in it: the
        # relationship label precedes the object, so an object such as
        # "Study involves adults" on a has_criteria line must not be filed
        # under "involves".
        hits = [
            (line.find(key), slot)
            for slot, key in enumerate(_RELATIONSHIP_KEYS)
            if key in line
        ]
        if not hits:
            continue
        _, slot = min(hits)
        # The object is whatever follows the second comma (commas inside the
        # object are kept); drop the quotes the requested format wraps it in.
        obj = line.split(",", 2)[-1].strip().strip("'\"").strip()
        if obj:
            extracted[slot] = obj
    return extracted


def extract_relationships(content, row_index):
    """Ask the Groq model for the five relationships described by ``content``.

    Args:
        content: Merged text of one trial row. Null or blank content is not
            sent to the API.
        row_index: Identifier of the row, used only in the progress message.

    Returns:
        A list of five items ordered as involves, evaluates, measures_primary,
        measures_secondary, has_criteria. Each item is the extracted object
        string, or None when the reply contained no usable line for it.

    Errors raised by the API call are not caught here and propagate to the
    caller.
    """
    # Blank text carries nothing to extract, so skip the API call for it as well
    # as for null values.
    if pd.isnull(content) or not str(content).strip():
        return [None, None, None, None, None]
    content = str(content)

    # Truncate content if it exceeds a certain limit to avoid API errors
    max_length = 2000  # Adjust the limit as needed
    if len(content) > max_length:
        content = content[:max_length] + "..."

    prompt = (
        "The following text is a merged representation of a clinical trials dataset row. "
        "Extract only valid and concise relationships from it in the exact format below:\n"
        "'Subject', 'Relationship', 'Object' (one per line):\n\n"
        "'involves (Disease)': 'The name of the primary disease or condition being studied "
        '(e.g., "Diabetes", "Hypertension").\'\n\n'
        "'evaluates (Drug)': 'The name of the drug, therapy, or intervention being evaluated. "
        '(e.g., "Aspirin", "Gene Therapy"). \'\n\n'
        "'measures_primary (Primary Outcome)': 'The main outcome or result the trial is designed "
        'to measure, in up to 5 words (e.g., "Reduction in blood pressure").\'\n\n'
        "'measures_secondary (Secondary Outcome)': 'Additional outcomes or results measured in the trial, "
        'in up to 5 words (e.g., "Improved insulin sensitivity").\'\n\n'
        "'has_criteria (Criteria)': 'Key eligibility criteria for participants in up to 5 words "
        '(e.g., "Ages 18-65, no diabetes").\'\n\n'
        "Rules:\n\n"
        "- 'Assume you are a biomedical expert with a deep understanding of clinical trial terminology.'\n"
        # "- 'If the information for an object cannot be found or is ambiguous, write \"None\" instead of guessing.'\n"
        "- 'Objects must be relevant, concise, and meaningful, with a maximum of 5 words. Avoid long phrases, generic terms, or irrelevant outputs.'\n\n"
        "'Content to analyze\n'"
        f"\n{content}"
    )

    # Call the Groq API to get a response
    completion = client.chat.completions.create(
        model="gemma2-9b-it",  # Use the appropriate model
        messages=[{"role": "user", "content": prompt}],
        temperature=1,
        max_tokens=1024,
        top_p=1,
        stream=False,
        stop=None,
    )

    # The reply text lives on the first choice's message
    response = completion.choices[0].message.content
    extracted = _parse_relationship_lines(response)

    print(f"Row {row_index} processed.")  # Log row completion
    return extracted

import csv  # imported here so this step does not depend on another cell

# Relationship label for each slot of the list returned by extract_relationships
_RELATIONSHIP_LABELS = [
    "involves",
    "evaluates",
    "measures_primary",
    "measures_secondary",
    "has_criteria",
]

# Create (or truncate) the output file and write each trial's rows as soon as
# it is processed. csv.writer quotes any field that contains a comma, quote or
# newline; writing the fields with a plain f-string let such objects spill into
# extra columns and corrupt the file.
with open(output_file_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["Subject", "Relationship", "Object"])

    for index, row in df.iterrows():
        extracted = extract_relationships(row["Merged_Content"], index)
        # Use the trial's NCT Number as the subject; fall back to the row label
        subject = row.get("NCT Number", f"Row {index}")

        # One output row per relationship. str() keeps a missing object as the
        # text "None", exactly as the rows were written before.
        for label, obj in zip(_RELATIONSHIP_LABELS, extracted):
            writer.writerow([subject, label, str(obj)])

        # Push the rows to disk now so finished trials survive an interruption
        f.flush()
        print(f"Row {index} written to file.")

print(f"Data extraction complete. Updated data saved to: {output_file_path}")


In [None]:
## OM God's Code

import os
import pandas as pd
from groq import Groq

# Configure Groq client
# The key is read from the environment rather than written into the notebook:
# a key stored in source is exposed to everyone who can read the file or its
# history. Any key that was previously pasted here should be revoked.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise ValueError("Please set your Groq API key in the GROQ_API_KEY environment variable.")
client = Groq(api_key=api_key)

# File paths
input_file_path = "/content/data_300.csv"
output_file_path = "refined4.csv"

# Load the data and drop rows in which every cell is empty
df = pd.read_csv(input_file_path, low_memory=False).dropna(how="all")

# Merge relevant columns
columns_to_merge = ["Study Title", "Primary Outcome Measures",
                   "Secondary Outcome Measures", "criteria"]
# Missing cells are dropped before joining. Converting the raw values with
# astype(str) turned every NaN into the literal text "nan", which was then sent
# to the model and also kept the merged text from ever being empty.
df["Merged_Content"] = df[columns_to_merge].apply(
    lambda row: " \n".join(row.dropna().astype(str)), axis=1
)

# Prompt sent to the model; "{content}" is filled in with str.format.
# Written as one list entry per prompt line and joined with newlines.
PROMPT_TEMPLATE = "\n".join([
    "You are a clinical trial data expert. Extract relationships STRICTLY in this format:",
    "RELATIONSHIP[TAB]OBJECT",
    "",
    "Relationships to extract:",
    "- involves: Disease/condition name",
    "- evaluates: Drug/intervention name",
    "- measures_primary: Primary outcome (≤5 words)",
    "- measures_secondary: Secondary outcome (≤5 words)",
    "- has_criteria: Eligibility criteria (≤5 words)",
    "",
    "Rules:",
    "1. OBJECT must be ONLY the extracted value - no labels, quotes, or prefixes",
    "2. Use exact medical terminology from the text",
    "3. Skip relationships if information is missing",
    "4. Use TAB separator between relationship and object",
    "",
    "Example output:",
    "involves\tAlzheimer's Disease",
    "evaluates\tIntravenous Sabirnetug",
    "",
    "Process this clinical trial data:",
    "{content}",
])

# Relationship names accepted from the model reply
_VALID_RELATIONSHIPS = frozenset({
    "involves",
    "evaluates",
    "measures_primary",
    "measures_secondary",
    "has_criteria",
})


def _parse_tab_separated(response):
    """Return the validated (relationship, object) pairs found in a model reply."""
    pairs = []
    for line in (response or "").splitlines():
        # The prompt spells the separator as "[TAB]"; accept a reply that
        # echoes that token literally instead of emitting a tab character.
        line = line.replace("[TAB]", "\t")
        if "\t" not in line:
            continue
        rel, obj = line.split("\t", 1)
        rel = rel.strip().lower()
        obj = obj.strip(" '\"")  # Clean surrounding spaces and quotes

        # Keep only known relationships that have a non-empty object
        if rel in _VALID_RELATIONSHIPS and obj:
            pairs.append((rel, obj))
    return pairs


def extract_relationships(content):
    """Process content and return list of (relationship, object) tuples.

    Args:
        content: Merged text of one trial row. It is converted with ``str``
            before use; null or blank content is not sent to the API.

    Returns:
        A list of ``(relationship, object)`` tuples in reply order. The list is
        empty when there is no content, when nothing valid was parsed, or when
        the API call or parsing raised (the error is printed, not re-raised).
    """
    if pd.isnull(content):
        return []

    # Coerce before calling string methods so non-string cells cannot raise
    content = str(content)
    if not content.strip():
        return []

    # Truncate long content
    if len(content) > 2000:
        content = content[:2000] + "... [TRUNCATED]"

    try:
        completion = client.chat.completions.create(
            model="gemma2-9b-it",
            messages=[{
                "role": "user",
                "content": PROMPT_TEMPLATE.format(content=content)
            }],
            temperature=0.3,
            max_tokens=1024
        )
        return _parse_tab_separated(completion.choices[0].message.content)

    except Exception as e:
        # Best effort: report the failure and let the caller continue
        print(f"Error processing content: {str(e)}")
        return []

import csv  # imported here so this step does not depend on another cell

# Create (or truncate) the output file and write each trial's rows as soon as
# it is processed. csv.writer quotes any field that contains a comma, quote or
# newline; writing the fields with a plain f-string let such objects spill into
# extra columns and corrupt the file.
with open(output_file_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["Subject", "Relationship", "Object"])

    for index, row in df.iterrows():
        # Use the trial's NCT Number as the subject; fall back to the row label
        subject_id = row.get("NCT Number", f"ROW_{index}")
        relationships = extract_relationships(row["Merged_Content"])

        writer.writerows((subject_id, rel, obj) for rel, obj in relationships)
        # Push the rows to disk now so finished trials survive an interruption
        f.flush()

        print(f"Processed row {index} - Extracted {len(relationships)} relationships")

print(f"Processing complete. Results saved to: {output_file_path}")


In [None]:
## Cleaning the csv for extra columns


import csv

# This cell uses pandas below; import it here so the cell also runs on a fresh
# kernel instead of relying on an earlier cell having defined `pd`.
import pandas as pd

# Define the input and output file paths
input_file = '/content/refined4 (1).csv'
output_file = '/content/refined4_cleaned.csv'  # Save to a different file to avoid overwriting prematurely

# Rewrite the CSV so every row has exactly three columns. Extra columns appear
# when an Object containing commas was written without quoting, so the overflow
# fields are re-joined with "," to restore the Object rather than cut off after
# the first piece. Rows with fewer than three columns are dropped.
# The csv module expects files opened with newline=''.
with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
        open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    for row in reader:
        if len(row) >= 3:
            writer.writerow(row[:2] + [",".join(row[2:])])

# Read the updated CSV file
try:
    df = pd.read_csv(output_file)

    # Get the value counts for the 'Object' column
    object_value_counts = df['Object'].value_counts()

    # Display the value counts
    print(object_value_counts)

    # Save the value counts to a CSV file
    object_value_counts.to_csv('Object_Value_Counts2.csv', header=['Count'])
except pd.errors.EmptyDataError:
    print("The cleaned CSV file is empty or invalid. Please check the input file.")
