In [None]:
import os
import pandas as pd

# Configure the Groq client
from groq import Groq

# Read the Groq API key from the environment rather than embedding it in the
# notebook: a key stored in source is readable by anyone who can see the file.
# SECURITY: any key that was ever committed in this cell must be treated as
# leaked and revoked in the Groq console.
api_key = os.environ.get("GROQ_API_KEY", "").strip()
if not api_key:
    # Fail before any data is loaded or any request is attempted.
    raise ValueError("Please set your Groq API key in the 'GROQ_API_KEY' environment variable.")

client = Groq(api_key=api_key)

# Locations of the source CSV and of the relationship triples this cell emits
input_file_path = "/kaggle/input/filtereddata/filtered_data.csv"
output_file_path = "/kaggle/working/outputCsvUpd2.csv"

# Text columns whose contents are merged and sent to the model
columns_to_clean = ["Study Title", "Primary Outcome Measures", "Secondary Outcome Measures", "criteria"]

# Load the CSV, discard rows in which every cell is empty, and renumber the
# remaining rows from 0
df = (
    pd.read_csv(input_file_path, low_memory=False)
    .dropna(how="all")
    .reset_index(drop=True)
)

# Collapse the selected text columns of one row into a single string
def merge_columns(row):
    """Join the row's usable ``columns_to_clean`` values into one string.

    Columns that are missing from ``row`` or hold a null value are skipped.
    Each remaining value is converted with ``str`` and the pieces are joined
    with a space followed by a newline.
    """
    pieces = []
    for name in columns_to_clean:
        if name not in row:
            continue
        value = row[name]
        if pd.isnull(value):
            continue
        pieces.append(str(value))
    return " \n".join(pieces)

# One merged text blob per row, stored next to the original columns
df["Merged_Content"] = df.apply(merge_columns, axis="columns")

# Relationship keywords, in the order of the slots returned by
# extract_relationships
_RELATIONSHIP_KEYWORDS = (
    "involves",
    "evaluates",
    "measures_primary",
    "measures_secondary",
    "has_criteria",
)


def _parse_relationship_lines(response):
    """Assign each response line to the slot of the first keyword it contains."""
    extracted = [None] * len(_RELATIONSHIP_KEYWORDS)
    for line in response.splitlines():
        for slot, keyword in enumerate(_RELATIONSHIP_KEYWORDS):
            if keyword in line:
                # Keep what follows the second comma ("Subject, Relationship,
                # Object"). A line with fewer commas contributes its last
                # comma-separated field (the whole line when it has none).
                # A later line with the same keyword overwrites an earlier one.
                extracted[slot] = line.split(",", 2)[-1].strip()
                break  # first matching keyword wins, like an if/elif chain
    return extracted


# Function to clean and extract relationships using Groq
def extract_relationships(content, row_index):
    """Ask the Groq model for the five relationships described by ``content``.

    Args:
        content: Merged text of one dataset row. Null, empty or
            whitespace-only content is not sent to the API.
        row_index: Row identifier, used only in the progress message.

    Returns:
        A list of five items ordered as involves, evaluates, measures_primary,
        measures_secondary, has_criteria. An item is ``None`` when no response
        line contained its keyword.

    Any exception raised by the API call propagates to the caller.
    """
    # The merged column is built with str.join, so a row without usable text
    # arrives as "" rather than NaN; there is nothing to extract from it.
    if pd.isnull(content) or not str(content).strip():
        return [None, None, None, None, None]

    # Truncate content if it exceeds a certain limit to avoid API errors
    max_length = 2000  # Adjust the limit as needed
    if len(content) > max_length:
        content = content[:max_length] + "..."

    prompt = (
        "The following text is a merged representation of a clinical trials dataset row. "
        "Extract the following relationships from it in the format: \n"
        "Subject, Relationship, Object (one per line): \n"
        "1. involves (Disease): \n"
        "2. evaluates (Drug): \n"
        "3. measures_primary (Primary Outcome): \n"
        "4. measures_secondary (Secondary Outcome): \n"
        "5. has_criteria (Criteria): \n"
        "Ensure each extracted Object contains a maximum of 5 words. \n"
        f"\n{content}"
    )

    # NOTE(review): temperature=1 presumably makes the extraction vary between
    # runs - confirm that this is intended for a data-extraction task.
    completion = client.chat.completions.create(
        model="gemma2-9b-it",  # Use the appropriate model
        messages=[{"role": "user", "content": prompt}],
        temperature=1,
        max_tokens=1024,
        top_p=1,
        stream=False,
        stop=None,
    )

    # Treat a missing message body as "no relationships found" instead of
    # failing on None.splitlines().
    response = completion.choices[0].message.content or ""

    extracted = _parse_relationship_lines(response)

    print(f"Row {row_index} processed.")  # Log row completion
    return extracted

import csv  # local to this cell: quotes fields that contain commas, quotes or newlines

# Positional index of the first row to process; earlier rows are skipped.
# NOTE(review): presumably the resume point of an earlier run - confirm.
START_ROW = 129

# Relationship labels, in the order extract_relationships returns its slots
RELATIONSHIP_NAMES = (
    "involves",
    "evaluates",
    "measures_primary",
    "measures_secondary",
    "has_criteria",
)

# Create (or truncate) the output file, then append five triples per input row.
# csv.writer is used instead of hand-built strings so that an object or subject
# containing a comma, quote or newline cannot break the three-column layout.
with open(output_file_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["Subject", "Relationship", "Object"])

    for index, row in df.iloc[START_ROW:].iterrows():
        extracted = extract_relationships(row["Merged_Content"], index)
        # Fall back to a synthetic label when the "NCT Number" column is absent
        subject = row.get("NCT Number", f"Row {index}")

        # str() keeps a missing object as the text "None", which
        # preprocess_text in the next cell recognises as a placeholder.
        writer.writerows(
            [str(subject), relationship, str(obj)]
            for relationship, obj in zip(RELATIONSHIP_NAMES, extracted)
        )
        # Push each row to disk immediately so a failure part-way through the
        # run keeps everything processed so far.
        f.flush()

        print(f"Row {index} written to file.")

print(f"Data extraction complete. Updated data saved to: {output_file_path}")


In [None]:
import pandas as pd
import re
from rapidfuzz import fuzz, process  # For fuzzy string matching

# Where the processed triples are written, and where the raw triples come from
output_file_path = "/kaggle/working/processed_outputCSVUpd.csv"
input_file_path = "/kaggle/input/outputcsvupd/outputCsvUpd.csv"

# Read the raw triples into a DataFrame
df = pd.read_csv(filepath_or_buffer=input_file_path)

# Explicit word -> lemma overrides; consulted before any suffix rule so that
# the inflected keys can actually match.
_LEMMA_OVERRIDES = {
    "alzheimer's": "alzheimer",
    "diseases": "disease",
    "diabetics": "diabetes",
}

# Lemmas produced by the overrides are already canonical and must not be
# stripped further (e.g. "diabetes" must not lose its ending).
_CANONICAL_LEMMAS = frozenset(_LEMMA_OVERRIDES.values())

# Plural endings where the whole "es" is the suffix ("boxes" -> "box")
_ES_PLURAL_ENDINGS = ("sses", "xes", "ches", "shes")

# Endings where a final "s" belongs to the stem ("illness", "virus", "analysis")
_STEM_S_ENDINGS = ("ss", "us", "is")


# Custom lemmatization function
def custom_lemmatizer(word):
    """
    A lightweight custom lemmatizer for basic normalization.

    The word is lowercased, looked up in the override table, and otherwise
    reduced with simple plural rules: "ies" -> "y", "es" removed after
    ss/x/ch/sh, and a trailing "s" removed unless the word ends in ss/us/is.

    Args:
        word (str): A single token.
    Returns:
        str: The lowercased, singularized token.

    This is a heuristic, not a dictionary lemmatizer: irregular plurals and
    words such as "viruses" or "series" are not handled.
    """
    word = word.lower()

    # Overrides first: checking them after suffix stripping (as a last step)
    # would never find the plural keys, because the plural ending is gone.
    override = _LEMMA_OVERRIDES.get(word)
    if override is not None:
        return override
    if word in _CANONICAL_LEMMAS:
        return word

    # Basic singular/plural transformations
    if word.endswith("ies"):
        return word[:-3] + "y"
    if word.endswith(_ES_PLURAL_ENDINGS):
        return word[:-2]
    if word.endswith("s") and len(word) > 1 and not word.endswith(_STEM_S_ENDINGS):
        # Dropping only the "s" keeps a stem-final "e" ("outcomes" -> "outcome")
        return word[:-1]
    return word

# Clean one value of the Object column before fuzzy matching
def preprocess_text(text):
    """Return a punctuation-free, lowercased, lemmatized copy of ``text``.

    Null values and the placeholder text "none" (in any letter case) are
    returned unchanged. Otherwise every character that is neither a word
    character nor whitespace is removed, the result is lowercased, split on
    whitespace, each token is passed through ``custom_lemmatizer``, and the
    tokens are rejoined with single spaces.
    """
    is_placeholder = pd.isnull(text) or text.lower() == "none"
    if is_placeholder:
        return text

    without_punctuation = re.sub(r"[^\w\s]", "", text)
    tokens = without_punctuation.lower().split()
    return " ".join(custom_lemmatizer(token) for token in tokens)

# Function to normalize text using fuzzy matching
def normalize_text(text, reference_list, threshold=85):
    """
    Normalize text using fuzzy matching against a reference list.

    Args:
        text (str): The text to normalize. Null values (None/NaN) are returned
            unchanged without consulting the matcher.
        reference_list (list): A list of reference strings to match against.
        threshold (int): The minimum similarity score for a match.
    Returns:
        str: The best-scoring reference string when its score is at least
        ``threshold``, otherwise the original text.
    """
    # preprocess_text hands null objects through untouched; there is nothing
    # to match for them, so do not depend on how the matcher treats non-strings.
    if pd.isnull(text):
        return text

    match_result = process.extractOne(text, reference_list, scorer=fuzz.ratio)
    # match_result is falsy when no candidate was returned; index 0 is the
    # matched string and index 1 its score.
    if match_result and match_result[1] >= threshold:
        return match_result[0]  # Return the matched text
    return text  # Return the original text if no match is above the threshold

import csv  # local to this cell: quotes fields that contain commas, quotes or newlines

# Build a reference list from the Object column
# NOTE(review): the references are the raw Object values, while the text that
# is matched against them has been cleaned by preprocess_text. A match
# therefore replaces the cleaned text with an uncleaned reference string -
# confirm that this is the intended normalization.
reference_list = df["Object"].dropna().unique()

# Create (or truncate) the output file, then write one processed row at a time.
# csv.writer is used instead of ",".join so that a value containing a comma,
# quote or newline cannot shift the columns of the output file.
with open(output_file_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(df.columns)  # Write headers

    for index, row in df.iterrows():
        original_text = row["Object"]
        preprocessed_text = preprocess_text(original_text)
        normalized_text = normalize_text(preprocessed_text, reference_list)

        # Work on a copy: the Series yielded by iterrows should not be mutated.
        output_row = row.copy()
        output_row["Object"] = normalized_text

        # str() renders every cell as before (missing values become "nan")
        writer.writerow([str(value) for value in output_row.values])
        # Push each row to disk immediately so partial progress survives a failure
        f.flush()

        print(f"Row {index} processed and written to file: {normalized_text}")

print(f"Processing complete. Processed dataset saved to {output_file_path}.")
