In [1]:
import pandas as pd
import numpy as np
import json
from tools import calculate_similarity, calculate_performance, calculate_performance_V2, pdf_to_text
import os

In [2]:
json_file = 'articlesJsonTemplate.json'
from openai import OpenAI

# Resolve the OpenAI API key.
# The key file next to the repository is tried first (the original behavior);
# if it is missing or empty, fall back to an OPENAI_API_KEY that is already
# exported in the environment, so the notebook also runs where no key file exists.
API_KEY_FILE = "../openAiToken.txt"

api_key = ""
if os.path.exists(API_KEY_FILE):
    with open(API_KEY_FILE, "r", encoding="utf-8") as key_file:
        api_key = key_file.read().strip()
if not api_key:
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not api_key:
    # Fail here with an actionable message instead of at the first API request.
    raise RuntimeError(
        f"No OpenAI API key found: '{API_KEY_FILE}' is missing or empty and "
        "the OPENAI_API_KEY environment variable is not set."
    )

os.environ["OPENAI_API_KEY"] = api_key  # Optionally set it as an environment variable
client = OpenAI(api_key=api_key)

In [8]:
# Model name passed to the chat-completions call in main(); it is also embedded
# in the output file name below.
modelName = "o3-mini"

# Directory prefixes shared by the absolute paths below.
_PRODUCTION_DIR = "/Users/sprice/Documents/GitHub/coldSprayTextExtraction/productionalized"
_MODEL_OUTPUT_DIR = _PRODUCTION_DIR + "/modelOutputs3-11"

# Input articles JSON read by main().
ARTICLES_JSON = _MODEL_OUTPUT_DIR + "/o3-mini_promptv2_high_merged.json"
# Ground-truth articles JSON (path is relative to the working directory).
GROUND_TRUTH_JSON = "misc/groundTruthArticles.json"
# Text file holding the second-pass prompt template.
SECOND_PASS_PROMPT_FILE = _PRODUCTION_DIR + "/misc/promptV2_processExtraction.txt"
# Destination for the articles JSON once processing parameters are attached.
OUTPUT_JSON = _MODEL_OUTPUT_DIR + "/" + modelName + "_promptv2_high_merged_withParams.json"

##########################################
# Main Script
##########################################

def _strip_code_fence(text):
    """Return ``text`` trimmed of whitespace and of one surrounding Markdown code fence.

    Both a "```json" and a bare "```" opening fence are recognized (the language
    tag is matched case-insensitively). Text that is not wrapped in a complete
    fence is returned with only the surrounding whitespace removed.
    """
    cleaned = text.strip()
    if cleaned.endswith("```"):
        for opener in ("```json", "```"):
            has_opener = cleaned[:len(opener)].lower() == opener
            # The length check keeps the opening and closing fences from overlapping.
            if has_opener and len(cleaned) >= len(opener) + 3:
                return cleaned[len(opener):-3].strip()
    return cleaned


def _parse_model_response(raw_content):
    """Parse the model reply as JSON, returning the cleaned raw string on failure.

    ``raw_content`` may be ``None`` (the API can return a message without text
    content); it is then treated as an empty reply instead of raising.
    """
    model_response = _strip_code_fence(raw_content or "")
    try:
        return json.loads(model_response)
    except json.JSONDecodeError:
        print("Error: Could not parse second-pass JSON. Storing raw string.")
        return model_response


def _save_articles(articles, output_path):
    """Write ``articles`` as indented JSON to ``output_path``.

    The data is written to a sibling temporary file first and then moved over
    the destination, so an interruption mid-write cannot leave a truncated
    output file behind.
    """
    temp_path = f"{output_path}.tmp"
    with open(temp_path, "w", encoding="utf-8") as out_f:
        json.dump(articles, out_f, indent=4)
    os.replace(temp_path, output_path)


def main():
    """Run the second-pass extraction over every article and save the results.

    For each article in ARTICLES_JSON whose PDF exists under ``documents/``,
    the PDF text and each first-pass experiment are sent to the model together
    with the prompt template from SECOND_PASS_PROMPT_FILE. The parsed reply
    (or the raw reply string when it is not valid JSON) is stored on the
    experiment under ``"processing_parameters"``. The full article list is
    written to OUTPUT_JSON after every processed article and once more when
    the loop ends, including when it ends because of an exception.

    Raises:
        FileNotFoundError: if one of the input files cannot be opened.
        json.JSONDecodeError: if an input JSON file is malformed.
        Any exception raised by the LLM client or ``pdf_to_text`` propagates
        after the progress made so far has been saved.
    """
    # 1) Load the articles JSON (already has mechanical properties)
    with open(ARTICLES_JSON, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # Only needed by the ground-truth skip below, which is currently disabled.
    with open(GROUND_TRUTH_JSON, "r", encoding="utf-8") as f:
        ground_truth_articles = json.load(f)

    with open(SECOND_PASS_PROMPT_FILE, "r", encoding="utf-8") as file:
        second_pass_prompt_template = file.read()

    try:
        for article in articles:
            filename = article.get("filename", "UNKNOWN")
            print(f"Processing article: {filename}")

            # Skip certain articles by filename pattern
            # NOTE(review): this is a substring match, so e.g. "Article_156.pdf"
            # is skipped as well — confirm that is intended.
            if "56" in filename or "89" in filename:
                print("Skipping due to filename containing '56' or '89'.")
                continue

            # Check if article is in ground truth
            # base_name = filename.split('.')[0]
            # if base_name in ground_truth_articles.get('Articles', []):
            #     print("Skipping because article is in ground truth.")
            #     continue

            # Check for PDF existence
            pdf_path = f"documents/{filename}"
            if not os.path.exists(pdf_path):
                print("Skipping because PDF not found.")
                continue

            # If the article has 'extractedText' as a list, it means we have
            # multiple experiments from the first pass. This is checked before
            # the PDF is parsed so no extraction work is wasted on articles
            # that have nothing to process.
            all_experiments = article.get("extractedText", [])
            if not isinstance(all_experiments, list):
                print("No experiments to process (extractedText is not a list).")
                continue
            if not all_experiments:
                print("No experiments to process (extractedText is empty).")
                continue

            # Extract text from the PDF
            print(f"Extracting text from: {pdf_path}")
            article_text = pdf_to_text(pdf_path)

            # For each experiment, run the second pass
            for experiment in all_experiments:
                # The result is attached by item assignment, which only works
                # on a dict; check before paying for an API call.
                if not isinstance(experiment, dict):
                    print(f"Skipping experiment that is not a JSON object: {experiment!r}")
                    continue

                # Convert the single experiment to JSON to insert in the prompt
                experiment_str = json.dumps(experiment, ensure_ascii=False)

                # Build the final prompt for the second pass
                second_pass_prompt = (
                    f"{second_pass_prompt_template}\n\n"
                    f"Here is the article:\n{article_text}\n\n"
                    f"Here are the experimental results you must identify the processing parameters for:\n"
                    f"{experiment_str}"
                )

                # Send to the LLM
                messages = [
                    {"role": "user", "content": second_pass_prompt}
                ]

                print(f"Sending second-pass prompt for {filename} / experiment:\n{experiment}")

                # Example usage - adjust as needed for your LLM client
                completion = client.chat.completions.create(
                    model=modelName,
                    messages=messages,
                    reasoning_effort='high'
                )

                # Attach the returned processing parameters to the experiment
                experiment["processing_parameters"] = _parse_model_response(
                    completion.choices[0].message.content
                )

            # Write out partial results after each article, so we don't lose progress
            _save_articles(articles, OUTPUT_JSON)
    finally:
        # Always persist whatever was produced: this covers a run in which every
        # article was skipped (so the message below is truthful) as well as a
        # run aborted part-way through an article by an exception.
        _save_articles(articles, OUTPUT_JSON)

    print("All done! Final file saved at:", OUTPUT_JSON)


if __name__ == "__main__":
    main()


Processing article: Article_1.pdf
Extracting text from: documents/Article_1.pdf
Sending second-pass prompt for Article_1.pdf / experiment:
{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}
Processing article: Article_2.pdf
Skipping because PDF not found.
Processing article: Article_3.pdf
Extracting text from: documents/Article_3.pdf
Sending second-pass prompt for Article_3.pdf / experiment:
{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 100, 'ultimate_tensile_strength_units': 'MPa', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 85, 'hardness_