In [1]:
import pandas as pd
import numpy as np
import json
from tools import calculate_similarity, calculate_performance, calculate_performance_V2, pdf_to_text
import os

In [2]:
json_file = 'articlesJsonTemplateSteel.json'
if os.path.exists(json_file):
    raise FileExistsError("File already found")

df = pd.read_excel('documents/articles.xlsx', sheet_name='4340 Steel')

data = []
for index, row in df.iterrows():
    article = {
        "filename": f"{row['Index']}.pdf",
        "title": "" if pd.isna(row['Title']) else row['Title'],
        "link": "" if pd.isna(row['Link']) else row['Link'],
        "extractedText": ""
    }

    data.append(article)

with open('articlesJsonTemplateSteel.json', 'w') as f:
    json.dump(data, f, indent=4)

FileExistsError: File already found

In [3]:
json_file = 'articlesJsonTemplateSteel.json'
from openai import OpenAI

with open("../openAiToken.txt", "r") as key_file:
    api_key = key_file.read().strip()
os.environ["OPENAI_API_KEY"] = api_key  # Optionally set it as an environment variable
client = OpenAI(api_key=api_key)

In [None]:
MODEL = "o3-mini"
EFFORT = 'high'
MATERIAL = 'Steel'
INPUT_FILE = f"prompts/promptV2.5.txt"
OUTPUT_FILE = f'outputs/{MODEL}/promptv2.5_{EFFORT}_{MATERIAL}_3-24_early.json'

with open(f'articlesJsonTemplate{MATERIAL}.json') as f:
    articles = json.load(f)

for article in articles[:33]:
    print(article['filename'])
    inGroundTruth = False
    articlePresent = False

    if os.path.exists(f"documents/{MATERIAL}/{article['filename']}"): 
        articlePresent = True
    else:
        print('Skipping Because Article not found')
        print(f"documents/{MATERIAL}/{article['filename']}")

    if articlePresent:
        print(f'{article["filename"]} Ready for Analysis')
        input_pdf = f"documents/{MATERIAL}/{article['filename']}"
        article_text = pdf_to_text(input_pdf)
        with open(INPUT_FILE, "r") as file:
            prompt_text = file.read()
        messages=[ 
            {"role": "user", "content": prompt_text},
            {
                "role": "user",
                "content": article_text
            }
        ]
        try:
            completion = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                reasoning_effort=EFFORT
            )
        except Exception as e:
            # Check for an error message that indicates the context is too large.
            if "string too long" in str(e).lower() or "token" in str(e).lower():
                print("Too large of Input")
                continue
            else:
                raise e

        extractedVals = completion.choices[0].message.content
        print(extractedVals)
        
        if extractedVals.startswith("```json") and extractedVals.endswith("```"):
            extractedVals = extractedVals[7:-3].strip()

        try:
            article['extractedText'] = json.loads(extractedVals)
        except json.JSONDecodeError as e:
            print(f"Saving as TXT - Error decoding JSON: {e}")
            article['extractedText'] = extractedVals
        except Exception as e:
            print("Another Error Occured")
            raise e
            # article['extractedText'] = extractedVals
        # article['extractedText'] = json.loads(extractedVals)
        with open(OUTPUT_FILE, 'w') as f:
            json.dump(articles, f, indent=4)
    

article_4340_1.pdf
article_4340_1.pdf Ready for Analysis
[
  {
    "feedstock_material": "AISI 4340 steel",
    "feedstock_material_series": "4340",
    "feedstock_material_composition": "NA",
    "substrate_material": "AISI 1018 steel",
    "substrate_material_series": "1018",
    "substrate_material_composition": "NA",
    "yield_strength_value": "NA",
    "yield_strength_units": "NA",
    "ultimate_tensile_strength_value": "NA",
    "ultimate_tensile_strength_units": "NA",
    "ductility_value": "NA",
    "ductility_units": "NA",
    "hardness_value": 561,
    "hardness_units": "HV",
    "modulus_value": "NA",
    "modulus_units": "NA",
    "notes": "Cold spray (CS) deposit without any laser heating."
  },
  {
    "feedstock_material": "AISI 4340 steel",
    "feedstock_material_series": "4340",
    "feedstock_material_composition": "NA",
    "substrate_material": "AISI 1018 steel",
    "substrate_material_series": "1018",
    "substrate_material_composition": "NA",
    "yield_streng

In [12]:
MODEL = "o3-mini"
EFFORT = 'high'
MATERIAL = 'Steel'


ARTICLES_JSON = "/Users/sprice/Documents/GitHub/coldSprayTextExtraction/3-23-25/outputs/o3-mini/promptv2_high_Steel_3-23.json"
# GROUND_TRUTH_JSON = "misc/groundTruthArticles.json"
SECOND_PASS_PROMPT_FILE = "/Users/sprice/Documents/GitHub/coldSprayTextExtraction/3-23-25/prompts/promptV2_processExtraction.txt"
OUTPUT_JSON = '/Users/sprice/Documents/GitHub/coldSprayTextExtraction/3-23-25/outputs/o3-mini/promptv2_high_Steel_extractedProperties_3-23.json'
##########################################
# Main Script
##########################################

def main():
    # 1) Load the articles JSON (already has mechanical properties)
    with open(ARTICLES_JSON, "r") as f:
        articles = json.load(f)

    # with open(GROUND_TRUTH_JSON, "r") as f:
    #     ground_truth_articles = json.load(f)

    with open(SECOND_PASS_PROMPT_FILE, "r") as file:
        second_pass_prompt_template = file.read()

    for article in articles:
        filename = article.get("filename", "UNKNOWN")
        print(f"Processing article: {filename}")

        # Skip certain articles by filename pattern
        # if "56" in filename or "89" in filename:
        #     print("Skipping due to filename containing '56' or '89'.")
        #     continue

        # Check if article is in ground truth
        # base_name = filename.split('.')[0]
        # if base_name in ground_truth_articles.get('Articles', []):
        #     print("Skipping because article is in ground truth.")
        #     continue

        # Check for PDF existence
        pdf_path = f"documents/{MATERIAL}/{filename}"
        if not os.path.exists(pdf_path):
            print("Skipping because PDF not found.")
            continue

        # 5) Extract text from the PDF
        print(f"Extracting text from: {pdf_path}")
        article_text = pdf_to_text(pdf_path)

        # 6) If the article has 'extractedText' as a list, it means
        #    we have multiple experiments from the first pass
        all_experiments = article.get("extractedText", [])
        if not isinstance(all_experiments, list):
            print("No experiments to process (extractedText is not a list).")
            continue

        # 7) For each experiment, run the second pass
        for experiment in all_experiments:
            # Convert the single experiment to JSON to insert in the prompt
            experiment_str = json.dumps(experiment, ensure_ascii=False)

            # Build the final prompt for the second pass
            second_pass_prompt = (
                f"{second_pass_prompt_template}\n\n"
                f"Here is the article:\n{article_text}\n\n"
                f"Here are the experimental results you must identify the processing parameters for:\n"
                f"{experiment_str}"
            )

            # Send to the LLM
            messages = [
                {"role": "user", "content": second_pass_prompt}
            ]

            print(f"Sending second-pass prompt for {filename} / experiment:\n{experiment}")
            
            # Example usage - adjust as needed for your LLM client
            completion = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                reasoning_effort=EFFORT
            )
            model_response = completion.choices[0].message.content.strip()

            # Strip any triple backticks
            if model_response.startswith("```json") and model_response.endswith("```"):
                model_response = model_response[7:-3].strip()

            # Try to parse the returned JSON
            try:
                parsed_params = json.loads(model_response)
            except json.JSONDecodeError:
                print("Error: Could not parse second-pass JSON. Storing raw string.")
                parsed_params = model_response

            # Attach the returned processing parameters to the experiment
            experiment["processing_parameters"] = parsed_params

        # (Optional) Write out partial results after each article, so we don't lose progress
        with open(OUTPUT_JSON, "w") as out_f:
            json.dump(articles, out_f, indent=4)

    print("All done! Final file saved at:", OUTPUT_JSON)


if __name__ == "__main__":
    main()


Processing article: article_4340_1.pdf
Extracting text from: documents/Steel/article_4340_1.pdf
Sending second-pass prompt for article_4340_1.pdf / experiment:
{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 561, 'hardness_units': 'HV', 'modulus_value': 'NA', 'modulus_units': 'NA', 'notes': 'Cold spray (CS) deposit without laser heating; average hardness measured from a typical CS process.'}
Sending second-pass prompt for article_4340_1.pdf / experiment:
{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 466, 'hardness_units': 'HV', 'modulus_value': 'NA', 'modulus_units': 'NA', 'notes': 'Laser-assisted cold spray (LACS) deposit achieved with in situ laser heating to a 