In [1]:
import pandas as pd
import numpy as np
import json
from tools import calculate_similarity, calculate_performance, calculate_performance_V2, pdf_to_text
import os

In [2]:
from openai import OpenAI

# Name of the article template JSON file (later cells open this same path).
json_file = 'articlesJsonTemplate.json'

# The API token is kept in a text file one directory above the notebook,
# so it never appears in the notebook source itself.
with open("../openAiToken.txt", "r") as key_file:
    raw_key = key_file.read()
api_key = raw_key.strip()  # drop the trailing newline / stray whitespace

# Optionally set it as an environment variable as well; the client below is
# still given the key explicitly.
os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI(api_key=api_key)

In [10]:
# Run the extraction prompt over every article that is BOTH listed in the
# ground-truth file and present as a PDF under documents/, and store the model
# answer in the article's 'extractedText' field of the output JSON.
modelName = "o3-mini"

INPUT_FILE = "misc/promptV2.txt"
OUTPUT_FILE = f'modelOutputs3-11/{modelName}_promptv2_high_gt.json'

with open('articlesJsonTemplate.json') as f:
    articles = json.load(f)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

# The prompt is the same for every article, so read it once instead of
# re-opening the file on each loop iteration.
with open(INPUT_FILE, "r") as file:
    prompt_text = file.read()

# Create the output folder up front: otherwise the first write would fail
# only after an API call has already been made.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

for article in articles:
    print(article['filename'])
    # Ground-truth entries are keyed by the filename without its extension.
    article_name = article['filename'].split('.')[0]

    inGroundTruth = article_name in ground_truth_articles['Articles']
    if not inGroundTruth:
        print('Skipping because not in groundtruth')

    articlePresent = os.path.exists(f'documents/{article["filename"]}')
    if not articlePresent:
        print('Skipping Because Article not found')

    if not (inGroundTruth and articlePresent):
        continue

    print(f'{article["filename"]} Ready for Analysis')
    input_pdf = f"documents/{article['filename']}"
    article_text = pdf_to_text(input_pdf)

    # The prompt and the article text are sent as two consecutive user messages.
    messages = [
        {"role": "user", "content": prompt_text},
        {"role": "user", "content": article_text},
    ]
    completion = client.chat.completions.create(
        model=modelName,
        messages=messages,
        reasoning_effort='high'
    )
    raw_content = completion.choices[0].message.content
    print(raw_content)

    # The content can be None; normalise it to a string so the fence handling
    # below cannot raise AttributeError.
    extractedVals = (raw_content or "").strip()

    # Remove a surrounding Markdown code fence (``` or ```json), tolerating
    # whitespace around the fence.
    if len(extractedVals) >= 6 and extractedVals.startswith("```") and extractedVals.endswith("```"):
        extractedVals = extractedVals[3:-3]
        if extractedVals[:4].lower() == "json":
            extractedVals = extractedVals[4:]
        extractedVals = extractedVals.strip()

    try:
        article['extractedText'] = json.loads(extractedVals)
    except json.JSONDecodeError as e:
        # Keep the raw answer as text so it is not lost and can be inspected.
        print(f"Saving as TXT - Error decoding JSON: {e}")
        article['extractedText'] = extractedVals
    except Exception:
        print("Another Error Occured")
        raise

    # Rewrite the whole output file after each processed article, so results
    # already obtained are on disk if a later iteration fails.
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(articles, f, indent=4)
    

Article_1.pdf
Article_1.pdf Ready for Analysis
[
  {
    "yield_strength_value": "NA",
    "yield_strength_units": "NA",
    "ultimate_tensile_strength_value": "NA",
    "ultimate_tensile_strength_units": "NA",
    "ductility_value": "NA",
    "ductility_units": "NA",
    "hardness_value": "NA",
    "hardness_units": "NA",
    "modulus_value": 61.4,
    "modulus_units": "GPa",
    "notes": "Cold-sprayed Al 6061 tensile specimens (in-plane L/T orientation) were tested using ASTM E8/E111; only the elastic modulus (61.4 ±0.3 GPa) was explicitly reported, while yield strength, ultimate tensile strength, ductility, and hardness were not provided."
  }
]
Article_2.pdf
Skipping Because Article not found
Article_3.pdf
Article_3.pdf Ready for Analysis
[
  {
    "yield_strength_value": 151,
    "yield_strength_units": "MPa",
    "ultimate_tensile_strength_value": 311,
    "ultimate_tensile_strength_units": "MPa",
    "ductility_value": "NA",
    "ductility_units": "NA",
    "hardness_value": 85,

In [None]:
import os

# Score the stored extractions of one model/prompt run against the ground
# truth and report the mean of the per-article 'Overall Loss' values.
modelName = "o1-mini"

with open(f'modelOutputs3-11/{modelName}_promptv2_gt.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

# 'Overall Loss' of every article that could be scored.
losses = []

for row in data:
    # Ground-truth entries are keyed by the filename without its extension.
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # Skip rows without an extraction: "" is an empty field, None means the
    # key is missing altogether (previously None was passed on to scoring).
    # NOTE(review): a row whose answer was saved as raw text (JSON decode
    # failure) is still passed to calculate_performance - confirm it copes.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_articles.get("Articles", {}).get(rowName)

        if ground_truth_article is not None:

            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            # Print every metric returned by the scorer, one per section.
            for metric in results:
                print(metric)
                print(results[metric])
                print('-'*30)

        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) emits a RuntimeWarning; report nan directly when nothing was scored.
average_loss = np.mean(losses) if losses else float('nan')
print('Average Loss: ', average_loss)



Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'Measured from L/T standard-sized cold-sprayed Al 6061 tensile specimens.'}]
Field RMSE
{'yield_strength_value': np.float64(1.0), 'ultimate_tensile_strengt

In [7]:
import os

# Score the stored extractions of one model/prompt run against the ground
# truth and report the mean of the per-article 'Overall Loss' values.
modelName = "o1-mini"

with open(f'modelOutputs3-11/{modelName}_promptv4_gt.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

# 'Overall Loss' of every article that could be scored.
losses = []

for row in data:
    # Ground-truth entries are keyed by the filename without its extension.
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # Skip rows without an extraction: "" is an empty field, None means the
    # key is missing altogether (previously None was passed on to scoring).
    # NOTE(review): a row whose answer was saved as raw text (JSON decode
    # failure) is still passed to calculate_performance - confirm it copes.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_articles.get("Articles", {}).get(rowName)

        if ground_truth_article is not None:

            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            # Print every metric returned by the scorer, one per section.
            for metric in results:
                print(metric)
                print(results[metric])
                print('-'*30)

        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) emits a RuntimeWarning; report nan directly when nothing was scored.
average_loss = np.mean(losses) if losses else float('nan')
print('Average Loss: ', average_loss)



Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'Mean elastic modulus for cold-sprayed AL 6061 tensile specimens.'}, {'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strengt

In [8]:
import os

# Score the stored extractions of one model/prompt run against the ground
# truth and report the mean of the per-article 'Overall Loss' values.
modelName = "o3-mini"

with open(f'modelOutputs/{modelName}_promptv2_gt.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

# 'Overall Loss' of every article that could be scored.
losses = []

for row in data:
    # Ground-truth entries are keyed by the filename without its extension.
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # Skip rows without an extraction: "" is an empty field, None means the
    # key is missing altogether (previously None was passed on to scoring).
    # NOTE(review): a row whose answer was saved as raw text (JSON decode
    # failure) is still passed to calculate_performance - confirm it copes.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_articles.get("Articles", {}).get(rowName)

        if ground_truth_article is not None:

            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            # Print every metric returned by the scorer, one per section.
            for metric in results:
                print(metric)
                print(results[metric])
                print('-'*30)

        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) emits a RuntimeWarning; report nan directly when nothing was scored.
average_loss = np.mean(losses) if losses else float('nan')
print('Average Loss: ', average_loss)



Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'For cold‐sprayed Al 6061 deposits, only the elastic modulus was explicitly reported (61.4 ± 0.3 GPa in the in‐plane L/T orientation). Yield strength, tens

In [11]:
import os

# Score the stored extractions of one model/prompt run against the ground
# truth and report the mean of the per-article 'Overall Loss' values.
modelName = "o3-mini"

with open(f'modelOutputs3-11/{modelName}_promptv2_high_gt.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

# 'Overall Loss' of every article that could be scored.
losses = []

for row in data:
    # Ground-truth entries are keyed by the filename without its extension.
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # Skip rows without an extraction: "" is an empty field, None means the
    # key is missing altogether (previously None was passed on to scoring).
    # NOTE(review): a row whose answer was saved as raw text (JSON decode
    # failure) is still passed to calculate_performance - confirm it copes.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_articles.get("Articles", {}).get(rowName)

        if ground_truth_article is not None:

            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            # Print every metric returned by the scorer, one per section.
            for metric in results:
                print(metric)
                print(results[metric])
                print('-'*30)

        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) emits a RuntimeWarning; report nan directly when nothing was scored.
average_loss = np.mean(losses) if losses else float('nan')
print('Average Loss: ', average_loss)



Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'Cold-sprayed Al 6061 tensile specimens (in-plane L/T orientation) were tested using ASTM E8/E111; only the elastic modulus (61.4 ±0.3 GPa) was explicitly 

In [12]:
# Run the extraction prompt over every article that is NOT listed in the
# ground-truth file but is present as a PDF under documents/, and store the
# model answer in the article's 'extractedText' field of the output JSON.
modelName = "o3-mini"

INPUT_FILE = "misc/promptV2.txt"
OUTPUT_FILE = f'modelOutputs3-11/{modelName}_promptv2_high_nongt.json'

with open('articlesJsonTemplate.json') as f:
    articles = json.load(f)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

# The prompt is the same for every article, so read it once instead of
# re-opening the file on each loop iteration.
with open(INPUT_FILE, "r") as file:
    prompt_text = file.read()

# Create the output folder up front: otherwise the first write would fail
# only after an API call has already been made.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

for article in articles:
    print(article['filename'])

    # Files whose name contains '56' or '89' are excluded from this run.
    # NOTE(review): this is a substring test, so it would also exclude names
    # such as Article_156.pdf - confirm that only Article_56/Article_89 are meant.
    if '56' in article['filename'] or '89' in article['filename']:
        continue

    # Ground-truth entries are keyed by the filename without its extension.
    article_name = article['filename'].split('.')[0]

    inGroundTruth = article_name in ground_truth_articles['Articles']
    if inGroundTruth:
        print('Skipping because in groundtruth')

    articlePresent = os.path.exists(f'documents/{article["filename"]}')
    if not articlePresent:
        print('Skipping Because Article not found')

    if inGroundTruth or not articlePresent:
        continue

    print(f'{article["filename"]} Ready for Analysis')
    input_pdf = f"documents/{article['filename']}"
    article_text = pdf_to_text(input_pdf)

    # The prompt and the article text are sent as two consecutive user messages.
    messages = [
        {"role": "user", "content": prompt_text},
        {"role": "user", "content": article_text},
    ]
    completion = client.chat.completions.create(
        model=modelName,
        messages=messages,
        reasoning_effort='high'
    )
    raw_content = completion.choices[0].message.content
    print(raw_content)

    # The content can be None; normalise it to a string so the fence handling
    # below cannot raise AttributeError.
    extractedVals = (raw_content or "").strip()

    # Remove a surrounding Markdown code fence (``` or ```json), tolerating
    # whitespace around the fence.
    if len(extractedVals) >= 6 and extractedVals.startswith("```") and extractedVals.endswith("```"):
        extractedVals = extractedVals[3:-3]
        if extractedVals[:4].lower() == "json":
            extractedVals = extractedVals[4:]
        extractedVals = extractedVals.strip()

    try:
        article['extractedText'] = json.loads(extractedVals)
    except json.JSONDecodeError as e:
        # Keep the raw answer as text so it is not lost and can be inspected.
        print(f"Saving as TXT - Error decoding JSON: {e}")
        article['extractedText'] = extractedVals
    except Exception:
        print("Another Error Occured")
        raise

    # Rewrite the whole output file after each processed article, so results
    # already obtained are on disk if a later iteration fails.
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(articles, f, indent=4)
        

Article_1.pdf
Skipping because in groundtruth
Article_2.pdf
Skipping because in groundtruth
Skipping Because Article not found
Article_3.pdf
Skipping because in groundtruth
Article_4.pdf
Skipping because in groundtruth
Article_5.pdf
Skipping because in groundtruth
Article_6.pdf
Article_6.pdf Ready for Analysis
[
  {
    "yield_strength_value": "NA",
    "yield_strength_units": "NA",
    "ultimate_tensile_strength_value": "NA",
    "ultimate_tensile_strength_units": "NA",
    "ductility_value": "NA",
    "ductility_units": "NA",
    "hardness_value": 1.24,
    "hardness_units": "GPa",
    "modulus_value": "NA",
    "modulus_units": "NA",
    "notes": "Average nanoindentation hardness for gas‐atomized Al 6061 powder in the 25–32 μm size range, measured via nanoindentation."
  },
  {
    "yield_strength_value": "NA",
    "yield_strength_units": "NA",
    "ultimate_tensile_strength_value": "NA",
    "ultimate_tensile_strength_units": "NA",
    "ductility_value": "NA",
    "ductility_units"

In [13]:
import json

# Build the merged dataset: start from the non-ground-truth model outputs and,
# for every article that has a ground-truth entry, replace its 'extractedText'
# with that entry. Each article is tagged with a boolean 'ground_truth' flag.
with open('misc/groundTruthArticles.json', 'r') as f1, open('modelOutputs3-11/o3-mini_promptv2_high_nongt.json', 'r') as f2:
    ground_truth_articles = json.load(f1)
    non_gt = json.load(f2)

# Ground-truth entries, keyed by filename without its extension.
gt_by_name = ground_truth_articles['Articles']

for article in non_gt:
    print(article['filename'])
    article_name = article['filename'].split('.')[0]
    if article_name in gt_by_name:
        article['extractedText'] = gt_by_name[article_name]
        article['ground_truth'] = True
    else:
        article['ground_truth'] = False

# Write the merged file once, after all articles have been updated, instead of
# rewriting the complete file on every loop iteration.
with open('modelOutputs3-11/o3-mini_promptv2_high_merged.json', 'w') as f:
    json.dump(non_gt, f, indent=4)

Article_1.pdf
Article_2.pdf
Article_3.pdf
Article_4.pdf
Article_5.pdf
Article_6.pdf
Article_7.pdf
Article_8.pdf
Article_9.pdf
Article_10.pdf
Article_11.pdf
Article_12.pdf
Article_13.pdf
Article_14.pdf
Article_15.pdf
Article_16.pdf
Article_17.pdf
Article_18.pdf
Article_19.pdf
Article_20.pdf
Article_21.pdf
Article_22.pdf
Article_23.pdf
Article_24.pdf
Article_25.pdf
Article_26.pdf
Article_27.pdf
Article_28.pdf
Article_29.pdf
Article_30.pdf
Article_31.pdf
Article_32.pdf
Article_33.pdf
Article_34.pdf
Article_35.pdf
Article_36.pdf
Article_37.pdf
Article_38.pdf
Article_39.pdf
Article_40.pdf
Article_41.pdf
Article_42.pdf
Article_43.pdf
Article_44.pdf
Article_45.pdf
Article_46.pdf
Article_47.pdf
Article_48.pdf
Article_49.pdf
Article_50.pdf
Article_51.pdf
Article_52.pdf
Article_53.pdf
Article_54.pdf
Article_55.pdf
Article_56.pdf
Article_57.pdf
Article_58.pdf
Article_59.pdf
Article_60.pdf
Article_61.pdf
Article_62.pdf
Article_63.pdf
Article_64.pdf
Article_65.pdf
Article_66.pdf
Article_67.pdf
Arti