In [4]:
import pandas as pd
import numpy as np
import json
from tools import calculate_similarity, calculate_performance, pdf_to_text
import os

In [8]:
# Build an empty extraction template: one record per row of articles.xlsx,
# written to disk as JSON.
json_file = 'articlesJsonTemplate.json'

# Refuse to overwrite a template that already exists on disk.
if os.path.exists(json_file):
    raise FileExistsError("File already found")

df = pd.read_excel('articles.xlsx')

# Missing Title/Link cells become empty strings so the JSON contains no NaN;
# "extractedText" starts out as an empty-string placeholder.
data = [
    {
        "filename": f"Article_{row['Index']}.pdf",
        "title": "" if pd.isna(row['Title']) else row['Title'],
        "link": "" if pd.isna(row['Link']) else row['Link'],
        "extractedText": ""
    }
    for _, row in df.iterrows()
]

# Write to the same path that was checked above, so the guard and the write
# can never point at different files.
with open(json_file, 'w') as f:
    json.dump(data, f, indent=4)

# Initial Model Demo Test

In [None]:
from openai import OpenAI

# The API key is read from a text file one directory up, so it is not
# hard-coded in the notebook.
with open("../openAiToken.txt", "r") as key_file:
    api_key = key_file.read().strip()

# Optionally expose the key through the environment as well.
os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI(api_key=api_key)

# Smoke test: one short chat completion to confirm the client works.
completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about recursion in programming."},
    ],
    model="gpt-4o-mini",
)
print(completion.choices[0].message)


ChatCompletionMessage(content='Functions call themselves,  \nDepth in logic’s endless path,  \nPatterns loop like dreams.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [22]:
# Print the message object of the first choice in `completion`.
# NOTE(review): `completion` is whatever an earlier-executed cell left in the
# kernel; the execution counts are out of order, so confirm which request this shows.
print(completion.choices[0].message)

[
  {
    "yield_strength_value": 262.0,
    "yield_strength_units": "MPa",
    "ultimate_tensile_strength_value": 286.8,
    "ultimate_tensile_strength_units": "MPa",
    "ductility_value": 2.0,
    "ductility_units": "%",
    "hardness_value": 105.0,
    "hardness_units": "HV",
    "modulus_value": 67.5,
    "modulus_units": "GPa",
    "notes": "As-sprayed condition with average measurements from five discreet samples"
  },
  {
    "yield_strength_value": 147.5,
    "yield_strength_units": "MPa",
    "ultimate_tensile_strength_value": 195.1,
    "ultimate_tensile_strength_units": "MPa",
    "ductility_value": 13.0,
    "ductility_units": "%",
    "hardness_value": 65.7,
    "hardness_units": "HV",
    "modulus_value": 64.1,
    "modulus_units": "GPa",
    "notes": "Annealed condition with average values from three different tensile tests"
  },
  {
    "yield_strength_value": 203.4,
    "yield_strength_units": "MPa",
    "ultimate_tensile_strength_value": 216.5,
    "ultimate_tensile_

# Article Demo Test

### Note: This test uses GPT-4o-mini, which is not expected to perform well.

In [12]:
# Single-article demo: extract the text of one PDF and send it to the model
# together with the extraction prompt.
input_pdf = "documents/Article_8.pdf"
article_text = pdf_to_text(input_pdf)

with open("misc/promptV2.txt", "r") as prompt_file:
    prompt_text = prompt_file.read()

# The prompt goes in as the system message, the article text as the user message.
messages = [
    {"role": "system", "content": prompt_text},
    {"role": "user", "content": article_text},
]

completion = client.chat.completions.create(messages=messages, model="gpt-4o-mini")
print(completion.choices[0].message.content)


ChatCompletionMessage(content='[\n  {\n    "yield_strength_value": 262.0,\n    "yield_strength_units": "MPa",\n    "ultimate_tensile_strength_value": 286.8,\n    "ultimate_tensile_strength_units": "MPa",\n    "ductility_value": 2.0,\n    "ductility_units": "%",\n    "hardness_value": 105.0,\n    "hardness_units": "HV",\n    "modulus_value": 67.5,\n    "modulus_units": "GPa",\n    "notes": "As-sprayed condition with average measurements from five discreet samples"\n  },\n  {\n    "yield_strength_value": 147.5,\n    "yield_strength_units": "MPa",\n    "ultimate_tensile_strength_value": 195.1,\n    "ultimate_tensile_strength_units": "MPa",\n    "ductility_value": 13.0,\n    "ductility_units": "%",\n    "hardness_value": 65.7,\n    "hardness_units": "HV",\n    "modulus_value": 64.1,\n    "modulus_units": "GPa",\n    "notes": "Annealed condition with average values from three different tensile tests"\n  },\n  {\n    "yield_strength_value": 203.4,\n    "yield_strength_units": "MPa",\n    "ul

# Initial Model Performance Analysis

In [25]:
# Display the message object of the first choice in `completion` as the cell output.
# NOTE(review): relies on `completion` left in the kernel by an earlier-executed
# cell; confirm which request produced it.
completion.choices[0].message

ChatCompletionMessage(content='```json\n[\n    {\n        "yield_strength_value": 190,\n        "yield_strength_units": "MPa",\n        "ultimate_tensile_strength_value": 310,\n        "ultimate_tensile_strength_units": "MPa",\n        "ductility_value": "NA",\n        "ductility_units": "%",\n        "hardness_value": 100,\n        "hardness_units": "HV",\n        "modulus_value": "NA",\n        "modulus_units": "NA",\n        "notes": "As-atomized condition"\n    },\n    {\n        "yield_strength_value": 229,\n        "yield_strength_units": "MPa",\n        "ultimate_tensile_strength_value": "NA",\n        "ultimate_tensile_strength_units": "NA",\n        "ductility_value": "NA",\n        "ductility_units": "NA",\n        "hardness_value": "NA",\n        "hardness_units": "NA",\n        "modulus_value": "NA",\n        "modulus_units": "NA",\n        "notes": "Heat-treated condition after powder annealing"\n    },\n    {\n        "yield_strength_value": 77,\n        "yield_strength_u

In [62]:
# --- Run configuration -------------------------------------------------------
# Model to query; the alternatives that were tried are left commented out.
# modelName = "gpt-4o-mini"
# modelName = "gpt-4o"
modelName = "o1-mini"
# modelName = "o1-preview"

# Prompt file to send and the JSON file that collects the model output.
# INPUT_FILE = "misc/promptV2.txt"
INPUT_FILE = "misc/promptV4.txt"
# OUTPUT_FILE = f'modelOutputs/{modelName}.json'
OUTPUT_FILE = f'modelOutputs/{modelName}_v4.json'


def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence.

    Leading/trailing whitespace is removed first, so a reply such as
    "```json\\n[...]\\n```\\n" is still recognised as fenced. Both "```json"
    and bare "```" opening fences are handled. Text that is not fenced is
    returned stripped of surrounding whitespace.
    """
    cleaned = text.strip()
    if len(cleaned) >= 6 and cleaned.startswith("```") and cleaned.endswith("```"):
        cleaned = cleaned[3:-3]
        # Drop the optional language tag that follows the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    return cleaned.strip()


with open('articlesJsonTemplate.json') as f:
    articles = json.load(f)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

# The prompt is identical for every article, so read it once up front. This
# also fails fast on a bad path before any API request is made.
with open(INPUT_FILE, "r") as file:
    prompt_text = file.read()

# Make sure the output directory exists before the first request, so a missing
# folder cannot discard a completed API call.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

for article in articles:
    print(article['filename'])

    # Only articles that have a ground-truth entry are processed.
    inGroundTruth = article['filename'].split('.')[0] in ground_truth_articles['Articles']
    if not inGroundTruth:
        print('Skipping because not in groundtruth')

    # ...and only if the PDF is actually on disk.
    articlePresent = os.path.exists(f'documents/{article["filename"]}')
    if not articlePresent:
        print('Skipping Because Article not found')

    if not (inGroundTruth and articlePresent):
        continue

    print(f'{article["filename"]} Ready for Analysis')
    input_pdf = f"documents/{article['filename']}"
    article_text = pdf_to_text(input_pdf)

    # NOTE(review): the prompt is sent with role "user" rather than "system"
    # (unlike the single-article demo) -- presumably because the o1 models
    # selected above did not accept system messages; confirm.
    messages = [
        {"role": "user", "content": prompt_text},
        {"role": "user", "content": article_text},
    ]
    completion = client.chat.completions.create(
        model=modelName,
        messages=messages
    )
    raw_reply = completion.choices[0].message.content
    print(raw_reply)

    # `content` can be None (e.g. no text returned); treat that as an empty
    # reply instead of crashing on a string method.
    extractedVals = _strip_code_fence(raw_reply or "")

    try:
        article['extractedText'] = json.loads(extractedVals)
    except json.JSONDecodeError as e:
        # Keep the unparsed text so the reply is not lost; any other exception
        # propagates unchanged.
        print(f"Saving as TXT - Error decoding JSON: {e}")
        article['extractedText'] = extractedVals

    # Checkpoint after every article so partial progress survives a later failure.
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(articles, f, indent=4)
    

Article_1.pdf
Article_1.pdf Ready for Analysis
```json
[
  {
    "yield_strength_value": "NA",
    "yield_strength_units": "MPa",
    "ultimate_tensile_strength_value": "NA",
    "ultimate_tensile_strength_units": "MPa",
    "ductility_value": "NA",
    "ductility_units": "%",
    "hardness_value": "NA",
    "hardness_units": "NA",
    "modulus_value": "NA",
    "modulus_units": "GPa",
    "notes": "Yield strength, tensile strength, and elongation were 16–17%, 10–11%, and 77–78% lower than wrought Al 6061, respectively."
  },
  {
    "Extracted_paper_author": "Scott E. Julien, Ahmad Nourian-Avvala, Wentao Liang, Tricia Schwartz, Ozan C. Ozdemir, Sinan Müftü",
    "Extracted_publication_year": "2022",
    "graph_verification_required": "LOW"
  }
]
```
Article_2.pdf
Skipping Because Article not found
Article_3.pdf
Article_3.pdf Ready for Analysis
```json
[
  {
    "yield_strength_value": "NA",
    "yield_strength_units": "NA",
    "ultimate_tensile_strength_value": 107,
    "ultimate_ten

In [None]:
# Score one model's saved extractions against the ground truth and report the
# per-article results plus the average overall loss.
# NOTE(review): this reads modelOutputs/{modelName}.json, while the extraction
# cell writes to OUTPUT_FILE (modelOutputs/{modelName}_v4.json) -- confirm
# which file is meant to be evaluated.
with open(f'modelOutputs/{modelName}.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

ground_truth_by_article = ground_truth_articles.get("Articles", {})
losses = []

for row in data:
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # "" means the article has no extraction; None means the key is absent.
    # Neither can be scored.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_by_article.get(rowName)

        if ground_truth_article is not None:
            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            for metric_name in results:
                print(metric_name)
                print(results[metric_name])
                print('-'*30)
        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) returns nan and emits a RuntimeWarning, so report the empty case
# explicitly instead.
if losses:
    print('Average Loss: ', np.mean(losses))
else:
    print('Average Loss: n/a (no articles were scored)')

Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 252, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 283, 'ultimate_tensile_strength_units': 'MPa', 'ductility_value': 1, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'Tensile properties measured in the two in-plane deposit directions (long-rastering and stepping)'}, {'yield_strength_value': 240, 'yield_strength_units': 'MPa

In [61]:
# Score one model's saved extractions against the ground truth and report the
# per-article results plus the average overall loss.
# NOTE(review): this reads modelOutputs/{modelName}.json, while the extraction
# cell writes to OUTPUT_FILE (modelOutputs/{modelName}_v4.json) -- confirm
# which file is meant to be evaluated.
with open(f'modelOutputs/{modelName}.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

ground_truth_by_article = ground_truth_articles.get("Articles", {})
losses = []

for row in data:
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # "" means the article has no extraction; None means the key is absent.
    # Neither can be scored.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_by_article.get(rowName)

        if ground_truth_article is not None:
            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            for metric_name in results:
                print(metric_name)
                print(results[metric_name])
                print('-'*30)
        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) returns nan and emits a RuntimeWarning, so report the empty case
# explicitly instead.
if losses:
    print('Average Loss: ', np.mean(losses))
else:
    print('Average Loss: n/a (no articles were scored)')

Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 'NA', 'modulus_units': 'NA', 'notes': 'NA'}]
Field RMSE
{'yield_strength_value': np.float64(1.0), 'ultimate_tensile_strength_value': 0, 'ductility_value': np.float64(1.0), 'hardness_value': 0, '

In [59]:
# Score the saved o1-mini extractions against the ground truth and report the
# per-article results plus the average overall loss.
modelName = 'o1-mini'

# NOTE(review): this reads modelOutputs/{modelName}.json, while the extraction
# cell writes to OUTPUT_FILE (modelOutputs/{modelName}_v4.json) -- confirm
# which file is meant to be evaluated.
with open(f'modelOutputs/{modelName}.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

ground_truth_by_article = ground_truth_articles.get("Articles", {})
losses = []

for row in data:
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # "" means the article has no extraction; None means the key is absent.
    # Neither can be scored.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_by_article.get(rowName)

        if ground_truth_article is not None:
            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            for metric_name in results:
                print(metric_name)
                print(results[metric_name])
                print('-'*30)
        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) returns nan and emits a RuntimeWarning, so report the empty case
# explicitly instead.
if losses:
    print('Average Loss: ', np.mean(losses))
else:
    print('Average Loss: n/a (no articles were scored)')

Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'Mean elastic modulus for L/T cold sprayed Al 6061 specimens'}]
Field RMSE
{'yield_strength_value': np.float64(1.0), 'ultimate_tensile_strength_value': 0, 

In [58]:
# Score one model's saved extractions against the ground truth and report the
# per-article results plus the average overall loss.
# NOTE(review): this reads modelOutputs/{modelName}.json, while the extraction
# cell writes to OUTPUT_FILE (modelOutputs/{modelName}_v4.json) -- confirm
# which file is meant to be evaluated.
with open(f'modelOutputs/{modelName}.json', 'r') as file:
    data = json.load(file)

with open('misc/groundTruthArticles.json') as f:
    ground_truth_articles = json.load(f)

ground_truth_by_article = ground_truth_articles.get("Articles", {})
losses = []

for row in data:
    rowName = row['filename'].split('.')[0]
    rowResults = row.get('extractedText')
    # "" means the article has no extraction; None means the key is absent.
    # Neither can be scored.
    if rowResults is not None and rowResults != "":
        ground_truth_article = ground_truth_by_article.get(rowName)

        if ground_truth_article is not None:
            print(f"Ground Truth for {rowName}:")
            print(type(ground_truth_article))
            print(ground_truth_article)
            print('-'*30)
            print(f"Extracted for {rowName}:")
            print(type(rowResults))
            print(rowResults)
            results = calculate_performance(ground_truth_article, rowResults)
            losses.append(results['Overall Loss'])
            for metric_name in results:
                print(metric_name)
                print(results[metric_name])
                print('-'*30)
        else:
            print(f"No ground truth found for '{rowName}'.\n")
    print('\n'*3)

# np.mean([]) returns nan and emits a RuntimeWarning, so report the empty case
# explicitly instead.
if losses:
    print('Average Loss: ', np.mean(losses))
else:
    print('Average Loss: n/a (no articles were scored)')

Ground Truth for Article_1:
<class 'list'>
[{'yield_strength_value': 260, 'yield_strength_units': 'MPa', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 2.6, 'ductility_units': '%', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'He-sprayed Al6061 sample (Al-2-1) with improved particle impact velocity; shows significant tensile strength increase and ductility.'}]
------------------------------
Extracted for Article_1:
<class 'list'>
[{'yield_strength_value': 'NA', 'yield_strength_units': 'NA', 'ultimate_tensile_strength_value': 'NA', 'ultimate_tensile_strength_units': 'NA', 'ductility_value': 'NA', 'ductility_units': 'NA', 'hardness_value': 'NA', 'hardness_units': 'NA', 'modulus_value': 61.4, 'modulus_units': 'GPa', 'notes': 'Elastic modulus measured for L/T cold-sprayed Al 6061 specimens, sprayed using helium as the accelerating gas.'}]
Field RMSE
{'yield_strength_value': np.f