In [17]:
import pandas as pd
import numpy as np
import json
import os
from difflib import SequenceMatcher

In [20]:
import requests
import json

def get_crossref_metadata(title, timeout=30):
    """Look up the top Crossref match for a bibliographic title.

    Sends a single GET request to the Crossref ``/works`` endpoint using the
    ``query.bibliographic`` parameter and asks for one row.

    Args:
        title: Free-text title to search for.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on a stalled connection.

    Returns:
        The first entry of the response's ``message.items`` list, or ``None``
        when the request fails or times out, the status code is not 200, the
        body is not valid JSON, or the result list is empty.
    """
    url = "https://api.crossref.org/works"
    params = {
        "query.bibliographic": title,
        "rows": 1  # return the top result
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported the same way as a
        # non-200 response: no metadata available for this title.
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        # A 200 response whose body is not JSON carries no usable result.
        return None

    items = data.get("message", {}).get("items", [])
    return items[0] if items else None

def reduce_metadata(metadata):
    """Reduce a Crossref work record to the handful of fields kept per article.

    Args:
        metadata: A dict shaped like one Crossref ``items`` entry. Every key
            is optional.

    Returns:
        A dict with the keys "Title", "Abstract", "Date Published", "DOI",
        "Number of References", "Number of Citations", "Journal" and
        "Date Updated (of metadata)". Any field missing from ``metadata`` is
        reported as the string "NA".
    """
    # 'title' is a list; keep the first entry if there is one.
    title_list = metadata.get("title") or []
    title = title_list[0] if title_list else "NA"

    abstract = metadata.get("abstract", "NA")

    # 'issued' -> 'date-parts' is a list of [year, month, day] lists where
    # trailing parts may be absent. An unknown date can arrive as [[None]],
    # so stop at the first missing part instead of rendering it as "None".
    issued = metadata.get("issued") or {}
    date_parts = issued.get("date-parts") or []
    first_date = date_parts[0] if date_parts else []
    known_parts = []
    for part in first_date or []:
        if part is None:
            break
        known_parts.append(str(part))
    date_published = "-".join(known_parts) if known_parts else "NA"

    # Identifier and counts are copied through unchanged.
    doi = metadata.get("DOI", "NA")
    ref_count = metadata.get("reference-count", "NA")
    citations = metadata.get("is-referenced-by-count", "NA")

    # 'container-title' is a list; its first entry is used as the journal.
    container_title = metadata.get("container-title") or []
    journal = container_title[0] if container_title else "NA"

    # Timestamp taken from the record's 'indexed' entry.
    date_updated = (metadata.get("indexed") or {}).get("date-time", "NA")

    return {
        "Title": title,
        "Abstract": abstract,
        "Date Published": date_published,
        "DOI": doi,
        "Number of References": ref_count,
        "Number of Citations": citations,
        "Journal": journal,
        "Date Updated (of metadata)": date_updated
    }

def text_similarity(text1, text2):
    """Return the case-insensitive similarity ratio of two strings.

    Both inputs are lower-cased and compared with ``difflib.SequenceMatcher``;
    the result is its ``ratio()``, a float between 0.0 and 1.0.
    """
    left = text1.lower()
    right = text2.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

# Build the initial article index: one record per row of the '6061 Al' sheet.
# The index is only created when it does not exist yet, so an existing file
# is never overwritten by re-running this cell.
json_file = 'articlesMetaData.json'
if os.path.exists(json_file):
    print('Skipping Creation: File Already Exists')
else:
    df = pd.read_excel('documents/articles.xlsx', sheet_name='6061 Al')

    data = []
    for _, row in df.iterrows():
        article = {
            "filename": f"Article_{row['Index']}.pdf",
            # Empty spreadsheet cells are read as NaN; store them as "".
            "title": "" if pd.isna(row['Title']) else row['Title'],
            "link": "" if pd.isna(row['Link']) else row['Link'],
            # Written empty here; this cell does not populate it.
            "extractedText": ""
        }
        data.append(article)
    # Pin the encoding so the file is written the same way it is read back
    # (the reader opens it as UTF-8) regardless of the platform default.
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)


In [22]:
input_json_path = "articlesMetaData.json"

# Load the list of article records from disk.
with open(input_json_path, 'r', encoding='utf-8') as infile:
    articles = json.load(infile)

# Give every article a "metadata" entry. It is the reduced Crossref record
# when the returned title is at least 90% similar to the article's own title,
# otherwise a short string explaining why no record was attached.
for article in articles:
    article_title = article.get("title", "")
    if not article_title:
        article["metadata"] = "No title provided"
        continue

    crossref_metadata = get_crossref_metadata(article_title)
    if not crossref_metadata:
        article["metadata"] = "No metadata found"
        continue

    reduced = reduce_metadata(crossref_metadata)
    returned_title = reduced.get("Title", "NA")
    similarity = text_similarity(article_title, returned_title)
    # Reject the top hit when its title is not a close match to ours.
    article["metadata"] = reduced if similarity >= 0.9 else "No metadata found"

# Write the enriched records to a separate file (adjust the path as needed).
output_json_path = "articlesMetaDataCrossRef.json"
with open(output_json_path, 'w', encoding='utf-8') as outfile:
    json.dump(articles, outfile, indent=4)

print(f"Processed articles saved to {output_json_path}")


Processed articles saved to articlesMetaDataCrossRef.json
