<a href="https://colab.research.google.com/github/simeonwetzel/notebooks/blob/main/Eartharxiv_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Requirements

- OSF token for fetching records from EarthArXiv. Create one at https://osf.io/

- Groq API key for LLM inference. Get a free account and API key at https://groq.com/

In [None]:
from getpass import getpass

# Ask for the OSF token interactively; getpass does not echo the typed value.
osf_token = getpass("OSF_TOKEN=")

OSF_TOKEN=··········


In [None]:
import os
# Ask for the Groq API key interactively and store it in the GROQ_API_KEY
# environment variable (presumably where the Groq client looks for it — confirm).
# `getpass` is the name imported in the OSF-token cell, so that cell must run first.
os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

Enter your Groq API key: ··········


Install necessary packages:

In [None]:
# Install or upgrade the Groq integration for LangChain (-q: quiet, -U: upgrade).
# NOTE(review): a later cell imports from `langchain.output_parsers`; the
# `langchain` package is not installed explicitly here — confirm the runtime has it.
!pip install -qU langchain-groq

# 1. Download EarthArXiv dump

In [None]:
import requests
from tqdm import tqdm

def fetch_all_preprints(base_url: str, params: dict, headers: dict = None):
    """Download every record from a paginated OSF-style listing endpoint.

    Pages are requested one after another through the ``page`` query
    parameter until a response no longer carries a ``links.next`` URL.

    Args:
        base_url: Listing endpoint to query.
        params: Query parameters (e.g. a provider filter). The dict is
            copied, so the caller's object is left untouched.
        headers: Optional HTTP headers (e.g. an ``Authorization`` bearer
            token) sent with every request. Defaults to no extra headers.

    Returns:
        A list with the ``data`` entries of all pages that could be fetched.
        Empty if the very first request fails; if a later page fails, the
        records collected up to that point are returned.
    """
    query = dict(params)  # work on a copy instead of mutating the caller's dict
    all_records = []
    page = 1

    # The first page doubles as the source of the pagination metadata,
    # so it is requested only once.
    response = requests.get(url=base_url, params={**query, 'page': page}, headers=headers)
    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        return []

    data = response.json()
    meta = data['links'].get('meta') or {}
    per_page = meta.get('per_page') or 10  # fall back to the page size of 10 assumed so far
    # Ceiling division keeps the progress-bar total an integer.
    # `or None` lets tqdm show an open-ended bar when no total is reported.
    total_pages = -(-meta.get('total', 0) // per_page) or None

    with tqdm(total=total_pages, desc="Fetching preprints") as pbar:
        while True:
            all_records.extend(data['data'])
            pbar.update(1)  # count the page that was just consumed, including the last one

            # The 'next' key is present even on the last page, with a null
            # value, so test the value rather than the key's existence.
            if not data['links'].get('next'):
                break

            page += 1
            response = requests.get(url=base_url, params={**query, 'page': page}, headers=headers)
            if response.status_code != 200:
                print(f"Error fetching data: {response.status_code}")
                break
            data = response.json()

    return all_records

# Query the OSF preprints listing, restricted to the EarthArXiv provider.
base_url = "https://api.osf.io/v2/preprints/"
params = {"filter[provider]": "eartharxiv"}

# NOTE(review): `headers` (carrying the OSF token) is built here but is not
# handed to fetch_all_preprints below, so the requests go out without it —
# confirm whether that is intended.
headers = {'Content-Type': 'application/json',
          'Authorization': 'Bearer {0}'.format(osf_token)}

# Download all pages; a progress bar is shown while fetching.
all_preprints = fetch_all_preprints(base_url, params)

# Report how many records were collected in total.
print(f"Total records fetched: {len(all_preprints)}")


  full_bar = Bar(frac,
Fetching preprints: 100%|██████████| 165/164.9 [01:30<00:00,  1.83it/s]

Error fetching data: 404
Total records fetched: 1649





# 2. Extract spatial context with LLM

In [None]:
from langchain_groq import ChatGroq

# Chat model used for the spatial-context extraction below.
# NOTE(review): verify that this model id is still offered by Groq.
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0,  # presumably chosen to keep the extraction output as repeatable as possible
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import PromptTemplate
from langchain_core.exceptions import OutputParserException
from langchain.output_parsers import OutputFixingParser

class LocationContext(BaseModel):
    """Target structure for the LLM answer: a place name plus its bounding box.

    Passed to JsonOutputParser as ``pydantic_object``; the field descriptions
    are presumably surfaced to the model through the parser's format
    instructions (confirm against the LangChain docs).
    """
    # Free-text place name.
    location: str = Field(description="A location. Can be a city, region, country or a continent.")
    # Declared as a list of strings; the description asks for [ymin, xmin, ymax, xmax] order.
    bbox: list[str] = Field(description="Bounding Box in format [ymin, xmin, ymax, xmax]")

# JSON parser configured with the LocationContext schema.
parser = JsonOutputParser(pydantic_object=LocationContext)

# Second parser built from the same parser plus the LLM; extract_spatial_reference
# falls back to it when plain parsing raises.
output_fixer = OutputFixingParser.from_llm(parser=parser, llm=llm)

# Prompt with one runtime variable ("input"); the format instructions are taken
# from the parser once, when the template is created.
prompt = PromptTemplate(
    template="""You are an expert in extracting the geographic references.
            Your task is to extract the location reference with a bounding box [ymin, xmin, ymax, xmax] that is most suitable to the revieced text.

            Format instructions: {format_instructions}

            Do not generate further textual explanations. Only output a JSON with location and bbox. Leave JSON values empty where you can't extract the information.
            Query: {input}
            """,
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# The chain ends at the model; parsing happens separately in
# extract_spatial_reference so that a parse failure can be retried there.
chain = prompt | llm

def extract_spatial_reference(input_text: str):
    """Ask the LLM for the location and bounding box that best match a text.

    The raw model answer is parsed as JSON. If that fails, the answer is
    passed to ``output_fixer`` for one repair attempt. If the repair fails
    too, an empty result is returned instead of raising, so a single bad
    answer does not abort a batch run.

    Args:
        input_text: Text to analyse (e.g. title and abstract of a record).

    Returns:
        A dict that always contains the keys ``"location"`` and ``"bbox"``.
        Values the model did not provide default to ``""`` and ``[]``.
        Any additional keys returned by the model are kept.
    """
    output = chain.invoke({"input": input_text})

    try:
        parsed_output = parser.parse(output.content)
    except OutputParserException:
        print("Output parsing failed. Trying to fix format.")
        print(f"Misformed output: {output.content}")
        try:
            parsed_output = output_fixer.parse(output.content)
        except OutputParserException:
            print("Could not repair output. Returning empty spatial context.")
            parsed_output = {}

    # The JSON is not validated against the schema, so the model may return
    # a non-object or omit keys; normalise so callers can index safely.
    if not isinstance(parsed_output, dict):
        parsed_output = {}
    return {"location": "", "bbox": [], **parsed_output}


In [None]:
def format_title_abstract(doc: dict) -> str:
    """Combine a record's title and abstract into one "title: abstract" string.

    Both values are read from the record's ``attributes`` mapping; the
    abstract is stored there under the ``description`` key.
    """
    attributes = doc['attributes']
    title, abstract = attributes['title'], attributes['description']
    return f"{title}: {abstract}"

# 3. Apply LLM chain to all fetched preprints

In [None]:
from tqdm import tqdm

def apply_spatial_referencing(preprints: list[dict]) -> list[dict]:
    """Return copies of the given records, each extended by a "spatial_context" entry.

    For every record, the title/abstract text is sent through
    extract_spatial_reference and the result is stored under the
    ``spatial_context`` key of a new dict. The input records are not modified.
    A progress bar tracks the iteration.
    """
    progress = tqdm(preprints, desc="Extracting spatial context")
    return [
        {**record, "spatial_context": extract_spatial_reference(format_title_abstract(record))}
        for record in progress
    ]

We test the function here with a subset of 50 records.

In [None]:
# Only the first 50 fetched records are sent through the LLM here.
subset_preprints = apply_spatial_referencing(preprints=all_preprints[:50])

Extracting spatial context: 100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


# 4. Display parsed information as pandas dataframe

In [None]:
def _spatial_row(record: dict) -> dict:
    """Flatten one enriched preprint into a row for the results table.

    The spatial context comes from unvalidated LLM output, so it is read
    defensively: a missing or non-dict context, or a missing key, yields
    ``None`` in the corresponding column instead of raising.
    """
    context = record.get('spatial_context')
    if not isinstance(context, dict):
        context = {}

    location = context.get('location')
    bbox = context.get('bbox')
    # A bounding box only counts when it is non-empty and has no blank coordinate.
    bbox_is_complete = bool(bbox) and not any(item == '' for item in bbox)

    return {
        'id': record['id'],
        'title': record['attributes']['title'],
        'description': record['attributes']['description'],
        'AI_parsed_location': location if location else None,
        'AI_parsed_BBOX': bbox if bbox_is_complete else None,
    }


# One row (dict) per enriched preprint; despite its name this is a list.
formatted_dict = [_spatial_row(record) for record in subset_preprints]

In [None]:
import pandas as pd

# Build the table from the list of row dicts; the bare `df` on the last line
# makes the notebook render it.
df = pd.DataFrame(formatted_dict)
df

Unnamed: 0,id,title,description,AI_parsed_location,AI_parsed_BBOX
0,2p9wg,Uncertainties in Projected Rainfall over Brazi...,The aim of this study is to answer four main q...,Brazil,"[-22.5, -75, 10.5, -30]"
1,eayph,What does the NDVI really tell us about crops?...,The use of remote sensing in agriculture is ex...,,
2,n8hz7,"A continental-scale assessment of density, siz...",Farm dams are a cornerstone of modern agricult...,Australia,"[-44.1, 112.9, -9.1, 154.0]"
3,g2uxy,Structure and age relationship of joint sets o...,Outcrop studies of fracture networks are impor...,"Lilstock Benches, UK","[51.4374, 3.1544, 51.4574, 3.1744]"
4,ktcde,An analytical solution to the Navier–Stokes eq...,This paper is concerned with obtaining a formu...,,
5,qhtb6,The composition and weathering of the continen...,The composition of continental crust records a...,,
6,t8dm4,GARPOS: analysis software for the GNSS-A seafl...,Global Navigation Satellite System – Acoustic ...,,
7,2rxbn,Quantification of non-linear multiphase flow i...,We measure the pressure difference during two-...,,
8,huz73,Correcting 19th and 20th century sea surface t...,Changes in the statistics of North Atlantic hu...,North Atlantic,"[25.0, -75.0, 50.0, -40.0]"
9,ju26e,Identifying and correcting the World War 2 war...,Most foregoing estimates of historical sea sur...,,
