In [None]:
cd ../..

In [None]:
import os
import random
import time

import pandas as pd
from mistralai import Mistral
from mistralai.models import SDKError
from tqdm import tqdm

# Enable the `progress_apply` method on pandas objects; it is used further down
# to show a progress bar while the summaries are generated.
tqdm.pandas()

# Settings

In [None]:
# Input parquet file with the publication texts (read with `pd.read_parquet`).
TEXTS_FILE = "data/SB_publication_PMC_texts.parquet"
# Output parquet file that receives the texts together with their generated main ideas.
OUTPUT_FILE = "data/SB_publication_PMC_texts_main_ideas.parquet"

# Read texts file

In [None]:
# Load the publication table; the cells below rely on its `title` and `text` columns.
texts = pd.read_parquet(TEXTS_FILE)
texts.head()

In [None]:
# Length of each article text, used below to select the long articles.
# `.str.len()` is the vectorized equivalent of `.apply(len)`; missing texts are
# counted as length 0 instead of raising TypeError.
texts["len"] = texts["text"].fillna("").str.len()

In [None]:
# Only articles whose `len` value is strictly greater than this threshold are summarized.
MIN_LEN = 20_000

# Select the long rows and the two needed columns in one step, then renumber the rows from 0.
is_long = texts["len"] > MIN_LEN
texts_long = texts.loc[is_long, ["title", "text"]].reset_index(drop=True)

# Generate main ideas

In [None]:
# Chat model used for every request.
model = "magistral-small-latest"
# Read the key from the environment so it never has to be pasted into (and
# committed with) the notebook; the empty-string fallback equals the previous
# placeholder value.
api_key = os.environ.get("MISTRAL_API_KEY", "")

client = Mistral(api_key=api_key)

def call_model(query: str) -> str:
    """Send ``query`` to the chat model as a single user message and return the reply text.

    Uses the module-level ``client`` and ``model``.

    Args:
        query: Full prompt text, sent with role ``"user"``.

    Returns:
        The reply text. When the reply content is a list of chunks, the ``text``
        of the last chunk is returned; when it is already a plain string, that
        string is returned unchanged.
    """
    chat_response = client.chat.complete(
        model=model,
        messages=[{"role": "user", "content": query}],
    )
    content = chat_response.choices[0].message.content
    if isinstance(content, str):
        # Indexing a string yields a one-character `str`, so the former
        # `content[-1].text` raised AttributeError for plain-string replies.
        return content
    return content[-1].text

def safe_call_model(prompt, max_retries=5, sleep_seconds=60.0):
    """Call ``call_model(prompt)``, pausing and retrying while the API reports a rate limit.

    Args:
        prompt: Text passed unchanged to ``call_model``.
        max_retries: Maximum number of attempts.
        sleep_seconds: Base pause between attempts; up to one extra second of
            random jitter is added to each pause.

    Returns:
        The value returned by ``call_model`` on the first successful attempt.

    Raises:
        SDKError: Re-raised immediately for any SDK error that is not a rate limit.
        RuntimeError: When every attempt was rate limited; chained to the last
            ``SDKError`` so the underlying response stays visible.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            return call_model(prompt)
        except SDKError as e:
            # A rate limit is recognised by HTTP status 429 when the error
            # exposes a status code, or by its message text otherwise.
            rate_limited = (
                getattr(e, "status_code", None) == 429
                or "rate limit" in str(e).lower()
            )
            if not rate_limited:
                raise
            last_error = e
            if attempt == max_retries - 1:
                # No further attempt follows, so waiting would only delay the failure.
                break
            sleep_time = sleep_seconds + random.random()
            print(f"Rate limit hit. Sleeping for {sleep_time:.1f}s...")
            time.sleep(sleep_time)
    raise RuntimeError("Max retries exceeded due to rate limiting") from last_error

In [None]:
# Instruction text placed in front of every article: the article body is
# concatenated directly after it, and `call_model` sends the combined string as
# one user-role message.
system_prompt = """
Exploring the Moon and Mars safely and efficiently requires understanding how humans, plants, and other living systems respond to the space environment. NASA’s Biological and Physical Sciences Division (BPS) funds research to address key scientific questions that enable future human exploration. These in-space experiments have produced numerous discoveries that inform mission planning, risk assessment, and technology development.

However, the vast number of publications describing these studies makes it difficult to grasp the overall progress, identify research gaps, or extract actionable insights. Modern approaches in AI and informatics can help summarize and organize this information more effectively.

Your task:
- You will receive a scientific article related to space biology or physical sciences in space. Read it carefully and produce a concise summary of its key ideas.

Output requirements:
- Output with 3 sections: Goal, Methodology and Conclusions.
- 1 sentence per section.
- No overall title.

Write in a tone suitable for:
- Scientists developing new hypotheses
- NASA program managers identifying research opportunities
- Mission architects designing safe and efficient Moon/Mars exploration strategies.

Output example:

- **Goal**: blah blah blah
- **Methodology**: blah blah blah
- **Conclusions**: blah blah blah

ARTICLE TO REVIEW:

"""

In [None]:
def summarize_article(article_text):
    """Return the model's reply for one article: the instructions followed by the article text."""
    return safe_call_model(system_prompt + article_text)


# One model call per row of `texts_long`, with a progress bar.
texts_long["main_ideas"] = texts_long["text"].progress_apply(summarize_article)
texts_long.head()

In [None]:
# Left-join the summaries back onto the full table by title, then replace every
# missing value (e.g. rows that received no summary) with an empty string.
# NOTE(review): both frames carry a `text` column, so pandas' default suffixes
# produce `text_x`/`text_y` in the result — confirm whether downstream readers
# rely on that before narrowing the right-hand side to `["title", "main_ideas"]`.
texts = pd.merge(texts, texts_long, how="left", on="title").fillna("")
texts.head()

In [None]:
# Write the merged table (publication columns plus `main_ideas`) to OUTPUT_FILE.
texts.to_parquet(OUTPUT_FILE)