In [1]:
from langchain_google_genai.llms import GoogleGenerativeAI
import pandas as pd
import json

from tqdm import tqdm
from IPython.display import display, Markdown

import os
import time
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key for the Gemini client
# created in the next cell -- confirm which variable the client expects.
load_dotenv()

True

In [3]:
# Text-generation client used for every summary in this notebook: the batch
# cell further down calls `llm.invoke(prompt)` once per quote.
# NOTE(review): credentials are presumably read from the environment -- confirm.
llm = GoogleGenerativeAI(model="gemini-2.5-flash")

In [4]:
# response = llm.invoke("Summarise the following Tolkien quote in simple English in a paragraph: 'Not all those who wander are lost.'")

In [5]:
# display(Markdown(response))

In [6]:
# Read the quotes file line by line as JSON, then transpose so that every
# quote record becomes one row. The single resulting column is labelled 0 and
# is renamed to "info"; each cell of it holds one record dict.
quotes_df = (
    pd.read_json("quotes_with_prompt.json", lines=True)
    .T
    .rename(columns={0: "info"})
)
quotes_df

Unnamed: 0,info
0,"{'quote': 'All that is gold does not glitter, ..."
1,{'quote': 'Not all those who wander are lost. ...
2,{'quote': 'I wish it need not have happened in...
3,{'quote': 'I don't know half of you half as we...
4,{'quote': 'All we have to decide is what to do...
...,...
2989,{'quote': 'He that breaks a thing to find out ...
2990,"{'quote': 'Home is behind, the world ahead, An..."
2991,{'quote': 'There is more in you of good than y...
2992,"{'quote': 'Good Morning!"" said Bilbo, and he m..."


In [7]:
# Sanity-check the nesting of the loaded data, from the outermost container
# down to the prompt text of the first record.
tuple(
    type(obj)
    for obj in (
        quotes_df,
        quotes_df["info"],
        quotes_df["info"][0],
        quotes_df["info"][0]["prompt_template"],
    )
)

(pandas.core.frame.DataFrame, pandas.core.series.Series, dict, str)

In [8]:
# Show the first record in full: a dict with the raw 'quote' text and the
# ready-to-send 'prompt_template' built from it.
quotes_df["info"][0]

{'quote': 'All that is gold does not glitter,\nNot all those who wander are lost;\nThe old that is strong does not wither,\nDeep roots are not reached by the frost.\n\nFrom the ashes a fire shall be woken,\nA light from the shadows shall spring;\nRenewed shall be blade that was broken,\nThe crownless again shall be king.\n',
 'prompt_template': 'Summarize the following quote in simple English in a paragraph: All that is gold does not glitter,\nNot all those who wander are lost;\nThe old that is strong does not wither,\nDeep roots are not reached by the frost.\n\nFrom the ashes a fire shall be woken,\nA light from the shadows shall spring;\nRenewed shall be blade that was broken,\nThe crownless again shall be king.\n'}

In [9]:
# Indexing format: quotes_df[column][row][dict_key] -> str
# i.e. pick the "info" column, then row 0, then the "prompt_template" entry of
# that row's dict. This string is what gets sent to the LLM for the quote.
quotes_df["info"][0]["prompt_template"]

'Summarize the following quote in simple English in a paragraph: All that is gold does not glitter,\nNot all those who wander are lost;\nThe old that is strong does not wither,\nDeep roots are not reached by the frost.\n\nFrom the ashes a fire shall be woken,\nA light from the shadows shall spring;\nRenewed shall be blade that was broken,\nThe crownless again shall be king.\n'

In [10]:
# Constants shared by the cells below.
output_dir = "summaries/"  # directory that receives one JSON file per processed batch
batch_size = 100  # number of quotes processed (and saved together) per batch

In [11]:
# Make sure the summaries folder exists before any batch file is written.
# exist_ok=True means re-running this cell is harmless when it already exists.
os.makedirs(output_dir, exist_ok=True)

In [12]:
# Resumable batch summarisation.
#
# Flow:
#   1. Load every quote (with its ready-made prompt) from quotes_with_prompt.json.
#   2. Re-attach summaries saved by an earlier run (quotes_with_summary.json).
#   3. Walk the quotes in batches of `batch_size`, calling the LLM only for
#      quotes that still lack a summary.
#   4. Persist progress after every batch -- and also when a batch is cut short
#      by an exception or a manual interrupt -- so that finished LLM calls are
#      never thrown away.


def _as_record_list(raw):
    """Return the records of a decoded JSON document as a list.

    Args:
        raw: Decoded JSON, either a list of records or a dict whose values
            are the records.

    Returns:
        The list itself, or the dict's values in their stored order.
    """
    return list(raw.values()) if isinstance(raw, dict) else raw


def _write_json_atomic(data, path):
    """Serialise `data` as JSON to `path` without risking a half-written file.

    The data is first written to a sibling temporary file and then moved over
    the target with `os.replace`, so an interrupt in the middle of the write
    leaves the previous version of `path` intact.

    Args:
        data: JSON-serialisable object to store.
        path: Destination file path.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)
    os.replace(tmp_path, path)


# Load full quotes from original file
with open("quotes_with_prompt.json", "r", encoding="utf-8") as f:
    quotes_raw = json.load(f)

# Ensure 'quotes' is a list, even if saved as a dict
quotes = _as_record_list(quotes_raw)

# If summaries exist, merge them in.
# The merge is positional: entry i of the summary file is applied to quote i.
# That matches how the file is produced below (quotes are summarised strictly in
# order, so the saved list is always a prefix of `quotes`).
if os.path.exists("quotes_with_summary.json"):
    with open("quotes_with_summary.json", "r", encoding="utf-8") as f:
        processed_raw = json.load(f)

    processed_quotes = _as_record_list(processed_raw)

    for quote, processed in zip(quotes, processed_quotes):
        if "summary" in processed:
            quote["summary"] = processed["summary"]

# Initialize with all quotes that already have summaries
quotes_with_summary = [quote for quote in quotes if "summary" in quote]

# Start batch processing
for start in range(0, len(quotes), batch_size):
    end = min(start + batch_size, len(quotes))
    print(f"\nProcessing batch {start} to {end - 1}")

    batch = []

    try:
        for i in tqdm(range(start, end)):
            quote = quotes[i]

            # Skip already summarized quotes (no LLM call, no pause)
            if "summary" in quote:
                batch.append(quote)
                continue

            # Generate summary using the LLM. `quote` is the same dict object
            # that lives in `quotes`, so the list sees the new key directly.
            quote["summary"] = llm.invoke(quote["prompt_template"])

            batch.append(quote)
            quotes_with_summary.append(quote)

            # Pause between requests.
            # NOTE(review): presumably to stay under the API rate limit -- confirm.
            time.sleep(2)

        # Save current batch (only when the whole batch completed, so the file
        # name's index range always matches its content)
        batch_file = os.path.join(output_dir, f"summary_{start}_{end - 1}.json")
        _write_json_atomic(batch, batch_file)
        print(f"Saved batch to '{batch_file}'")
    finally:
        # Save all processed quotes so far. Running this in `finally` keeps the
        # summaries of a partially processed batch when an LLM call fails or the
        # kernel is interrupted; the next run resumes right after them.
        _write_json_atomic(quotes_with_summary, "quotes_with_summary.json")
        print("Updated quotes_with_summary.json")


Processing batch 0 to 99


100%|██████████| 100/100 [00:00<00:00, 49795.84it/s]


Saved batch to 'summaries/summary_0_99.json'
Updated quotes_with_summary.json

Processing batch 100 to 199


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_100_199.json'
Updated quotes_with_summary.json

Processing batch 200 to 299


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_200_299.json'
Updated quotes_with_summary.json

Processing batch 300 to 399


100%|██████████| 100/100 [00:00<?, ?it/s]

Saved batch to 'summaries/summary_300_399.json'





Updated quotes_with_summary.json

Processing batch 400 to 499


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_400_499.json'
Updated quotes_with_summary.json

Processing batch 500 to 599


100%|██████████| 100/100 [00:00<00:00, 49760.40it/s]

Saved batch to 'summaries/summary_500_599.json'





Updated quotes_with_summary.json

Processing batch 600 to 699


100%|██████████| 100/100 [00:00<?, ?it/s]

Saved batch to 'summaries/summary_600_699.json'





Updated quotes_with_summary.json

Processing batch 700 to 799


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_700_799.json'
Updated quotes_with_summary.json

Processing batch 800 to 899


100%|██████████| 100/100 [00:00<00:00, 99179.57it/s]


Saved batch to 'summaries/summary_800_899.json'
Updated quotes_with_summary.json

Processing batch 900 to 999


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_900_999.json'
Updated quotes_with_summary.json

Processing batch 1000 to 1099


100%|██████████| 100/100 [00:00<00:00, 49807.67it/s]

Saved batch to 'summaries/summary_1000_1099.json'





Updated quotes_with_summary.json

Processing batch 1100 to 1199


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_1100_1199.json'
Updated quotes_with_summary.json

Processing batch 1200 to 1299


100%|██████████| 100/100 [00:00<00:00, 49713.22it/s]


Saved batch to 'summaries/summary_1200_1299.json'
Updated quotes_with_summary.json

Processing batch 1300 to 1399


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_1300_1399.json'
Updated quotes_with_summary.json

Processing batch 1400 to 1499


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_1400_1499.json'
Updated quotes_with_summary.json

Processing batch 1500 to 1599


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_1500_1599.json'
Updated quotes_with_summary.json

Processing batch 1600 to 1699


100%|██████████| 100/100 [00:00<?, ?it/s]


Saved batch to 'summaries/summary_1600_1699.json'
Updated quotes_with_summary.json

Processing batch 1700 to 1799


100%|██████████| 100/100 [12:46<00:00,  7.66s/it]


Saved batch to 'summaries/summary_1700_1799.json'
Updated quotes_with_summary.json

Processing batch 1800 to 1899


100%|██████████| 100/100 [13:12<00:00,  7.93s/it]


Saved batch to 'summaries/summary_1800_1899.json'
Updated quotes_with_summary.json

Processing batch 1900 to 1999


  0%|          | 0/100 [00:02<?, ?it/s]


KeyboardInterrupt: 