In [6]:
import os
import time
import pandas as pd
import openai
from dotenv import load_dotenv

# Load environment variables from .env file
# (kept ahead of the os.getenv lookup below so a key defined only in .env can be picked up).
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is unset; nothing is validated here, so a
# missing key is not reported at this point.
openai.api_key = os.getenv("OPENAI_API_KEY")

def summarize_text(text, max_words=1000, max_retries=5, rate_limit_wait=60, truncate_words=100):
    """
    Uses OpenAI's Chat API to summarize a text to a maximum of max_words words.
    On rate limit errors, waits rate_limit_wait seconds before retrying.
    On input size errors, truncates the last truncate_words words and retries.

    Args:
        text: Text to summarize. Non-string values are converted with str().
        max_words: Word budget stated in the prompt (requested of the model,
            not enforced on the response).
        max_retries: Total number of API attempts; rate-limit waits and
            truncations both consume attempts from this budget.
        rate_limit_wait: Seconds to sleep after a rate-limit error.
        truncate_words: Number of trailing words dropped after an
            input-too-large error. Must be at least 1.

    Returns:
        The stripped summary text, or "" when text is None or blank (no API
        call is made in that case).

    Raises:
        ValueError: If truncate_words is smaller than 1.
        Exception: "Text too short to truncate further. ..." when the input is
            too large but has no more than truncate_words words left, or
            "Max retries exceeded." when every attempt failed with a retryable
            error. Both are chained to the underlying API error.
        Any other exception raised by the API call is re-raised unchanged.
    """
    if truncate_words < 1:
        # words[:-0] would be an empty list, silently wiping the whole text.
        raise ValueError("truncate_words must be at least 1.")

    # Nothing to summarize: do not spend an API call on an empty prompt.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""

    # The prompt f-string would stringify a non-str value anyway; convert once
    # up front so the truncation path can always call .split().
    current_text = text if isinstance(text, str) else str(text)

    last_error = None
    for attempt in range(max_retries):
        prompt = (
            f"Please summarize the following text so that the final summary is no longer than {max_words} words. "
            "Include all key details in a concise manner.\n\n"
            f"Text:\n{current_text}\n\n"
            "Summary:"
        )
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.5,
            )
            return response.choices[0].message["content"].strip()
        except Exception as e:
            # Errors are classified by message text, as surfaced by the API.
            error_str = str(e)
            last_error = e
            if "Rate limit reached" in error_str:
                print(f"Rate limit reached: {error_str}")
                # Sleeping is pointless when no attempt is left to benefit from it.
                if attempt + 1 < max_retries:
                    print(f"Waiting for {rate_limit_wait} seconds before retrying...")
                    time.sleep(rate_limit_wait)
                continue
            if "Request too large" in error_str or "input or output tokens must be reduced" in error_str:
                # Drop the tail of the text and try again with a smaller prompt.
                words = current_text.split()
                if len(words) > truncate_words:
                    current_text = " ".join(words[:-truncate_words])
                    print(f"Input too large. Truncated the text by removing the last {truncate_words} words and retrying...")
                    continue
                raise Exception("Text too short to truncate further. " + error_str) from e
            # Unknown failure: propagate it with its original type and traceback.
            raise

    # Chain the last retryable error so the root cause stays visible.
    raise Exception("Max retries exceeded.") from last_error

def main(
    input_path="final_categorized_with_themes_and_summaries.parquet",
    output_path="summarized_summaries.csv",
    pause_seconds=1,
):
    """
    Condense every entry of the "summary" column and save the result as CSV.

    Args:
        input_path: Parquet file to read; it must contain a "summary" column.
        output_path: CSV file the augmented dataframe is written to.
        pause_seconds: Seconds to sleep after each row, whether or not the
            row succeeded.

    Returns:
        The dataframe with an added "condensed_summary" column, or None when
        the input has no "summary" column. Rows whose summarization raised an
        exception hold the placeholder "Error in summarization.".
    """
    # Load the Parquet file containing the summaries
    df = pd.read_parquet(input_path)

    # Ensure that there is a "summary" column in your dataframe
    if "summary" not in df.columns:
        print("No 'summary' column found in the dataframe.")
        return

    condensed_summaries = []

    # Only the "summary" column is read, so iterate over that Series instead of
    # building a full row object for every record with iterrows().
    for idx, original_summary in df["summary"].items():
        print(f"Processing row {idx}...")
        try:
            condensed = summarize_text(original_summary, max_words=1000)
        except Exception as e:
            # Best effort: record a placeholder so one bad row does not abort the run.
            print(f"Error summarizing row {idx}: {e}")
            condensed = "Error in summarization."
        else:
            print(f"Row {idx} summarized successfully.")
        condensed_summaries.append(condensed)
        # Pause briefly (if not rate-limited already)
        time.sleep(pause_seconds)

    # Add the condensed summaries to the dataframe
    df["condensed_summary"] = condensed_summaries

    # Save the new dataframe to a CSV file
    df.to_csv(output_path, index=False)
    print(f"All summaries processed and saved to {output_path}")
    return df

# Run the pipeline and keep the returned frame in the module-level name `df`
# (main() returns None when the "summary" column is missing).
if __name__ == "__main__":
    df = main()


Processing row 0...
Row 0 summarized successfully.
Processing row 1...
Row 1 summarized successfully.
Processing row 2...
Row 2 summarized successfully.
Processing row 3...
Row 3 summarized successfully.
Processing row 4...
Row 4 summarized successfully.
Processing row 5...
Row 5 summarized successfully.
Processing row 6...
Rate limit reached: Rate limit reached for gpt-4 in organization org-HlU6bIJXfgm1qTAnb5XjWFGy on tokens per min (TPM): Limit 10000, Used 7361, Requested 4494. Please try again in 11.13s. Visit https://platform.openai.com/account/rate-limits to learn more.
Waiting for 60 seconds before retrying...
Row 6 summarized successfully.
Processing row 7...
Row 7 summarized successfully.
Processing row 8...
Row 8 summarized successfully.
Processing row 9...
Row 9 summarized successfully.
Processing row 10...
Rate limit reached: Rate limit reached for gpt-4 in organization org-HlU6bIJXfgm1qTAnb5XjWFGy on tokens per min (TPM): Limit 10000, Used 4133, Requested 8065. Please try a

In [32]:
# Write the frame currently bound to `df` to a Parquet file
# (presumably the result of main() -- confirm, the execution counts show cells were run out of order).
df.to_parquet("summarized_summaries.parquet")

In [3]:
import pandas as pd

# Reload the saved Parquet file; this rebinds `df` to the frame read from disk.
df = pd.read_parquet("summarized_summaries.parquet")

In [20]:
# Spot check: show the "title" value stored under index label 12.
df.loc[12,"title"]

"PwC India's Financial Services (FS) Risk Symposium: Ministry of Finance keynote session - February 2025"

In [17]:
# len() of the stored value for index label 12; for a string this counts characters, not words,
# so it is not directly comparable to the max_words budget used when summarizing.
len(df.loc[12,"condensed_summary"])

1450