<a href="https://colab.research.google.com/github/seismosmsr/machine_learning/blob/main/summarize-chatgpt_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
# for google colab you have to install this 2 library before run the code
!pip install pypdf2
!pip install openai



In [43]:
import openai
import os
import PyPDF2
import json
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import zipfile
import numpy as np

In [44]:
def delete_output_files(output_directory):
    """Remove every entry found directly inside ``output_directory``.

    One line is printed per entry: either a success message or the ``OSError``
    that prevented the removal. Removal errors are reported, not re-raised.

    :param output_directory: Path of the directory whose entries are removed
    """
    for entry_name in os.listdir(output_directory):
        entry_path = os.path.join(output_directory, entry_name)
        try:
            os.remove(entry_path)
            print(f"Successfully deleted {entry_name}")
        except OSError as error:
            print(f"Error deleting {entry_name}: {error}")

In [182]:
def merge_texts(texts, chunk_size=1000, max_chunks=10, overlap_size=200):
    """
    Merge a list of texts into larger, overlapping chunks and track the origin of each chunk.

    Texts are split on whitespace and their words are packed, space separated,
    into chunks. The words added to a chunk after it has grown past
    ``chunk_size - overlap_size`` characters are repeated at the start of the
    next chunk. Once ``max_chunks - 1`` chunks have been emitted, all words that
    have not been placed yet go into one final chunk, whatever its size.

    :param texts: List of texts to be merged
    :param chunk_size: Preferred size of each merged chunk (enlarged automatically
        when the input would not fit into ``max_chunks`` chunks of this size)
    :param max_chunks: Maximum number of chunks to be created
    :param overlap_size: Number of characters to overlap between chunks
    :return: Tuple of two lists: merged chunks and, for each chunk, the sorted
        list of indices of the texts that contributed words to it
    """
    if not texts:
        return [], []

    # Grow the chunk size when the input cannot fit into max_chunks chunks of
    # the requested size once the overlap is accounted for.
    total_length = sum(len(text) for text in texts)
    if total_length > max_chunks * (chunk_size - overlap_size):
        chunk_size = (total_length + overlap_size * (max_chunks - 1)) // max_chunks

    merged_chunks = []
    chunk_indices = []
    current_chunk = ""
    current_indices = set()
    # Tail of the current chunk (and the texts it came from) that is replayed
    # at the start of the next chunk.
    remaining_overlap = ""
    remaining_indices = set()

    for i, text in enumerate(texts):
        words = text.split()
        for word_pos, word in enumerate(words):
            if len(current_chunk) + len(word) + 1 > chunk_size:
                # The current chunk is full: emit it and seed the next one with the overlap.
                merged_chunks.append(current_chunk.strip())
                chunk_indices.append(sorted(current_indices))
                current_chunk = remaining_overlap + " "
                current_indices = remaining_indices.copy()
                remaining_overlap = ""
                remaining_indices = set()
                if len(merged_chunks) == max_chunks - 1:
                    # Last allowed chunk: take only the words that have NOT been
                    # placed yet (the rest of this text plus all later texts).
                    # Re-joining the whole of texts[i:] would repeat words of
                    # text i that were already emitted.
                    pending_words = words[word_pos:]
                    tail_indices = {i}
                    for later_index in range(i + 1, len(texts)):
                        later_words = texts[later_index].split()
                        if later_words:
                            pending_words.extend(later_words)
                            # Record every text that really ends up in this chunk.
                            tail_indices.add(later_index)
                    merged_chunks.append((current_chunk + " ".join(pending_words)).strip())
                    chunk_indices.append(sorted(current_indices.union(tail_indices)))
                    return merged_chunks, chunk_indices
            current_chunk += word + " "
            if len(current_chunk) > chunk_size - overlap_size:
                # This word lies in the overlap zone of the current chunk.
                remaining_overlap += word + " "
                remaining_indices.add(i)
            current_indices.add(i)

    if current_chunk:
        merged_chunks.append(current_chunk.strip())
        chunk_indices.append(sorted(current_indices))

    return merged_chunks, chunk_indices


In [175]:
import time
import openai

def read_pdf_and_summarize(file_title, run_id, output_directory, max_retries=3, delay=300,
                           pdf_directory='/content/crop_paper_share/'):
    """
    Summarize a PDF chunk by chunk with the OpenAI chat API and write the replies to disk.

    Every page is lower-cased, split into paragraphs on ``'\\n \\n'`` and merged
    into overlapping chunks with ``merge_texts``. Each chunk is sent in its own
    request; the reply is written to
    ``<stem>_<page>_<chunk>_summary.json`` in ``output_directory`` and all
    replies are concatenated into ``<stem>_summary.txt``.

    :param file_title: File name of the PDF inside ``pdf_directory``
    :param run_id: Identifier of the current run (accepted for interface
        compatibility; not used by this function)
    :param output_directory: Existing directory that receives the summary files
    :param max_retries: Maximum number of timed-out attempts per chunk
    :param delay: Seconds to wait before retrying after a timeout
    :param pdf_directory: Directory that contains ``file_title``
    :return: None
    """
    pdf_summary_text = ""
    pdf_file_path = os.path.join(pdf_directory, file_title)
    output_stem = os.path.splitext(file_title)[0]

    # The replies are parsed with json.load downstream and the user prompt asks
    # for JSON, so the system prompt must ask for JSON too (it used to ask for CSV).
    system_prompt = "You are a helpful research assistant. Specfically, you are helping research the impact of climate change on global food systems. We are doing a literature review. We want to know what crops and food systems are being impacted by specific climate change hazards. Only respond with a JSON document. You always return correctly formatted JSON."

    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num in range(len(pdf_reader.pages)):
            # Guard against a page for which extract_text() yields no string.
            page_text = (pdf_reader.pages[page_num].extract_text() or "").lower()
            paragraphs = [line.strip() for line in page_text.split('\n \n') if line.strip()]
            chunks, chunk_paragraphs = merge_texts(paragraphs, chunk_size=3000, overlap_size=1000)

            for chunk_num, (para_text, para_indices) in enumerate(zip(chunks, chunk_paragraphs)):
                para_num = ', '.join(map(str, para_indices))
                # Send the chunk itself (para_text), not the whole page, so each
                # request covers only the paragraphs named in the prompt.
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Summarize every sentence of the following text as a JSON document. Do not reply with anything except a JSON document. Please identify all food systems (crop, animal, or wild), any potential climate change hazards to food systems, what cropping or food systems (such as a fishery) they effect, where globally the impact will be experienced, the specific quote from the paragraph (of at least 100 characters), if the impact is generally positive or negative (sentiment), and approximately what magnitude (e.g. high medium low). Do not do anything that could possibly break JSON formatting. Please make sure that text entries do not use commas internal to any text entries in the table. Please only return a JSON. The elements should only be: region, cropping_system, impact, sentiment, magnitude, quote, page_number (you are working on page {page_num}), paragraph_number (you are working on paragraph numbers: {para_num}). Here is the text:{para_text}. Do not return anything but the properly formatted JSON. It is of the utmost importance that the response you give is a properly formatted JSON. If the paragraph does not contain any text about climate change hazards, return an empty JSON (the categories with the text no data as the content) "},
                ]

                retries = 0
                while retries < max_retries:
                    try:
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=messages)

                        page_summary = response["choices"][0]["message"]["content"]
                        pdf_summary_text += page_summary + "\n"

                        # Name the file after the chunk's position on the page. The
                        # comma-joined paragraph list can be arbitrarily long and is
                        # not unique when one paragraph spans several chunks.
                        page_summary_file = f"{output_stem}_{page_num}_{chunk_num}_summary.json"
                        with open(os.path.join(output_directory, page_summary_file), "w+") as file:
                            file.write(page_summary)

                        break  # Break out of the retry loop since the request was successful

                    except openai.error.OpenAIError as e:
                        if "timeout" in str(e).lower():
                            print(f"Request timed out. Retrying in {delay} seconds...")
                            retries += 1
                            time.sleep(delay)
                        else:
                            print("An error occurred:", str(e))
                            break  # Break out of the retry loop for non-timeout errors
                else:
                    # Reached only when the while condition fails, i.e. every attempt timed out.
                    print("Max retries reached. Moving to the next paragraph.")

    pdf_summary_file = output_stem + "_summary.txt"
    with open(os.path.join(output_directory, pdf_summary_file), "w+") as file:
        file.write(pdf_summary_text)


In [17]:
def process_summaries(output_directory, file_title, run_id):
    """
    Combine the per-chunk JSON summaries in ``output_directory`` into one DataFrame.

    A file may hold one JSON object (one row) or a list of objects (one row
    each); any other structure is skipped with a message. Files that are not
    valid JSON are listed in ``failed_files.csv``, which is written whenever
    there is at least one such file, even if nothing could be parsed. When
    rows exist they are tagged with ``paper`` and ``run`` columns and also
    written to ``merged_output.csv``. Both CSV files go to the current working
    directory.

    :param output_directory: Directory that contains the ``*.json`` summaries
    :param file_title: Value stored in the ``paper`` column
    :param run_id: Value stored in the ``run`` column
    :return: The combined DataFrame, or an empty DataFrame when no file yielded rows
    """
    # Sorted so that the row order does not depend on directory listing order.
    files = sorted(f for f in os.listdir(output_directory) if f.endswith('.json'))
    dfs = []
    failed_files = []

    for file in files:
        try:
            with open(os.path.join(output_directory, file), 'r') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for {file}. Adding to failed list.")
            failed_files.append(file)
            continue

        if isinstance(data, list) and all(isinstance(item, dict) for item in data):
            dfs.append(pd.DataFrame(data))
        elif isinstance(data, dict):
            dfs.append(pd.DataFrame([data]))
        else:
            print(f"Unhandled data structure in {file}. Skipping.")

    # Record failures independently of whether any file parsed; otherwise a run
    # in which every reply is malformed would lose the failure list.
    if failed_files:
        failed_df = pd.DataFrame(failed_files, columns=["Failed Filenames"])
        failed_df.to_csv('failed_files.csv', index=False)

    if not dfs:
        print("No data to process.")
        return pd.DataFrame()

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df['paper'] = file_title
    combined_df['run'] = run_id
    combined_df.to_csv('merged_output.csv', index=False)
    return combined_df

In [34]:
def upload_to_google_sheets(combined_df,
                            credentials_path='/content/precise-duality-203214-f20314634651.json',
                            sheet_key="1iM2fqvMhSsf11uLoWTfAU7cpKhSM0NnLx6MB7bmG9fM"):
    """
    Append the rows of ``combined_df`` to the first worksheet of a Google Sheet.

    Every value is converted to a string before upload. Only the values are
    appended; the column names are not sent. An empty frame is ignored without
    contacting the API.

    :param combined_df: DataFrame whose rows are appended
    :param credentials_path: Path of the service-account JSON key file
    :param sheet_key: Key of the target spreadsheet
    :return: None
    """
    if combined_df.empty:
        # Nothing to append; skip authentication and the API call.
        return

    def _to_native(value):
        # Convert non-serializable types (like np.int64 and np.float64) to native Python types
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            return float(value)
        return value

    # DataFrame.applymap is deprecated in favour of DataFrame.map in newer
    # pandas; use whichever this pandas version provides.
    elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, "map") else pd.DataFrame.applymap
    converted_df = elementwise(combined_df, _to_native)

    # Convert all values to string to ensure compatibility
    str_df = converted_df.astype(str)

    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"]
    creds = ServiceAccountCredentials.from_json_keyfile_name(credentials_path, scope)
    client = gspread.authorize(creds)
    sheet = client.open_by_key(sheet_key).sheet1

    data_to_append = str_df.values.tolist()
    sheet.append_rows(data_to_append)

In [19]:
import requests

# Google Drive file id of the zipped publications and the local file it is saved to.
file_id = "1jP3C9kxyFhYFLff1_aCW2IoOD9-y-hTc"
destination = "publications.zip"

url = f"https://drive.google.com/uc?id={file_id}"

# Fail loudly on a hung connection or an HTTP error instead of saving an error body.
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(destination, "wb") as f:
    f.write(response.content)

# Verify that the payload really is a ZIP archive before a later cell tries to extract it.
if not zipfile.is_zipfile(destination):
    raise RuntimeError(f"{destination} is not a valid ZIP archive; the download did not return the expected file.")

print("Download complete.")


Download complete.


In [7]:
# Unpack every member of the downloaded publications archive under /content/
archive_path = '/content/publications.zip'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path='/content/')

In [20]:
# os.listdir returns names in arbitrary order; sort them so that positional
# lookups into files_list select the same paper on every run.
files_list = sorted(f for f in os.listdir('/content/crop_paper_share') if f.endswith('.pdf'))
output_directory = '/content/output/'

In [22]:
# os.mkdir('/content/output/')

In [23]:
# Start from an empty output directory. Create it first so that a fresh runtime
# (where the directory does not exist yet) does not fail, then reuse the helper
# defined above instead of repeating its loop.
os.makedirs(output_directory, exist_ok=True)
delete_output_files(output_directory)

In [None]:
# openai.api_key = "NA" #change the api key with yours

In [35]:
# Manual re-upload of the most recent batch result. combined_df is only created
# by the batch loop further down, so do nothing when it does not exist yet
# (e.g. on Restart & Run All) instead of raising NameError.
if 'combined_df' in globals() and not combined_df.empty:
    upload_to_google_sheets(combined_df)

In [177]:
# Manual re-upload of the most recent batch result. combined_df is only created
# by the batch loop further down, so do nothing when it does not exist yet
# (e.g. on Restart & Run All) instead of raising NameError.
if 'combined_df' in globals() and not combined_df.empty:
    upload_to_google_sheets(combined_df)

In [183]:
# Summarize the selected paper in ten independent runs. Each run starts from a
# clean output directory and uploads whatever rows could be parsed.
selected_papers = [files_list[8]]
for run_id in range(10):
    for file_title in selected_papers:
        delete_output_files(output_directory)
        read_pdf_and_summarize(file_title, run_id, output_directory)
        combined_df = process_summaries(output_directory, file_title, run_id)
        if combined_df.empty:
            continue
        upload_to_google_sheets(combined_df)

Successfully deleted LaurusNobilis_SCI_Q4_researchgate_6_0_summary.json
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_summary.txt
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_4_0_summary.json
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_0_0_summary.json
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_1_0_summary.json
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_5_0_summary.json
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_2_0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37_summary.json
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_3_0_summary.json
Failed to decode JSON for LaurusNobilis_SCI_Q4_researchgate_1_0_summary.json. Adding to failed list.
Failed to decode JSON for LaurusNobilis_SCI_Q4_researchgate_3_0_summary.json. Adding to failed list.
Successfully deleted LaurusNobilis_SCI_Q4_researchgate_6_0_summary.json
Successfully del