<a href="https://colab.research.google.com/github/seismosmsr/machine_learning/blob/main/summarize-chatgpt_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# for google colab you have to install this 2 library before run the code
!pip install pypdf2
!pip install openai

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/232.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1
Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.

In [6]:
import getpass
import json
import os
import zipfile

import gspread
import openai
import pandas as pd
import PyPDF2
import requests
from oauth2client.service_account import ServiceAccountCredentials

In [15]:
def delete_output_files(output_directory):
    """Delete every file directly inside ``output_directory``.

    Deletion is best-effort: an entry that cannot be removed (for example a
    sub-directory, which ``os.remove`` refuses) is reported and skipped.
    A missing directory is treated as "nothing to delete" instead of raising.

    Args:
        output_directory: Path of the directory to empty.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # On a fresh runtime the directory may not have been created yet.
    if not os.path.isdir(output_directory):
        print(f"Output directory {output_directory} does not exist. Nothing to delete.")
        return

    for file_name in os.listdir(output_directory):
        try:
            os.remove(os.path.join(output_directory, file_name))
            print(f"Successfully deleted {file_name}")
        except OSError as e:
            print(f"Error deleting {file_name}: {e}")

In [16]:
def read_pdf_and_summarize(file_title, run_id, output_directory,
                           input_directory='/content/crop_paper_share/'):
    """Summarize one PDF paragraph by paragraph with the OpenAI chat API.

    Each page's text is extracted, lower-cased and split on ``'\\n \\n'``.
    Every non-empty chunk is sent to ``gpt-3.5-turbo``; the reply is written to
    ``<stem>_<page>_<paragraph>_summary.json`` in ``output_directory``. All
    replies, one per line block, are also written to ``<stem>_summary.txt``.

    Args:
        file_title: File name of the PDF inside ``input_directory``.
        run_id: Not used by this function; kept so existing callers work.
        output_directory: Existing directory that receives the summary files.
        input_directory: Directory containing the PDF. Defaults to the folder
            the notebook extracts the publications into.

    Returns:
        None. Results are written to disk.
    """
    pdf_file_path = os.path.join(input_directory, file_title)
    file_stem = os.path.splitext(file_title)[0]

    # The replies are saved as .json files and later parsed with json.load, so the
    # system prompt must ask for JSON. (It previously demanded CSV, contradicting
    # the user prompt below.)
    system_prompt = (
        "You are a helpful research assistant. Specifically, you are helping research "
        "the impact of climate change on global food systems. We are doing a literature "
        "review. We want to know what crops and food systems are being impacted by "
        "specific climate change hazards. Only respond with a valid JSON document: "
        "a single JSON object or an array of JSON objects, with no text outside the JSON."
    )

    paragraph_summaries = []

    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text().lower()
            # A line holding a single space is used as the paragraph separator.
            paragraphs = [chunk.strip() for chunk in page_text.split('\n \n') if chunk.strip()]

            for para_num, para_text in enumerate(paragraphs):
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": f"Summarize the following text as a JSON document. ... (your instructions here)... Here is the text: {para_text}"}
                    ])

                paragraph_summary = response["choices"][0]["message"]["content"]
                paragraph_summaries.append(paragraph_summary)

                # One file per paragraph so each reply can be parsed (or fail) on its own.
                paragraph_file = f"{file_stem}_{page_num}_{para_num}_summary.json"
                with open(os.path.join(output_directory, paragraph_file), "w") as file:
                    file.write(paragraph_summary)

    # Every reply is followed by a newline, matching the per-paragraph order above.
    pdf_summary_text = "".join(f"{summary}\n" for summary in paragraph_summaries)
    pdf_summary_file = file_stem + "_summary.txt"
    with open(os.path.join(output_directory, pdf_summary_file), "w") as file:
        file.write(pdf_summary_text)

In [17]:
def process_summaries(output_directory, file_title, run_id):
    """Merge the per-paragraph JSON summaries of one paper into a DataFrame.

    Every ``.json`` file in ``output_directory`` is parsed. A JSON object becomes
    one row; a JSON array of objects becomes one row per object; any other
    structure is reported and skipped. Files that are not valid JSON are
    collected and listed in ``failed_files.csv`` (current working directory),
    whether or not any other file was parsed successfully.

    Args:
        output_directory: Directory holding the ``*.json`` summary files.
        file_title: Value stored in the ``paper`` column of every row.
        run_id: Value stored in the ``run`` column of every row.

    Returns:
        The combined DataFrame, also written to ``merged_output.csv`` in the
        current working directory. An empty DataFrame if nothing was parsed.
    """
    json_files = [name for name in os.listdir(output_directory) if name.endswith('.json')]
    dfs = []
    failed_files = []

    for file in json_files:
        try:
            with open(os.path.join(output_directory, file), 'r') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for {file}. Adding to failed list.")
            failed_files.append(file)
            continue

        # Treat a single object as a one-element list so both shapes share one path.
        if isinstance(data, dict):
            data = [data]

        if isinstance(data, list) and all(isinstance(item, dict) for item in data):
            dfs.append(pd.DataFrame(data))
        else:
            print(f"Unhandled data structure in {file}. Skipping.")

    # Record failures before the early return below, so they are not lost when
    # every file fails to parse.
    if failed_files:
        failed_df = pd.DataFrame(failed_files, columns=["Failed Filenames"])
        failed_df.to_csv('failed_files.csv', index=False)

    if not dfs:
        print("No data to process.")
        return pd.DataFrame()

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df['paper'] = file_title
    combined_df['run'] = run_id
    combined_df.to_csv('merged_output.csv', index=False)
    return combined_df

In [18]:
def upload_to_google_sheets(combined_df,
                            credentials_path='/content/precise-duality-203214-f20314634651.json',
                            spreadsheet_key="1iM2fqvMhSsf11uLoWTfAU7cpKhSM0NnLx6MB7bmG9fM"):
    """Append the rows of ``combined_df`` to the first worksheet of a Google Sheet.

    Missing values are sent as empty strings. The caller's DataFrame is not
    modified.

    Args:
        combined_df: DataFrame whose rows are appended (no header row is sent).
        credentials_path: Path to the service-account JSON key file.
        spreadsheet_key: Key of the target spreadsheet.

    Returns:
        None.
    """
    # Nothing to send; skip authentication and the API call altogether.
    if combined_df.empty:
        return

    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive.file",
        "https://www.googleapis.com/auth/drive",
    ]
    creds = ServiceAccountCredentials.from_json_keyfile_name(credentials_path, scope)
    client = gspread.authorize(creds)
    sheet = client.open_by_key(spreadsheet_key).sheet1

    # fillna returns a copy here, so the frame passed in by the caller stays untouched.
    cleaned_df = combined_df.fillna('')
    data_to_append = [list(row) for row in cleaned_df.itertuples(index=False, name=None)]
    sheet.append_rows(data_to_append)

In [19]:
# Download the zipped publications from Google Drive.
file_id = "1jP3C9kxyFhYFLff1_aCW2IoOD9-y-hTc"
destination = "publications.zip"

url = f"https://drive.google.com/uc?id={file_id}"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as the archive.
response.raise_for_status()

with open(destination, "wb") as f:
    f.write(response.content)

# NOTE(review): a successful response is not necessarily the file itself (Drive may
# answer with an HTML page, e.g. for large files - confirm). Check before reporting success.
if not zipfile.is_zipfile(destination):
    raise RuntimeError(f"{destination} is not a valid ZIP archive; the download did not return the file.")

print("Download complete.")


Download complete.


In [7]:
# Unpack publications.zip into /content/.
archive_path = '/content/publications.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall('/content/')

In [20]:
# PDFs to process and the folder that receives the per-paper summaries.
# os.listdir returns entries in arbitrary order; sorting makes the run order reproducible.
files_list = sorted(f for f in os.listdir('/content/crop_paper_share') if f.endswith('.pdf'))
output_directory = '/content/output/'

In [22]:
# Create the output folder if it is missing. exist_ok=True keeps this cell safe to
# re-run, unlike os.mkdir, which raises when the folder already exists.
os.makedirs('/content/output/', exist_ok=True)

In [23]:
# Clear leftovers from earlier runs. This reuses delete_output_files instead of
# repeating its loop inline; output_directory is '/content/output/'.
delete_output_files(output_directory)

In [None]:
# Do not hardcode the API key in the notebook. It is taken from the OPENAI_API_KEY
# environment variable when set; otherwise you are prompted for it (input is hidden).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [None]:

# Run the full pipeline for every PDF: summarize, merge the JSON replies, upload.
runs_per_paper = 1
for file_title in files_list:
    for run_id in range(runs_per_paper):
        # Empty the output folder first so the merge step only sees this paper's files.
        delete_output_files(output_directory)
        read_pdf_and_summarize(file_title, run_id, output_directory)
        combined_df = process_summaries(output_directory, file_title, run_id)
        if combined_df.empty:
            # Nothing was parsed for this paper/run, so there is nothing to upload.
            continue
        upload_to_google_sheets(combined_df)