In [None]:
%load_ext autoreload
%autoreload 2

import json
import os
import pandas as pd

from functions import gpt

from tqdm.notebook import tqdm_notebook
# Register `progress_apply` on pandas objects. `pandas` is invoked on the class itself
# rather than on a throwaway `tqdm_notebook()` instance, so no stray, never-updated
# progress bar is created in this cell.
tqdm_notebook.pandas()

# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# --- Run configuration ---
# Directory name used as a path component by `file_path` below.
original_file = '2021_Wightman-Posthuma_A_genomewide_association_study_with_112_563_individuals_identifies_new_risk_loci_for_Alzheimers_disease'
# Identifier embedded in the input/output file names.
external_id = '2023_05_02_27142069922ab9506d3dg'
input_file = f'truth_{external_id}.chunks.csv'
output_file = f'gpt_summary_{external_id}.csv'

# Root data directory, read from the environment (None when DATA_PATH is unset).
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path below this run's ``mathpix`` directory.

    Args:
        *args: Extra path components (typically a single file name) appended
            after ``<data_path>/diygenomics-projects/experiment-a/<original_file>/mathpix``.

    Returns:
        The joined path as a string.

    Raises:
        RuntimeError: If the ``DATA_PATH`` environment variable was not set when
            this cell ran, instead of the opaque ``TypeError`` that
            ``os.path.join(None, ...)`` would raise.
    """
    if data_path is None:
        raise RuntimeError(
            "The DATA_PATH environment variable is not set; "
            "set it to the root data directory and re-run this cell."
        )
    return os.path.join(data_path, 'diygenomics-projects', 'experiment-a',
                        original_file, 'mathpix', *args)


model = 'gpt-4' # 'gpt-3.5-turbo' # 'gpt-4'
index_col = 'uuid'

In [None]:
# Load the chunk table, using the configured identifier column as the index.
chunks_csv_path = file_path(input_file)
df = pd.read_csv(chunks_csv_path, index_col=index_col)

In [None]:
# System prompt for per-chunk topic extraction. The format example at the end must itself
# be valid JSON (double-quoted strings); a single-quoted example contradicts the
# "only JSON" instruction and invites output that a JSON parser rejects.
topic_system_prompt = """Extract up to 20 topics and their associated keywords (up to 5 per topic) from a research paper. 
                        Group synonyms and closely related words together. Prioritize technical information, 
                        and include only the most relevant keywords. Consider the context of the text when generating topics. Output 
                        the results as a JSON object with topics as keys with a count of occurrences of the topic in the text and a 
                        dictionary containing a list of keywords. Only provide JSON output. Do not output anything other than JSON. 
                        Check your output and make sure that it is in this format {"Speed": {"count": 2, "keywords": ["rapid development", "technology advancement"]}}"""

In [None]:
# Failed requests, keyed by the row's index label, so they can be inspected or retried.
json_errors = {}     # responses that carried a 'json_error' key
general_errors = {}  # responses that carried a 'general_error' key


def extract_topics(row):
    """Ask the model for the topics of one chunk of text.

    Args:
        row: A DataFrame row (as passed by ``apply(..., axis=1)``); ``row['text']``
            is the chunk to analyse and ``row.name`` is its index label.

    Returns:
        The response of ``gpt.chat_create`` on success (presumably the parsed JSON
        object -- confirm in ``functions.gpt``), or ``None`` when the row has no text
        or the call reported an error. Error responses are stored in ``json_errors`` /
        ``general_errors`` under ``row.name``.
    """
    text = row['text']
    # pandas reads empty CSV cells back as NaN (a float), which has no len(); treat
    # anything that is not a non-empty string as "nothing to send".
    if not isinstance(text, str) or not text:
        return None

    possible_response = gpt.chat_create(topic_system_prompt, text, model, output_json=True)
    if 'json_error' in possible_response:
        # Keep the failing response itself so the error file is actually informative.
        json_errors[row.name] = possible_response
        return None
    if 'general_error' in possible_response:
        general_errors[row.name] = possible_response
        return None
    return possible_response

# Row-wise topic extraction with a progress bar; the function is passed directly.
df['topics'] = df.progress_apply(extract_topics, axis=1)

In [None]:
# Persist both topic-extraction error maps as JSON files, one file per map.
for error_filename, error_map in (
    ('gpt_topic_json_errors.json', json_errors),
    ('gpt_topic_general_errors.json', general_errors),
):
    with open(file_path(error_filename), 'w') as error_file:
        json.dump(error_map, error_file)

In [None]:
# Merge the per-chunk topic dicts into one {topic: {'count': int, 'keywords': [...]}} map.
combined_topics = {}
for chunk_topics in df['topics']:
    # Rows with no text or with a recorded GPT error hold None instead of a topic dict.
    if not isinstance(chunk_topics, dict):
        continue

    for topic, details in chunk_topics.items():
        try:
            new_keywords = list(details.get('keywords', []))
            new_count = details.get('count', 0)

            if topic not in combined_topics:
                # Store a copy: merging later chunks must not mutate the dicts held in
                # df['topics'], which are written to CSV as the per-chunk results.
                combined_topics[topic] = {**details, 'count': new_count, 'keywords': new_keywords}
                continue

            merged = combined_topics[topic]
            # dict.fromkeys de-duplicates while keeping first-seen order, so the output
            # is reproducible between runs (set() ordering of strings is not).
            merged_keywords = list(dict.fromkeys(merged['keywords'] + new_keywords))
            # A chunk without 'count' contributes 0; it must not reset the running total.
            merged_count = merged['count'] + new_count
            merged['keywords'] = merged_keywords
            merged['count'] = merged_count
        except (AttributeError, TypeError) as error:
            # Best effort: report a malformed entry and carry on with the remaining topics.
            print(f'Skipping malformed topic {topic!r}: {error}')
            print(details)

In [None]:
# Save the merged topic map as JSON.
topics_json_path = file_path('topics.json')
with open(topics_json_path, 'w') as topics_file:
    json.dump(combined_topics, topics_file)

In [None]:
# System prompt for per-chunk summarisation (spelling of "transcription" corrected).
summary_system_prompt = """Summarize the given text in a single paragraph containing no more than 8 sentences from a transcription 
                        taken from a scientific research paper."""

In [None]:
# Failed summary requests, keyed by the row's index label.
general_errors = {}


def extract_summaries(row):
    """Ask the model for a summary of one chunk of text.

    Args:
        row: A DataFrame row (as passed by ``apply(..., axis=1)``); ``row['text']``
            is the chunk to summarise and ``row.name`` is its index label.

    Returns:
        The response of ``gpt.chat_create`` on success (presumably the summary text --
        confirm in ``functions.gpt``), or ``None`` when the row has no text or the call
        reported an error. Error responses are stored in ``general_errors`` under
        ``row.name``.
    """
    text = row['text']
    # pandas reads empty CSV cells back as NaN (a float), which has no len(); treat
    # anything that is not a non-empty string as "nothing to send".
    if not isinstance(text, str) or not text:
        return None

    possible_response = gpt.chat_create(summary_system_prompt, text, model, output_json=False)
    if 'general_error' in possible_response:
        # Keep the failing response itself so the error file is actually informative.
        general_errors[row.name] = possible_response
        return None
    return possible_response

# Row-wise summarisation with a progress bar; the function is passed directly.
df['summary'] = df.progress_apply(extract_summaries, axis=1)

In [None]:
# Persist the summary error map as JSON.
summary_errors_path = file_path('gpt_summary_general_errors.json')
with open(summary_errors_path, 'w') as error_file:
    json.dump(general_errors, error_file)

In [None]:
# Export the per-chunk topics column (index included) to CSV.
topics_csv_path = file_path(f'gpt_topics_{external_id}.csv')
df['topics'].to_csv(topics_csv_path)

In [None]:
# Export the per-chunk summary column (index included) to CSV.
summary_csv_path = file_path(f'gpt_summary_{external_id}.csv')
df['summary'].to_csv(summary_csv_path)

In [None]:
# System prompt for condensing the per-chunk summaries (spelling of "transcription" corrected).
summary_of_summaries_system_prompt = """Summarize the given summaries in a single paragraph containing no more than 8 sentences 
                                    from a transcription taken from a scientific research paper."""

In [None]:
# Exactly one of these ends up set: the error response or the final summary.
general_error = None
summary_of_summaries = None
# Rows without a summary hold None; drop them so the literal text "None" is not fed
# to the model as if it were part of the paper.
summaries = ' '.join(str(value) for value in df['summary'].dropna())

possible_response = gpt.chat_create(summary_of_summaries_system_prompt, summaries, model, output_json=False)
if 'general_error' in possible_response:
    # Record the failing response in the variable initialised above for that purpose.
    general_error = possible_response
else:
    summary_of_summaries = possible_response

In [None]:
# Write the final summary only if one was produced. Opening the file first would
# truncate any earlier result and then fail on write(None).
if summary_of_summaries is None:
    print('No summary of summaries was generated; summary_of_summaries.txt was not written.')
else:
    # Explicit UTF-8 so model output with non-ASCII characters is written the same
    # way on every platform.
    with open(file_path('summary_of_summaries.txt'), 'w', encoding='utf-8') as file:
        file.write(summary_of_summaries)