In [None]:
%load_ext autoreload
%autoreload 2

import json
import os
import pandas as pd

from functions import gpt

from tqdm.notebook import tqdm_notebook
# Register `progress_apply` on pandas objects (used further down on `df`).
# `pandas` is a classmethod, so it is called on the class itself: building a
# `tqdm_notebook()` instance first only renders a stray, empty progress bar.
tqdm_notebook.pandas()

# Never truncate cell contents when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Directory name used for this paper inside the data tree, and the id that is
# embedded in the input/output file names below.
original_file = '2021_Wightman-Posthuma_A_genomewide_association_study_with_112_563_individuals_identifies_new_risk_loci_for_Alzheimers_disease'
external_id = '2023_05_02_27142069922ab9506d3dg'
input_file = f'truth_{external_id}.chunks.csv'
output_file = f'gpt_snps_{external_id}.csv'

# Root of the data tree, read from the environment (None when DATA_PATH is unset).
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path inside this paper's ``mathpix`` directory.

    Args:
        *args: Extra path components (sub-directories and/or a file name)
            appended after ``.../<original_file>/mathpix``.

    Returns:
        ``<data_path>/diygenomics-projects/experiment-a/<original_file>/mathpix/<*args>``
        joined with ``os.path.join``.

    Raises:
        RuntimeError: If ``data_path`` is None, i.e. the ``DATA_PATH``
            environment variable was not set when this cell ran.
    """
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join raises when handed None.
    if data_path is None:
        raise RuntimeError(
            "DATA_PATH environment variable is not set; "
            "set it to the root data directory and re-run this cell."
        )
    return os.path.join(data_path, 'diygenomics-projects', 'experiment-a',
                        original_file, 'mathpix', *args)


model = 'gpt-4'  # alternative: 'gpt-3.5-turbo'
index_col = 'uuid'

In [None]:
# System prompt passed as the first argument of gpt.chat_create for every
# text chunk (see extract_genes).
# NOTE(review): the prompt demands JSON-only output but also asks for a
# trailing "Confidence: x/10" line, which is not JSON. The caller passes
# output_json=True, so this presumably provokes JSON parse errors — confirm
# how gpt.chat_create treats the trailing line before changing the wording.
system_prompt = """You are an academic researcher in information systems biology investigating possible links between DNA, 
RNA expression, and blood biomarkers for disease, especially aging and neurodegenerative pathologies 
(Alzheimer’s disease, Parkinson’s disease). 
- Please extract any SNP (Single Nucleotide Polymorphism) identifiers mentioned along with any relevant identifiers. 
- Please format your response in JSON. You only speak JSON. Do not write text that isn't JSON.
- If you do not know the answer return {}
- At the end of your answer, rate your confidence in the answer using following the format where x is the confidence score of your output by 10: 
    Confidence: x/10
"""

In [None]:
# Load the input chunks CSV, indexed by the `index_col` column ('uuid').
df = pd.read_csv(
    filepath_or_buffer=file_path(input_file),
    index_col=index_col,
)

In [None]:
# Failed replies collected while processing, keyed by the row's index label.
json_errors = {}
general_errors = {}


def extract_genes(row):
    """Send one row's text to the chat model and return its reply.

    Despite the name, the request uses ``system_prompt``, which asks for SNP
    identifiers.

    Args:
        row: A DataFrame row (as passed by ``apply(..., axis=1)``) with a
            ``'text'`` entry. ``row.name`` is used as the key under which a
            failed reply is recorded.

    Returns:
        The value returned by ``gpt.chat_create`` when it contains neither
        ``'json_error'`` nor ``'general_error'``. ``None`` when the text is
        missing or empty, or when the reply was recorded in ``json_errors`` /
        ``general_errors`` instead.
    """
    text = row['text']
    # Empty CSV cells are read back as NaN (a float), which has no len();
    # anything that is not a non-empty string means there is nothing to send.
    if not isinstance(text, str) or not text:
        return None

    reply = gpt.chat_create(system_prompt, text, model, output_json=True)

    # Record the failing reply itself (not None) so the saved error files
    # say what went wrong for each row.
    # NOTE(review): assumes the error replies are JSON-serializable, since
    # these dicts are later written with json.dump — confirm in functions.gpt.
    if 'json_error' in reply:
        json_errors[row.name] = reply
        return None
    if 'general_error' in reply:
        general_errors[row.name] = reply
        return None

    return reply

In [None]:
# Run the extraction once per row (with a progress bar) and store each reply
# in a new 'snps' column; rows for which extract_genes returns None stay None.
df['snps'] = df.progress_apply(extract_genes, axis=1)

In [None]:
# Persist both error collections next to the other outputs, one JSON file each.
for errors_filename, errors in (
    ('gpt_snps_json_errors.json', json_errors),
    ('gpt_snps_general_errors.json', general_errors),
):
    with open(file_path(errors_filename), 'w') as errors_handle:
        json.dump(errors, errors_handle)

In [None]:
# Write only the 'snps' column (together with the index) to the output CSV.
snps_column = df['snps']
snps_column.to_csv(file_path(output_file))