# Import Packages

In [74]:
import pandas as pd
from openai import OpenAI
import logging
from datetime import datetime
import re
import sys

# Add logging

In [75]:
# Send every record (DEBUG and above) to debug.log; filemode 'w' truncates the
# log file each time this configuration is applied.
logging.basicConfig(
    filename='debug.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s | %(levelname)s | %(message)s',
)

# Read Input File 

In [76]:
# The model name is a required command-line argument. Check it before doing
# any work so a missing argument produces a clear message instead of a bare
# IndexError raised after the CSV has already been loaded.
if len(sys.argv) < 2:
    raise SystemExit("Missing required argument: model name (usage: <script> MODEL)")
MODEL = sys.argv[1]

# Input table; the query loop reads the 'title', 'abstract' and 'nmf-topic'
# columns of each row.
df = pd.read_csv('input.csv')

answers = []        # rows enriched with the model's answer, filled by the query loop
current_index = 0   # progress counter used for the periodic "Completed N" log line

# Create Prompt

In [77]:
# Sample of the desired output, substituted into the prompt's {example} slot.
example = "<probability>0.8</probability>"

In [78]:
# Keyword list per topic id, in priority order (the prompt tells the model that
# keyword order encodes priority). The position in the tuple is the topic id,
# which is looked up via each row's 'nmf-topic' value.
keyword_map = dict(enumerate((
    "Pandemic, Experience, Learn, Challenge, Management, Effect, Time, Social, Change, Lesson",
    "Patient, Clinical, Outcome, Cancer, Hospitalized, Care, Mortality, Treatment, Severe, Characteristic",
    "Impact, Lockdown, Economic, Psychological, Economy, India, Market, Industry, Tourism, Global",
    "Health, Mental, Care, Public, Worker, Among, Social, Effect, Crisis, System",
    "Review, Systematic, Literature, Meta-analysis, Scoping, Treatment, Narrative, Report, Effect, Rapid",
    "Infection, Syndrome, Child, Respiratory, Acute, Severe, Risk, Vaccination, Report, Viral",
    "Disease, Severity, Cardiovascular, Infectious, Novel, Chronic, Inflammatory, Child, Outbreak, Model",
    "Vaccine, Vaccination, Response, Hesitancy, mRNA, Development, Variant, Antibody, Among, Safety",
)))


In [79]:
# Prompt template sent to the model as the single user message. It is filled
# with str.format() and has four placeholders: {keywords}, {example}, {title}
# and {description}.
# NOTE(review): "priroity" below is a typo for "priority". It is left as-is
# because this text is sent verbatim to the model, so editing it changes the
# prompt (and possibly the results) - fix deliberately, not cosmetically.
PROMPT = '''Analyse the following article and determine the probability that this article aligns with a collection of papers that can be represented by a set of important keywords. Evaluate against all the given keywords and provide the reason for the alignment. The order of the keywords represent the order of their priroity. Calculate the alignment by giving higher weight to keywords with higher priority. 

*Collection*
Important keywords: {keywords}. 

*Return results*
Return the probability of the alignment between the publication and the collection. This is an example of the desirable output: 
{example}

*Article*
Title: {title} 
Description: {description}
'''

# Query GPT

In [None]:
def _extract_probability(answer):
    """Return the number inside ``<probability>...</probability>`` as a float.

    Returns the string ``'n/a'`` when ``answer`` is ``None``/empty, when the
    tag is absent, or when the tag content cannot be parsed by ``float`` -
    so that one malformed model reply never discards the whole row.
    """
    match = re.search(r'<probability>(.*?)</probability>', answer or '')
    if match is None:
        return 'n/a'
    try:
        return float(match.group(1))
    except ValueError:
        return 'n/a'


logging.info("Starting Script")
client = OpenAI()
for index, row in df.iterrows():
    try:
        # Build the prompt once and reuse it for the request, the console
        # echo and the 'prompt' output column.
        text = PROMPT.format(
            example=example,
            title=row['title'],
            description=row['abstract'],
            keywords=keyword_map[row['nmf-topic']],
        )
        response = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": text}],
        )
        answer = response.choices[0].message.content
        print(text)

        # Work on a copy rather than mutating the Series yielded by iterrows().
        record = row.copy()
        record['gpt-probability'] = _extract_probability(answer)
        record[MODEL] = answer
        record['prompt'] = text
        answers.append(record)

        # Count only rows that were actually recorded.
        current_index += 1
        if current_index % 10 == 0:
            logging.info("Completed {}".format(current_index))

    except Exception as e:
        # Best-effort: a failing row (API error, unknown topic id, ...) is
        # logged with its traceback and skipped so the run continues.
        logging.exception("Row %s failed: %s", index, e)
        print(e)
logging.info("Completed Script")

# Write Output to File

In [81]:
logging.info("Starting Write to file")

# Hour-resolution timestamp (day, month, year, hour); not used by the
# statements in this cell.
fdate = datetime.now().strftime('%d%m%Y%H')

# Columns written to the CSV, in order; the column holding the raw model
# answer is named after the model itself.
output_columns = [
    'doi',
    'title',
    'abstract',
    'nmf-topic',
    'nmf-topic-coorelation',
    'prompt',
    MODEL,
    'gpt-probability',
]
output_df = pd.DataFrame(answers, columns=output_columns)

output_path = "covid-clustering-{}.csv".format(MODEL)
output_df.to_csv(output_path, index=False)

logging.info("Completed Write to file")