# Use ChatGPT for classification

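This notebook uses [OpenAI's Chat API](https://beta.openai.com/docs/api-reference/chat) to assign research objects to papers (a paper may have several), repeats the classification at several sampling temperatures, and evaluates the predictions against the annotated ground truth.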

In [3]:
import os

# The repository root is two directory levels above the working directory.
base_dir = os.path.join(os.pardir, os.pardir)

# Input data: paper metadata and the textual explanation of the taxonomy.
data_dir = os.path.join(base_dir, 'data', 'software_architecture')
bibtext_file, taxonomy_explanation_file = (
    os.path.join(data_dir, file_name)
    for file_name in ('bib-text.csv', 'taxonomy_explanation.json')
)

# Output location; created up front so the later to_csv calls cannot fail on a
# missing directory.
reports = os.path.join(base_dir, 'reports', 'openai_chatgpt')
os.makedirs(reports, exist_ok=True)

results_file, evaluation_file = (
    os.path.join(reports, file_name)
    for file_name in ('chatgpt_classification.csv', 'chatgpt_evaluation.csv')
)

In [4]:
import pandas as pd

# Load the paper table; later cells read its 'doi', 'title', 'abstract' and
# 'classes' columns.
df = pd.read_csv(bibtext_file)

In [5]:
from typing import List
from treelib import Tree
from src.taxonomy.utils import parse_taxonomy


def get_research_objects(tree: Tree) -> List[str]:
    """Collect the tags of the direct children of the "Research Object" node.

    Args:
        tree: Taxonomy tree containing a node with the identifier
            "Research Object".

    Returns:
        The ``tag`` of each child node, in the order ``tree.children`` returns
        them.
    """
    return [child.tag for child in tree.children("Research Object")]

### Build taxonomy explanation for prompt design

This section builds the research-object explanations that are embedded in the prompt for [zero-shot prompting](https://www.promptingguide.ai/techniques/zeroshot).

In [6]:
from src.utils.utils_json import read_json

taxonomy_explanation = read_json(taxonomy_explanation_file)

# Only the "Research Object" branch of the explanation file is used here.
research_obj_explanation = taxonomy_explanation["Research Object"]

# One "- <name>: <explanation>" bullet line per research object, each
# terminated by a newline.
explanation_prompt = "".join(
    f"- {name}: {description}\n"
    for name, description in research_obj_explanation.items()
)

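### Create prompts for the OpenAI Chat API

* **Prompts** are created by concatenating the title and abstract of a paper, followed by an instruction that lists the allowed research objects and their explanations.
* **Completions** are expected to be JSON arrays that contain the names of the predicted research objects (an empty array if none applies).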

In [8]:
import openai
import json


def _parse_research_objects(content: str) -> list[str]:
    """Decode a chat reply into a list of research-object names.

    The reply is parsed as JSON first. If that fails, the text between the
    first "[" and the last "]" is parsed instead, so a reply that wraps the
    array in a Markdown code fence or a short sentence is still accepted.

    Args:
        content: Raw text of the assistant message.

    Returns:
        The decoded list of strings.

    Raises:
        json.JSONDecodeError: If no JSON array can be decoded from the reply.
        ValueError: If the decoded value is not a list of strings.
    """
    text = content.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end < start:
            # Nothing that looks like an array: surface the original error.
            raise
        parsed = json.loads(text[start:end + 1])

    if not isinstance(parsed, list) or not all(isinstance(item, str) for item in parsed):
        raise ValueError(f"Expected a JSON array of strings, got: {content!r}")

    return parsed


def classify_paper(title: str, abstract: str, temperature: float) -> list[str]:
    """Ask the chat model which research objects a paper addresses.

    The paper text (title, blank line, abstract) and the classification
    instruction are sent as two user messages. The instruction embeds the
    module-level ``explanation_prompt``.

    Args:
        title: Paper title.
        abstract: Paper abstract.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The research objects named in the model's reply; an empty list if the
        model replied with an empty array.

    Raises:
        json.JSONDecodeError: If the reply contains no decodable JSON array.
        ValueError: If the reply decodes to something other than a list of
            strings.
    """
    paper = f"{title}\n\n{abstract}"

    # OpenAI chat API
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        temperature=temperature,
        messages=[
            {
                "role": "user",
                "content": f"Paper: {paper}"
            },
            {
                "role": "user",
                "content": "Classify the paper. Only respond with a JSON array that contains a list of research objects. The list should contain only strings. The research objects can be following: \n\n" + explanation_prompt + "\n\n If the paper doesn't match any respond with an empty array."
            },
        ]
    )

    return _parse_research_objects(response.choices[0].message.content)

In [14]:
# Sampling temperatures to compare; the classification loop queries the model
# once per paper for each value, and the evaluation reports metrics per value.
temperatures = [0.0, 0.25, 0.5, 0.75, 1.0]

In [9]:
from tqdm import tqdm

# Read the key from the environment so it is never stored in the notebook.
openai.api_key = os.environ["OPENAI_API_KEY"]

# One record per (paper, temperature) pair.
results = []

for inx, row in tqdm(df.iterrows(), total=len(df)):
    # The ground truth depends only on the paper, so it is derived once per row
    # rather than once per temperature (assumes parse_taxonomy is deterministic).
    taxonomy = parse_taxonomy(row['classes'])
    gt_classification = get_research_objects(taxonomy)

    title, abstract = row["title"], row["abstract"]

    for temperature in temperatures:
        prediction = classify_paper(title, abstract, temperature)
        results.append({
            "doi": row["doi"],
            "title": title,
            "abstract": abstract,
            "research_objects": gt_classification,
            "temperature": temperature,
            "predictions": prediction
        })

100%|██████████| 153/153 [10:03<00:00,  3.95s/it]


In [10]:
# Create a dataframe that stores the prediction results: one row for every
# record appended by the classification loop (paper x temperature).
results_df = pd.DataFrame(results)

# Write prediction results (without the index column). The evaluation below
# works on the in-memory results_df, not on this file.
results_df.to_csv(results_file, index=False)

## Evaluate prediction results

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Collect every label that occurs in the ground truth or in the predictions so
# that both are binarized over the same set of columns.
all_classes = set()
for labels in results_df['research_objects']:
    all_classes.update(labels)
for labels in results_df['predictions']:
    all_classes.update(labels)

# Convert ground truth and predicted labels to binary arrays.
# The classes are sorted so the column order is reproducible between runs
# (set iteration order is not); key=str keeps the sort safe should a
# prediction contain a non-string label.
mlb = MultiLabelBinarizer(classes=sorted(all_classes, key=str))
# `classes` is given explicitly, so the binarizer only needs to be fitted once.
mlb.fit(results_df['research_objects'])

evaluation_results = []

for temperature in temperatures:
    temp_df = results_df[results_df['temperature'] == temperature]
    ground_truth_binary = mlb.transform(temp_df['research_objects'])
    predicted_binary = mlb.transform(temp_df['predictions'])

    # On multilabel indicator input accuracy_score is the subset accuracy (a
    # paper counts only if its whole label set matches); precision, recall and
    # F1 are micro-averaged over all (paper, label) decisions.
    accuracy = accuracy_score(ground_truth_binary, predicted_binary)
    precision = precision_score(ground_truth_binary, predicted_binary, average='micro')
    recall = recall_score(ground_truth_binary, predicted_binary, average='micro')
    f1 = f1_score(ground_truth_binary, predicted_binary, average='micro')

    evaluation_results.append({
        "temperature": temperature,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

In [20]:
# Write evaluation results (one row per temperature, without the index column)
# and display the table as the cell output.
evaluation_df = pd.DataFrame(evaluation_results)
evaluation_df.to_csv(evaluation_file, index=False)
evaluation_df

Unnamed: 0,temperature,accuracy,precision,recall,f1
0,0.0,0.222222,0.272,0.596491,0.373626
1,0.25,0.20915,0.26893,0.602339,0.371841
2,0.5,0.222222,0.269341,0.549708,0.361538
3,0.75,0.20915,0.281646,0.520468,0.365503
4,1.0,0.202614,0.255072,0.51462,0.341085
