## Classification by using OpenAI embeddings

This notebook uses OpenAI embeddings to classify research papers into research objects.

In [1]:
import os

# Project root, two directory levels above the current working directory.
base_dir = os.path.join("..", "..")

# Input data: the source CSV and the CSV holding its embeddings.
data_dir = os.path.join(base_dir, "data", "software_architecture")
bibtext_file, embeddings_file = (
    os.path.join(data_dir, filename)
    for filename in ("bib-text.csv", "bib-text_embeddings.csv")
)

# Output folder for this notebook; created up front so later writes succeed.
reports = os.path.join(base_dir, "reports", "openai_clustering")
os.makedirs(reports, exist_ok=True)

evaluation_file, predictions_file = (
    os.path.join(reports, filename)
    for filename in ("evaluation.json", "predictions.csv")
)

## Data preparation

This section creates embeddings for paper titles and abstracts with OpenAI. The embeddings are only generated if the embeddings file is missing; otherwise the existing file is loaded.

In [3]:
import ast

import tqdm
import openai
import numpy as np
import pandas as pd

if not os.path.exists(embeddings_file):

    #
    # Get embeddings for title and abstracts from OpenAI
    #

    df = pd.read_csv(bibtext_file)

    embeddings = []
    for inx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        title = row["title"]
        abstract = row["abstract"]

        # An empty CSV cell is read by pandas as NaN (a float), on which
        # len() raises TypeError -- only append real, non-empty strings.
        text = title
        if isinstance(abstract, str) and len(abstract) > 0:
            text += "\n\n" + abstract

        response = openai.Embedding.create(
            input=text,
            model="text-embedding-ada-002"
        )
        # Keep the raw list of floats for now; see the note on saving below.
        embeddings.append(list(response["data"][0]["embedding"]))

    # Save embeddings as plain Python lists. Writing NumPy arrays to CSV would
    # store their str() form, which is space-separated and abbreviated with
    # "..." for long vectors, so the file could not be parsed back.
    df["embedding"] = pd.Series(embeddings, index=df.index)
    df.to_csv(embeddings_file, index=False)

    # In memory, expose the same type as the load branch: one array per row.
    df["embedding"] = df["embedding"].apply(np.array)
else:
    #
    # Load existing data
    #

    df = pd.read_csv(embeddings_file)
    # Convert the stored list literal back to an array. literal_eval only
    # accepts Python literals, so a tampered file cannot execute code.
    df["embedding"] = df["embedding"].apply(ast.literal_eval).apply(np.array)

### Create research objects classifications

This section adds the "research_objects" field to the dataframe. Each paper's research objects are stored as a list of strings.

In [4]:
from typing import List
from treelib import Tree
from src.taxonomy.utils import parse_taxonomy


def get_research_objects(tree: Tree) -> List[str]:
    """Return the tags of the nodes directly below "Research Object".

    Args:
        tree: Taxonomy tree to read. The children are looked up under the
            node identifier "Research Object".

    Returns:
        The ``tag`` of every child node, in the order ``tree.children`` yields
        them.
    """
    return [child.tag for child in tree.children("Research Object")]

In [5]:
# Parse each paper's "classes" entry into a taxonomy tree and keep only the
# research objects found in it (one list of strings per paper).
research_objects = [
    get_research_objects(parse_taxonomy(classes))
    for classes in df["classes"]
]

df["research_objects"] = research_objects

## Data classification

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Collect every research object label that occurs in the dataset.
all_labels = set()
for labels in df.research_objects:
    all_labels.update(labels)

# Convert the ground-truth label lists to binary indicator arrays.
# The classes are sorted because set iteration order for strings changes
# between interpreter sessions; sorting keeps the label columns in the same
# order on every run.
mlb = MultiLabelBinarizer(classes=sorted(all_labels))
df["research_objects_binary"] = list(mlb.fit_transform(df["research_objects"]))

## Train model

Train a predictor for research objects. The trainer uses KNeighborsClassifier and MultiOutputClassifier for training.

In [7]:
from sklearn.model_selection import train_test_split

def _column_to_array(frame, column):
    """Stack the per-row values of ``column`` into a single NumPy array."""
    return np.array(frame[column].tolist())


# Hold out 10% of the papers for evaluation; the fixed seed keeps the split stable.
train, test = train_test_split(df, test_size=0.1, random_state=19980528)

# Transform the per-row embeddings and label vectors to NumPy arrays
train_embedding, test_embedding = (
    _column_to_array(part, "embedding") for part in (train, test)
)
train_classes, test_classes = (
    _column_to_array(part, "research_objects_binary") for part in (train, test)
)

In [8]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

# Base estimator: nearest neighbours compared by cosine distance.
estimator = KNeighborsClassifier(metric="cosine")

# Wrap it for multi-label output and train in one step; fit() returns the
# fitted classifier itself, so nothing is echoed by the cell.
classifier = MultiOutputClassifier(estimator).fit(train_embedding, train_classes)

## Evaluate the model



In [9]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import json

# Predict label vectors for the held-out papers
y_pred = classifier.predict(test_embedding)

# Accuracy plus micro-averaged precision, recall and F1
accuracy = accuracy_score(test_classes, y_pred)
precision, recall, f1 = (
    metric(test_classes, y_pred, average="micro")
    for metric in (precision_score, recall_score, f1_score)
)

report = dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

# Persist the scores next to the other reports of this notebook
with open(evaluation_file, "w") as file:
    json.dump(report, file, indent=4)

for caption, score in (
    ("Accuracy:   ", accuracy),
    ("Precision:  ", precision),
    ("Recall:     ", recall),
    ("f1 (micro): ", f1),
):
    print(caption, score)

Accuracy:    0.1875
Precision:   0.6
Recall:      0.17647058823529413
f1 (micro):  0.2727272727272727


In [10]:
# Map the binary label vectors back to label tuples, for both the expected
# and the predicted side.
ground_truth = mlb.inverse_transform(test_classes)
predicted = mlb.inverse_transform(y_pred)

# One row per test paper; saved so the predictions can be inspected later.
results = pd.DataFrame({"ground_truth": ground_truth, "predicted": predicted})
results.to_csv(predictions_file, index=False)

In [11]:
# Display the ground-truth vs. predicted labels for the test papers
results

Unnamed: 0,ground_truth,predicted
0,"(Technical Debt,)",()
1,"(Architecture Decision Making,)",()
2,"(Reference Architecture,)",()
3,"(Architecture Design Method,)","(Architecture Design Method,)"
4,"(Reference Architecture,)",()
5,"(Architecture Optimization Method, Architectur...",()
6,"(Architecture Optimization Method,)","(Reference Architecture,)"
7,"(Architecture Analysis Method,)","(Architecture Description Language,)"
8,"(Architecture Design Method,)",()
9,"(Architecture Description Language,)",()
