# ECSA Classification with OpenAI embeddings

This notebook uses OpenAI's text embeddings to classify ECSA papers into a set of custom, predefined categories.

In [None]:
import os

# Every path used by this notebook is rooted at the current working directory.
base_dir = os.curdir
data_dir = os.path.join(base_dir, "data")
reports_dir = os.path.join(base_dir, "reports", "unsupervised-classification")

# Input CSV with the ECSA papers, and output CSV for the assigned classes.
ecsa_input = os.path.join(data_dir, "ecsa", "ecsa_embeddings.csv")
cluster_output = os.path.join(reports_dir, "ecsa_custom_classes.csv")

# Create the report folder up front so writing the output cannot fail on a
# missing directory; exist_ok makes re-running this cell harmless.
os.makedirs(reports_dir, exist_ok=True)

In [None]:
import ast

import numpy as np
import pandas as pd

df = pd.read_csv(ecsa_input)
df["text"] = df["title"] + " " + df["abstract"]

# Each embedding is stored in the CSV as a string (presumably the repr of a
# list of floats). Parse it with ast.literal_eval instead of eval:
# literal_eval only accepts Python literals, so a malformed or malicious cell
# in the file cannot execute arbitrary code in the kernel.
df["embedding"] = df["embedding"].apply(
    lambda raw: np.array(ast.literal_eval(raw))
)

df.head()

## Custom classes

We define the following classes, into which the papers will be classified:

In [None]:
# Candidate labels. Order is significant: the classification step picks a
# label by its position in this list.
classes = [
    "Deep Learning", "Reinforcement Learning", "Computer Vision",
    "Natural Language Processing", "Machine Learning", "Statistics",
]

## Classify papers

We first embed each class name. Then, for each paper, we find the closest class embedding (by cosine distance) and assign the paper to that class.

In [None]:
import openai
from tqdm import tqdm

# Embed all class labels in a single request: the embeddings endpoint accepts
# a list of inputs, so one round trip per label is unnecessary.
response = openai.Embedding.create(
    input=classes,
    model="text-embedding-ada-002",
)

# Every returned item carries the index of the input it belongs to; order by
# it so that class_embeddings[i] is guaranteed to correspond to classes[i].
class_embeddings = [
    item["embedding"]
    for item in sorted(response["data"], key=lambda item: item["index"])
]
assert len(class_embeddings) == len(classes), "expected one embedding per class"

In [None]:
# For each document, find the closest class embedding
from scipy.spatial.distance import cdist

# Rows are papers, columns are class labels; each entry is a cosine distance.
distances = cdist(df["embedding"].tolist(), class_embeddings, metric="cosine")

# Column index of the smallest distance in each row, translated to its label.
nearest_class = distances.argmin(axis=1)
df["class"] = [classes[position] for position in nearest_class]

In [None]:
# Keep the identifying columns plus the assigned label, then write them to the
# report CSV without the DataFrame index.
output_columns = ["doi", "title", "abstract", "class"]
results = df.loc[:, output_columns]
results.to_csv(cluster_output, index=False)

In [None]:
# Display the classified papers as this cell's output.
results

In [None]:
# Plot the distribution of papers per class
import matplotlib.pyplot as plt

# Draw on an explicit Axes so the figure can be labelled; a title and axis
# labels let the chart stand on its own when the notebook is skimmed.
fig, ax = plt.subplots()
results["class"].value_counts().plot.bar(ax=ax)
ax.set(
    title="Papers per class",
    xlabel="Class",
    ylabel="Number of papers",
)
fig.tight_layout()  # keep the rotated class names from being clipped
plt.show()