### Set up and activate a virtual environment and notebook kernel, and install the packages necessary for the project

In [1]:
# # Create a virtual environment
# python3 -m venv venv_paper_visualizer

# # Activate the virtual environment
# source venv_paper_visualizer/bin/activate

# # Install packages from requirements.txt
# pip install -r requirements.txt

# # Add the virtual environment as a Jupyter kernel
# python -m ipykernel install --user --name=venv_paper_visualizer --display-name "Python (venv_paper_visualizer)"

### Using SemanticScholar API endpoint, retrieve all papers from a year range about a certain topic
### The data will be stored in a .jsonl file

In [2]:
import requests
import json

from urllib.parse import urlencode

# Search phrase sent to the Semantic Scholar bulk search endpoint.
query = "large language model"
# query = "computational neuroscience"
# Estimated totals reported on successive runs:
# first run = 13277
# second run = 14144
# 12.26.2023 = 14433
# 12.29.2023 = 14624
# 12.30.2023 = 14654
# 01.01.2024 = 14704
# 01.02.2024 = 14773 ??why are 2023 papers still being added??
# 01.03.2024 1:45PM = 14793
# 01.11.2024 1:01AM = 14862
# 01.12.2024 5:29PM = 14882
fields = "paperId,publicationDate,isOpenAccess,openAccessPdf,title,referenceCount,citationCount,influentialCitationCount,abstract,authors"
#field_of_study = "Computer Science" # you can filter the category of paper to retrieve
min_citation_count = 0
years = "2023-2023"

# Build the query string with urlencode so that spaces, '&', '#', etc. in the
# search phrase cannot corrupt the URL.
url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk?" + urlencode({
    "query": query,
    "fields": fields,
    "year": years,
    "minCitationCount": min_citation_count,
})


def _fetch_page(token=None):
    """Fetch one page of bulk-search results and return the decoded JSON.

    Args:
        token: continuation token from the previous page, or None for the first page.

    Raises:
        requests.RequestException: on network failures, timeouts, or non-2xx
            responses, so a failed request can never be mistaken for "no more pages".
    """
    response = requests.get(url, params={"token": token} if token else None, timeout=60)
    response.raise_for_status()
    return response.json()


retrieved = 0

# The first page is fetched BEFORE papers.jsonl is opened for writing, so a
# failed request leaves any previously downloaded file untouched.
r = _fetch_page()
if 'total' in r:
    print(f"Will retrieve an estimated {r['total']} documents")
else:
    print("The 'total' key is not in the response.")

with open("papers.jsonl", "w") as file:
    while True:
        if "data" in r:
            batch = r["data"] or []
            retrieved += len(batch)
            print(f"Retrieved {retrieved} papers...")
            # One JSON document per line (JSON Lines)
            for paper in batch:
                file.write(json.dumps(paper) + "\n")
        # A missing or empty continuation token means this was the last page
        token = r.get("token")
        if not token:
            break
        r = _fetch_page(token)

print(f"Done! Retrieved {retrieved} papers total")
print(f"Created papers.jsonl with LLM paper data")

Will retrieve an estimated 14882 documents
Retrieved 1000 papers...
Retrieved 2000 papers...
Retrieved 3000 papers...
Retrieved 4000 papers...
Retrieved 5000 papers...
Retrieved 6000 papers...
Retrieved 7000 papers...
Retrieved 8000 papers...
Retrieved 9000 papers...
Retrieved 10000 papers...
Retrieved 11000 papers...
Retrieved 12000 papers...
Retrieved 13000 papers...
Retrieved 14000 papers...
Retrieved 14882 papers...
Done! Retrieved 14882 papers total
Created papers.jsonl with LLM paper data


### You can check how many papers from the Semantic Scholar query are actually unique - they all should be...

In [3]:
# Collect every non-empty paperId found in the downloaded file; a set keeps
# only distinct values, so its size is the number of unique papers.
with open('papers.jsonl', 'r') as file:
    candidate_ids = (json.loads(record).get('paperId') for record in file)
    unique_paper_ids = {pid for pid in candidate_ids if pid}

print(f"Number of unique paper IDs: {len(unique_paper_ids)}")

Number of unique paper IDs: 14882


### We want to include Semantic Scholar's AI-generated tldr ("too long; didn't read" summary) attribute for each paper. Since the bulk search endpoint used previously doesn't allow us to retrieve it, we must use a different Semantic Scholar API endpoint that does.

In [4]:
# Read all papers into memory (blank lines are skipped)
with open('papers.jsonl', 'r') as file:
    papers = [json.loads(line) for line in file if line.strip()]

# IDs of the papers that can be looked up
paper_ids = [paper['paperId'] for paper in papers if paper.get('paperId')]

NO_TLDR = 'No TLDR found'   # placeholder stored when no TLDR could be obtained
BATCH_SIZE = 500            # the maximum allowed for this endpoint
tldr_count = 0

# Batches are cut from `papers` itself (not from `paper_ids`), so every paper is
# matched with the response for its own batch even if some papers lack an ID.
for i in range(0, len(papers), BATCH_SIZE):
    batch = papers[i:i + BATCH_SIZE]
    batch_ids = [paper['paperId'] for paper in batch if paper.get('paperId')]

    tldrs = {}
    if batch_ids:
        try:
            response = requests.post("https://api.semanticscholar.org/graph/v1/paper/batch",
                                     params={'fields': 'tldr'},
                                     json={"ids": batch_ids},
                                     timeout=120)
        except requests.RequestException as error:
            # Best effort: a network problem costs this batch its TLDRs, not the whole run
            print(f"Failed to fetch TLDRs: {error}")
        else:
            if response.status_code == 200:
                # Skip null / ID-less entries instead of crashing on them
                tldrs = {entry['paperId']: entry.get('tldr', NO_TLDR)
                         for entry in response.json()
                         if entry and entry.get('paperId')}
            else:
                print(f"Failed to fetch TLDRs: {response.status_code}")

    for paper in batch:
        paper['tldr'] = tldrs.get(paper.get('paperId'), NO_TLDR)
        tldr_count += 1

        # Print update every 1000 TLDRs
        if tldr_count % 1000 == 0:
            print(f"Added {tldr_count} TLDRs so far.")

# Rewrite the file with the added 'tldr' field, one JSON document per line
with open('papers.jsonl', 'w') as file:
    for paper in papers:
        file.write(json.dumps(paper) + '\n')

print(f"Done! Retrieved {tldr_count} TLDRs total.")
print(f"Updated papers with TLDRs in papers.jsonl")

Added 1000 TLDRs so far.
Added 2000 TLDRs so far.
Added 3000 TLDRs so far.
Added 4000 TLDRs so far.
Added 5000 TLDRs so far.
Added 6000 TLDRs so far.
Added 7000 TLDRs so far.
Added 8000 TLDRs so far.
Added 9000 TLDRs so far.
Added 10000 TLDRs so far.
Added 11000 TLDRs so far.
Added 12000 TLDRs so far.
Added 13000 TLDRs so far.
Added 14000 TLDRs so far.
Done! Retrieved 14882 TLDRs total.
Updated papers with TLDRs in papers.jsonl


### Convert the .jsonl to a .csv

In [5]:
import csv

# Column order of the generated CSV
csv_header = ['paperId', 'title', 'publicationDate', 'openAccessPdf', 'referenceCount',
              'citationCount', 'influentialCitationCount', 'authorNames', 'authorIds',
              'abstract', 'tldr']
# Placeholder written when a paper has no usable open-access PDF entry
pdf_unavailable = 'open access PDF is not available'

with open('papers.jsonl', 'r') as jsonl_file, open('papers.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(csv_header)

    for record in jsonl_file:
        paper = json.loads(record)

        # Keep only the URL of the open-access PDF entry
        pdf_entry = paper.get('openAccessPdf', {})
        if isinstance(pdf_entry, dict):
            open_access_pdf_url = pdf_entry.get('url', pdf_unavailable)
        else:
            open_access_pdf_url = pdf_unavailable

        # Flatten the author list into two comma-separated strings
        authors = paper.get('authors', [])
        author_names = ', '.join(person['name'] for person in authors if person.get('name'))
        author_ids = ', '.join(str(person['authorId']) for person in authors if person.get('authorId'))

        # The TLDR is a dict with a 'text' field when present; anything else becomes ''
        tldr = paper.get('tldr')
        tldr_text = tldr.get('text', '') if isinstance(tldr, dict) else ''

        leading = [paper.get(key, '') for key in ('paperId', 'title', 'publicationDate')]
        counts = [paper.get(key, '') for key in ('referenceCount', 'citationCount', 'influentialCitationCount')]
        csv_writer.writerow(
            leading
            + [open_access_pdf_url]
            + counts
            + [author_names, author_ids, paper.get('abstract', ''), tldr_text]
        )

print("converted papers.jsonl into papers.csv")

converted papers.jsonl into papers.csv


### Check if the data is messed up and decide how to clean it

In [6]:
import pandas as pd
import os

df = pd.read_csv('papers.csv')

# Fields included in every report; 'paperId' becomes the dictionary key
REPORT_COLUMNS = ['paperId', 'publicationDate', 'influentialCitationCount', 'title', 'tldr', 'abstract']


def _select_papers(mask):
    """Return {paperId: {field: value}} for the rows of `df` where `mask` is True."""
    return df.loc[mask, REPORT_COLUMNS].set_index('paperId').T.to_dict()


def _write_report(path, papers_by_id):
    """Write one 'field: value' block per paper, separated by blank lines.

    The file is written as UTF-8 so that non-ASCII titles and abstracts cannot
    fail on platforms whose default encoding is narrower.
    """
    with open(path, 'w', encoding='utf-8') as file:
        for paper_id, info in papers_by_id.items():
            file.write(f"paperId: {paper_id}\n")
            for field in REPORT_COLUMNS[1:]:
                file.write(f"{field}: {info[field]}\n")
            file.write("\n")


# Papers with neither an abstract nor a tldr
missing_abstract_and_tldr = _select_papers(pd.isna(df['abstract']) & pd.isna(df['tldr']))

# Papers without an abstract whose tldr is shorter than 150 characters
abstract_na_tldr_questionable = _select_papers(pd.isna(df['abstract']) & (df['tldr'].str.len() < 150))

# Papers whose abstract is shorter than 300 characters
abstract_questionable = _select_papers(df['abstract'].str.len() < 300)

# Papers whose tldr is shorter than 100 characters
tldr_questionable = _select_papers(df['tldr'].str.len() < 100)

# Create the directory if it does not exist
directory = 'questionable_data'
os.makedirs(directory, exist_ok=True)

_write_report(os.path.join(directory, 'missing_abstract_and_tldr.txt'), missing_abstract_and_tldr)
_write_report(os.path.join(directory, 'abstract_na_tldr_questionable.txt'), abstract_na_tldr_questionable)
_write_report(os.path.join(directory, 'abstract_questionable.txt'), abstract_questionable)
_write_report(os.path.join(directory, 'tldr_questionable.txt'), tldr_questionable)

In [7]:
# Number of papers whose abstract is shorter than 300 characters
len(abstract_questionable)

78

### Do final data preprocessing. The goal is to make sure every paper has a sufficiently long abstract that will be the basis for clustering. It's also important for the tldr to be present because that is what will eventually be displayed in the RShiny Plotly visualization.

In [8]:
# Abstracts shorter than this many characters are treated as questionable
MIN_ABSTRACT_LENGTH = 300


def _join_text(*parts):
    """Join the parts that are present with '. ', skipping missing (NaN) or empty ones.

    Skipping missing parts means a paper without a title or tldr neither raises
    a TypeError nor produces artifacts such as 'title. . abstract'.
    """
    return ". ".join(str(part) for part in parts if not pd.isna(part) and str(part) != "")


# NOTE: this cell rewrites papers.csv in place, so running it a second time on
# already-processed data can concatenate the title/tldr onto short abstracts again.
# The frame is loaded once, transformed in memory, and written once at the end.

# 1. Drop rows where both tldr and abstract is NA value
df = pd.read_csv('papers.csv')
initial_row_count = df.shape[0]
print(f"Papers to start with: {initial_row_count}\n")
df = df.dropna(subset=['abstract', 'tldr'], how='all').reset_index(drop=True)
final_row_count = df.shape[0]
rows_dropped = initial_row_count - final_row_count
print(f"Number of papers where both tldr and abstract are NA (dropped): {rows_dropped}")

# 2. If abstract is NA and tldr is unusable, drop that row --> judge this from "abstract_na_tldr_questionable.txt"
# ... the observations in this file look good

# 3. If abstract is questionable (abnormally short), impute abstract with the title concatenated with tldr and with abstract
# Note that some of these observations should probably be dropped outright, but they are not manually inspected here
short_abstract = df['abstract'].notna() & (df['abstract'].str.len() < MIN_ABSTRACT_LENGTH)
imputed_rows = int(short_abstract.sum())
if imputed_rows:
    df.loc[short_abstract, 'abstract'] = [
        _join_text(title, tldr, abstract)
        for title, tldr, abstract in zip(df.loc[short_abstract, 'title'],
                                         df.loc[short_abstract, 'tldr'],
                                         df.loc[short_abstract, 'abstract'])
    ]
print(f"Number of rows with questionable abstracts imputed with title + tldr + abstract: {imputed_rows}")

# 4. If abstract is NA and tldr is ok, impute abstract with the tldr concatenated with title
# !! note that this step has to come after the last one, otherwise abstract will be double concatenated
missing_abstract = df['abstract'].isna()
na_abstract_count = int(missing_abstract.sum())
if na_abstract_count:
    df.loc[missing_abstract, 'abstract'] = [
        _join_text(title, tldr)
        for title, tldr in zip(df.loc[missing_abstract, 'title'],
                               df.loc[missing_abstract, 'tldr'])
    ]
print(f"Number of rows with missing abstracts imputed with title + tldr: {na_abstract_count}")

# 5. If abstract is ok (after steps 1-4 it should be) and tldr is NA, impute tldr with abstract
missing_tldr = df['tldr'].isna()
na_tldr_count = int(missing_tldr.sum())
df.loc[missing_tldr, 'tldr'] = df.loc[missing_tldr, 'abstract']
print(f"Number of rows with tldr imputed: {na_tldr_count}")

# 6. Drop rows where publicationDate is NA - it doesn't make sense to keep these rows since the
# RShiny app has a plot of paper topic trends over time. Also, these extra ~1500 observations slow down
# RShiny app performance. However, this leaves many good papers on the table, so use discretion.
# initial_row_count = df.shape[0]
# df = df.dropna(subset=['publicationDate'])
# final_row_count = df.shape[0]
# rows_dropped = initial_row_count - final_row_count
# print(f"Number of rows dropped where publicationDate was NA (dropped): {rows_dropped}")

df.to_csv('papers.csv', index=False)

# Print final paper count
print(f"\nPapers left: {df.shape[0]}")

Papers to start with: 14882

Number of papers where both tldr and abstract are NA (dropped): 487
Number of rows with questionable abstracts imputed with title + tldr + abstract: 78
Number of rows with missing abstracts imputed with title + tldr: 530
Number of rows with tldr imputed: 827

Papers left: 14395


### Now it's time for BERTopic. The first step is to create vector embeddings of the abstract of each article. The embedding model and the precomputed embeddings are then provided to the BERTopic steps that follow, together with the text of the abstracts.
### I found that using the abstract instead of the tldr produces more accurate clusters, a result that is somewhat surprising given the extra noise present in abstracts compared to the concise tldrs.
### The code block below is CPU intensive - ONLY RUN ONCE for a particular Semantic Scholar search query. Once the .json embeddings file is created, you can instead run the blocks below the next markdown cell.

In [11]:
from sentence_transformers import SentenceTransformer
import pandas as pd

df = pd.read_csv('papers.csv')
paper_ids = df['paperId'].tolist()
abstracts = df['abstract'].tolist()

# Encode every abstract with the sentence-transformer model
embedding_model = SentenceTransformer("BAAI/bge-small-en")
abstract_embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

##### It takes 30 minutes to generate abstract embeddings for 13k abstracts on my 16 GB macbook pro.
##### It is useful to save these to a file in case you need them later.

# Map each paperId to its abstract embedding, converting the numpy vectors to
# plain lists so the mapping can be serialized as JSON
paperId_to_embedding = {
    pid: vector.tolist() for pid, vector in zip(paper_ids, abstract_embeddings)
}
with open('abstract_embeddings.json', 'w') as file:
    json.dump(paperId_to_embedding, file)

Batches: 100%|██████████| 405/405 [27:53<00:00,  4.13s/it] 


### Run these cells to quickly update abstract_embeddings from an existing embeddings file in case your papers.csv file has been updated with new papers

In [9]:
import pandas as pd
import json

df = pd.read_csv('papers.csv')

# Normalize the CSV IDs to whitespace-free strings before comparing
df['paperId'] = df['paperId'].astype(str).str.strip()
paper_ids_csv = set(df['paperId'])

# IDs that already have a stored embedding (the JSON keys, as strings)
with open('abstract_embeddings.json', 'r') as file:
    existing_embeddings = json.load(file)
paper_ids_json = {str(key) for key in existing_embeddings}

# The two sets are identical exactly when neither has an ID the other lacks
diff_csv_not_json = paper_ids_csv - paper_ids_json
diff_json_not_csv = paper_ids_json - paper_ids_csv

if not diff_csv_not_json and not diff_json_not_csv:
    print("The paperId sets from CSV and JSON are identical.")
else:
    print("There are differences between the paperId sets.")

    # Size of each one-sided difference
    print(f"IDs in CSV but not in JSON: {len(diff_csv_not_json)}")
    print(f"IDs in JSON but not in CSV: {len(diff_json_not_csv)}")

    # A few sample IDs from each side for inspection
    print("Sample IDs in CSV but not in JSON:", list(diff_csv_not_json)[:10])
    print("Sample IDs in JSON but not in CSV:", list(diff_json_not_csv)[:10])


There are differences between the paperId sets.
IDs in CSV but not in JSON: 166
IDs in JSON but not in CSV: 53
Sample IDs in CSV but not in JSON: ['3f2c6aa0b347f10c02ee6b5e81d857f003bf3e43', '446c951de45439c3e53903916bf1b85c0475109e', 'ac47bd3b512301371fc87c68416befce6589912e', '80e642da57d4c7d4d36a770a810268e37556de27', '071404ea62a7e32d93075a570579e438596a970b', '5650b7243d27a91fe10c68bc84c80197cddaa8dc', 'a6868c28bfb842930e31216441c05526a2c6f752', 'bcaaf83946ffe764e3405799fae739582082bc6f', 'b411c8f98865565f54642af1a5c010bde6beaedc', '13eacc692aeb58c7987c535c439eeb345076bea2']
Sample IDs in JSON but not in CSV: ['59b9f84dccf23c8262b09db62e0b0a57c864b02b', '71d206e927017ff4a5de71b7bd1b0fa02a23ad9f', '6483a6f2038cd8583ad5b6678602bc904459a7f7', '9ff92d31babb7bdaecf7220b0a81c701230d8b95', 'd5c2947cab82c44e3cca8e90486da10a81e1f697', 'cc7658bd47cac361abad0b42b71a69b35ab6a23a', '0a29c1be5a2ecdf57f4615d7fe0201b176706439', '655eeac08f2c81f4e62ce72322a31af9ee11a9c2', '65e1efa56dea7067d0631304

In [10]:
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
import numpy as np

# Step 1: Load existing embeddings and convert them back to numpy arrays
with open('abstract_embeddings.json', 'r') as file:
    existing_embeddings = {pid: np.array(vector) for pid, vector in json.load(file).items()}

# Step 2: Load the DataFrame and identify new papers
df = pd.read_csv('papers.csv')
paper_ids = df['paperId'].tolist()
abstracts = df['abstract'].tolist()
# Each new ID is paired directly with its own abstract, so the texts that get
# encoded can never fall out of step with the IDs they are stored under.
new_papers = {pid: abstract
              for pid, abstract in zip(paper_ids, abstracts)
              if pid not in existing_embeddings}
new_paper_ids = list(new_papers)
print(f"Number of new papers: {len(new_paper_ids)}")

# The model is created unconditionally: the BERTopic cell further down passes
# `embedding_model` to BERTopic even when there is nothing new to encode here.
embedding_model = SentenceTransformer("BAAI/bge-small-en")

# Step 3: Generate embeddings for new papers if there are any
if new_paper_ids:
    new_abstracts = list(new_papers.values())
    new_embeddings = embedding_model.encode(new_abstracts, show_progress_bar=True)

    # Update existing_embeddings with new embeddings
    existing_embeddings.update(zip(new_paper_ids, new_embeddings))

    # Save the updated embeddings (numpy arrays become lists for JSON)
    with open('abstract_embeddings.json', 'w') as file:
        json.dump({pid: emb.tolist() for pid, emb in existing_embeddings.items()}, file)

    print(f"Appended {len(new_paper_ids)} new embeddings.")

# Step 4: Create a numpy array of embeddings in the order of paperIds in papers.csv
abstract_embeddings = np.array([existing_embeddings[pid] for pid in paper_ids], dtype=np.float32)

  from .autonotebook import tqdm as notebook_tqdm


Number of new papers: 166


Batches: 100%|██████████| 6/6 [00:25<00:00,  4.22s/it]


Appended 166 new embeddings.


### Define the clustering and dimensionality reduction models used in BERTopic

In [11]:
from umap import UMAP
from hdbscan import HDBSCAN

# technique that helps in reducing the high-dimensional embeddings (generated from the text data)
# into a lower-dimensional space. This step is crucial because it makes the clustering step that
# follows more manageable and can improve the quality of the clusters found by the clustering algorithm.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=12345)

# clustering algorithm that identifies clusters of points in the reduced dimensionality space.
# It works well with how UMAP structures the data and is capable of finding clusters of varying densities.
# I've found that a cluster size of .003 the size of the number of data points works best for creating more
# evenly distributed categories.
# The size is clamped to at least 2: for small corpora (fewer than ~667 papers) the
# 0.3% rule would yield 0 or 1, which is not a usable minimum cluster size.
min_cluster_size = max(2, int(len(abstracts) * .003))
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

### Define the representation models used for BERTopic. These are what actually make sense of the clusters and label them with words or phrases. There are many options for these. You can even use the BERTopic OpenAI integration to create a summary of your liking.

In [13]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI
import openai
import tiktoken
import os
from dotenv import load_dotenv

# Read the OpenAI API key from the environment (optionally populated from a .env file)
load_dotenv()
openai_key = os.getenv('OPENAI_KEY')
# Fail early with a message that names the variable this notebook actually reads.
# OPENAI_API_KEY is still accepted because the OpenAI client falls back to it
# when no key is passed explicitly.
if not openai_key and not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "No OpenAI API key found: set OPENAI_KEY (or OPENAI_API_KEY) in the environment or in a .env file."
    )

# models that make "hard to read" labels for created topics
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)

# openAI model that makes "easy to read" labels for created topics
client = openai.OpenAI(api_key=openai_key or None)
tokenizer= tiktoken.encoding_for_model("gpt-3.5-turbo")
# [KEYWORDS] and [DOCUMENTS] are placeholders that BERTopic fills in for each topic
label_prompt = """
# CONTEXT #
I performed clustering analysis on 10000 academic papers about large language models. The clustering 
algorithm found 50 clusters in which to group these papers. The algorithm returned information about the
clusters as a set of keywords that represent the cluster, along with a set of 4 abstracts that are at the
"center" of the cluster

# OBJECTIVE #
I need to generate a short label that will be fed into a dashboard to represent the cluster. The label needs to 
capture the essense of the cluster. All I have to understand what the cluster is about is these keywords and
abstracts that the clustering algorithm returned: ////[KEYWORDS] [DOCUMENTS]////

# OUTPUT # 
Just the short topic label and no other output whatsoever. Use abberviations on for things like "large language
models" or "artificial intelligence", but don't abreviate other terms. The first word in the label cannot be "LLM".
"""
gpt_label = OpenAI(
    client,
    prompt=label_prompt,
    diversity=0.3,
    model="gpt-4-1106-preview",
    delay_in_seconds=2,
    chat=True,
    nr_docs=4,
    doc_length=800,
    tokenizer=tokenizer
)

# Optional - create a summary for each cluster
# summary_prompt = """
# I have a topic that is described by the following keywords: [KEYWORDS]
# The following are some documents that most closely represent the topic: 
# [DOCUMENTS]

# Based on the information above, capture the essense of these 4 documents as a short summary no longer than 80 words.
# You must adhere to this word limit. Also, you must provide output in the following format: topic: <topic summary>
# You MUST make sure not to overly rely on information from any one document.
# """
# gpt_summary = OpenAI(
#     client,
#     prompt=summary_prompt,
#     diversity=0.3,
#     model="gpt-3.5-turbo", 
#     delay_in_seconds=2, 
#     chat=True,
#     nr_docs=4,
#     doc_length=800,
#     tokenizer=tokenizer
# )


# Three different representation models are used; each key becomes a column
# name in the topic info table (e.g. 'GPTLabel' is read back in later cells)
representation_model = {
    "GPTLabel": gpt_label,
    #"GPTSummary": gpt_summary,
    "KeyBERT": keybert,
    "MMR": mmr
}

### Perform the clustering and labeling with BERTopic using all the models we just defined

In [14]:
from bertopic import BERTopic

# Assemble the BERTopic configuration from the sub-models defined in the cells above
bertopic_settings = {
    # Sub-models
    "embedding_model": embedding_model,
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "representation_model": representation_model,
    # Hyperparameters
    "top_n_words": 10,
    "verbose": True,
}
topic_model = BERTopic(**bertopic_settings)

# Fit the model on the abstracts, reusing the precomputed embeddings
topics, probs = topic_model.fit_transform(abstracts, embeddings=abstract_embeddings)

2024-01-12 17:35:30,412 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2024-01-12 17:36:01,396 - BERTopic - Dimensionality - Completed ✓
2024-01-12 17:36:01,397 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-12 17:36:02,145 - BERTopic - Cluster - Completed ✓
2024-01-12 17:36:02,149 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/46 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 46/46 [02:33<00:00,  3.34s/it]
2024-01-12 17:39:21,153 - BERTopic - Representation - Completed ✓


### Output the GPT generated labels to a .txt file to see if they look acceptable

In [15]:
# Dump the GPT-generated label of every topic, one per line, for a quick visual review
df = topic_model.get_topic_info()
gpt_labels = df['GPTLabel']
with open('gpt_labels.txt', 'w') as file:
    file.writelines(f"{label}\n" for label in gpt_labels)

In [16]:
# Preview the first five rows of the topic overview table
topic_model.get_topic_info().head(5)

Unnamed: 0,Topic,Count,Name,Representation,GPTLabel,KeyBERT,MMR,Representative_Docs
0,-1,3952,-1_the_of_and_to,"[the, of, and, to, in, we, that, for, language...",[NLP Applications & Challenges of LLMs],"[nlp, ai, language, model, models, chatgpt, ll...","[the, of, and, to, in, we, that, for, language...",[This paper presents a comprehensive and pract...
1,0,1756,0_and_medical_of_the,"[and, medical, of, the, in, clinical, to, were...",[ChatGPT's Role in Medical Education and Clini...,"[chatgpt, medicine, research, nlp, medical, in...","[and, medical, of, the, in, clinical, to, were...","[1 N o w a d a y s , t h e c o n c e p t o f a..."
2,1,1480,1_image_visual_the_to,"[image, visual, the, to, vision, and, video, w...",[Multimodal LLM Integration & Applications],"[multimodal, model, models, captioning, langua...","[image, visual, the, to, vision, and, video, w...",[The exponential growth of large language mode...
3,2,532,2_students_ai_and_the,"[students, ai, and, the, of, chatgpt, educatio...",[AI Chatbots in Education],"[chatgpt, ai, research, researchers, learning,...","[students, ai, and, the, of, chatgpt, educatio...",[The fear of whether artificial intelligence (...
4,3,511,3_translation_languages_the_language,"[translation, languages, the, language, of, mo...",[Machine Translation Enhancement Techniques],"[translation, nlp, multilingual, language, mon...","[translation, languages, the, language, of, mo...","[. In recent times, our research has focused o..."


### Add each paper's assigned category into our papers.csv file

In [17]:
# Build a {topic number: label} lookup. Topic -1 is labelled 'Unassigned Papers';
# for the other topics a list-valued GPT label is reduced to its first element
# (or 'No Label' when the list is empty), and any other value is used as-is.
topic_info = topic_model.get_topic_info()
topic_dict = {}
for topic_number, gpt_label in zip(topic_info['Topic'], topic_info['GPTLabel']):
    if topic_number == -1:
        label = 'Unassigned Papers'
    elif isinstance(gpt_label, list):
        label = gpt_label[0] if gpt_label else 'No Label'
    else:
        label = gpt_label
    topic_dict[topic_number] = label

# Attach the topic number and its label to every paper, then persist the result
df = pd.read_csv('papers.csv').assign(topicNumber=topics)
df['topicLabel'] = df['topicNumber'].map(topic_dict)

df.to_csv('papers.csv', index=False)


### Create reduced vector embeddings of the paper abstracts for RShiny visualization purposes, and save them to our papers.csv file

In [18]:
# UMAP settings shared by both projections; only the target dimensionality differs
umap_settings = dict(n_neighbors=15, min_dist=0.0, metric='cosine', random_state=12345)
reducedEmbeddings3D = UMAP(n_components=3, **umap_settings).fit_transform(abstract_embeddings)
reducedEmbeddings2D = UMAP(n_components=2, **umap_settings).fit_transform(abstract_embeddings)

# Store each coordinate of the two projections as its own column in papers.csv
df = pd.read_csv('papers.csv')
for axis, column in enumerate(['x3D', 'y3D', 'z3D']):
    df[column] = reducedEmbeddings3D[:, axis]
for axis, column in enumerate(['x2D', 'y2D']):
    df[column] = reducedEmbeddings2D[:, axis]
df.to_csv('papers.csv', index=False)

### Remove columns from papers.csv that will not be used in the RShiny dashboard.

In [19]:
# Remove the 'abstract' and 'topicNumber' columns and save the slimmed-down file
unused_columns = ['abstract', 'topicNumber']
df = pd.read_csv('papers.csv').drop(columns=unused_columns)
df.to_csv('papers.csv', index=False)