# Mapping Top-5 ASD Terms Using OpenAI Embeddings and BERTopic

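This notebook demonstrates how to use OpenAI embeddings to map each summarized clinical note to its top-5 most similar ASD terms, and how to apply BERTopic modeling to those terms to identify ASD behavior patterns that can support treatment planning.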

In [None]:

import os
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
import textwrap

# Check if CUDA is available and set the device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the ASD term table.
# The location can be overridden with the ASD_TERMS_CSV environment variable so
# the notebook is not tied to one machine's home directory; when the variable
# is unset, the original hard-coded path is used.
ASD_TERMS_CSV = os.environ.get(
    'ASD_TERMS_CSV',
    '/home/skbae/Documents/skbae/ASD/PJT_Data/Questions/Work/cdb_advanced_ASD_F3.csv',
)
data = pd.read_csv(ASD_TERMS_CSV)

# Fail fast: the 'Name' and 'ID' columns are read further down, after the
# embedding calls have already been made, so a wrong file should be rejected here.
missing_columns = {'Name', 'ID'} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"ASD term table {ASD_TERMS_CSV!r} is missing required column(s): {sorted(missing_columns)}"
    )

# Preprocess the data
def preprocess_text(text):
    """Lower-case ``text`` and collapse every run of whitespace into one space.

    Args:
        text: The raw term. Normally a string; missing values (``None``, NaN,
            ``pd.NA`` -- e.g. an empty CSV cell) and other non-string scalars
            are accepted as well.

    Returns:
        str: The normalized text. Missing values give ``""``; other
        non-string values are converted with ``str()`` before normalization.
    """
    if not isinstance(text, str):
        # pandas represents empty cells as NaN (a float), which has no .split().
        if pd.isna(text):
            return ""
        text = str(text)
    return " ".join(word.lower() for word in text.split())

# Normalized copy of every term name, stored next to the raw 'Name' column
data['Processed_Name'] = data['Name'].apply(preprocess_text)

# Index the normalized term names in FAISS, embedded with OpenAI embeddings,
# and expose the index through a retriever
vectorstore = FAISS.from_texts(
    texts=data['Processed_Name'].tolist(),
    embedding=OpenAIEmbeddings(),
)
retriever = vectorstore.as_retriever()

# Clinical notes that will be mapped onto the terms
df_merged = pd.read_csv(
    './df_merged_summary2_LLM_F_Apr012F.csv',
    encoding='utf-8',
)


## Step 1: Process Text to Find Top-5 ASD Terms

In [None]:

# Settings shared by the top-5 lookup
max_length = 50  # maximum number of characters per chunk produced by textwrap.wrap
embeddings_model = OpenAIEmbeddings()

# Cache of the term embeddings. The term list is identical for every row, so it
# is embedded once instead of once per row; 'terms' remembers which list the
# cached tensor belongs to so a changed `data` triggers a fresh embedding.
_term_embedding_cache = {'terms': None, 'embeddings': None}


def _get_term_embeddings():
    """Return the embeddings of all ``data['Processed_Name']`` entries as a tensor.

    The embeddings are computed on first use and re-computed only when the
    list of processed names differs from the one that was cached.
    """
    terms = data['Processed_Name'].tolist()
    if _term_embedding_cache['terms'] != terms:
        _term_embedding_cache['embeddings'] = torch.tensor(
            embeddings_model.embed_documents(terms)
        )
        _term_embedding_cache['terms'] = terms
    return _term_embedding_cache['embeddings']


def process_text_row(text):
    """Find the five terms in ``data`` that are most similar to ``text``.

    The text is split into chunks of at most ``max_length`` characters, every
    chunk is embedded, and each term is scored by its highest cosine
    similarity to any chunk. (Previously only the first chunk was used for the
    ranking even though all chunks were embedded.)

    Args:
        text: The text to map. Non-string values (e.g. NaN from an empty CSV
            cell) and texts that produce no chunks are treated as missing.

    Returns:
        tuple: ``(names, ids)`` where both are strings of the form
        ``"('a', 'b', ...)"`` holding the best-matching ``Processed_Name``
        values and their ``ID`` values, best match first; ``(None, None)``
        when there is nothing to process.
    """
    # textwrap.wrap only accepts strings; NaN/None rows carry no text to map.
    if not isinstance(text, str):
        return None, None

    chunks = textwrap.wrap(text, max_length)
    if not chunks:
        return None, None

    chunk_embeddings = embeddings_model.embed_documents(chunks)
    term_embeddings = _get_term_embeddings()

    # cosine_scores[i, j] = similarity between chunk i and term j
    cosine_scores = util.pytorch_cos_sim(chunk_embeddings, term_embeddings)

    # Score each term by its best-matching chunk, then rank terms descending.
    best_scores = cosine_scores.max(dim=0).values.cpu().numpy()
    top_indices = np.argsort(best_scores)[::-1][:5]

    top_5_similar = data['Processed_Name'].iloc[top_indices].tolist()
    # IDs may be numeric; str.join only accepts strings.
    top_5_id = [str(term_id) for term_id in data['ID'].iloc[top_indices].tolist()]

    top_5_similar_str = "', '".join(top_5_similar)
    top_5_id_str = "', '".join(top_5_id)

    return f"('{top_5_similar_str}')", f"('{top_5_id_str}')"

# Map every summarized note to its top-5 terms; process_text_row yields a
# (names, ids) pair per row, which zip(*...) splits into two column-wise tuples
(
    df_merged['Top_5_Similar_Processed_NameF'],
    df_merged['Top_5_IDF'],
) = zip(*df_merged['summarized_text_llm2F'].apply(process_text_row))

# Persist the annotated notes for the next step
df_merged.to_csv(
    './df_top5_preprocessing_OPENAI_RAG_summarized_similarity_Apr12F2_openAI_ID_F.csv',
    index=False,
)


## Step 2: Group and Merge Results

In [None]:

# Read back the notes annotated with their top-5 terms
df_openAI = pd.read_csv(
    './df_top5_preprocessing_OPENAI_RAG_summarized_similarity_Apr12F2_openAI_ID_F.csv',
    encoding='utf-8',
)

# Replace missing values with empty strings: ', '.join below needs strings
for top5_column in ('Top_5_Similar_Processed_NameF', 'Top_5_IDF'):
    df_openAI[top5_column] = df_openAI[top5_column].fillna('')

# One row per 'patient_key', with that patient's entries joined by ', '
merged_df = (
    df_openAI
    .groupby('patient_key')['Top_5_Similar_Processed_NameF']
    .apply(', '.join)
    .reset_index()
)
merged_df_id = (
    df_openAI
    .groupby('patient_key')['Top_5_IDF']
    .apply(', '.join)
    .reset_index()
)

# Write the merged term names to disk
merged_df.to_csv('./merged_top5_ASD_terms.csv', index=False)


## Step 3: BERTopic Modeling for Behavior Pattern Identification

In [None]:

# One document per row of merged_df, forced to str
docs = [str(doc) for doc in merged_df['Top_5_Similar_Processed_NameF']]

# Encode the documents up front so BERTopic can reuse the embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Dimensionality reduction, clustering and tokenization components
umap_model = UMAP(
    n_neighbors=6,
    n_components=6,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
vectorizer = CountVectorizer(ngram_range=(1, 1))

# Assemble the BERTopic pipeline from the components above
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    ctfidf_model=ClassTfidfTransformer(),
    calculate_probabilities=True,
    top_n_words=10,
    verbose=True,
)

# Fit on the documents, passing the pre-computed embeddings
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Export the per-topic summary table
topic_info = topic_model.get_topic_info()
topic_info.to_csv('./topic_info_Apr16.csv', index=False)


## Step 4: Visualization and Insights

In [None]:

# Visualize the topics.
# Jupyter only renders the value of a cell's last expression, and this cell
# ends with a loop, so each figure is shown explicitly.
topic_model.visualize_barchart().show()
topic_model.visualize_topics().show()

# Display topic details for every topic except -1 (the outlier topic).
# The ids are taken from the topic table itself rather than assumed to be
# 0..len-2, which would drop the last topic whenever no outlier row exists.
topic_ids = sorted(
    int(topic_id)
    for topic_id in topic_model.get_topic_info()['Topic']
    if topic_id != -1
)
for i in topic_ids:
    topic_words = topic_model.get_topic(i)
    print(f"Topic {i}: {topic_words}\n")
