In [1]:
pip install bertopic



Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

In [3]:
# Load the cleaned posts CSV and keep only the four columns used below.
cols = ['Id', 'Title', 'Text', 'OriginalText']
preprocessed_df = pd.read_csv('CleanedLLMsPostsFromSO.csv')[cols]
preprocessed_df

Unnamed: 0,Id,Title,Text,OriginalText
0,32333312,extract chunk BIO chunk sentence ? - python,extract chunk BIO chunk sentence ? - python ...,How to extract chunks from BIO chunked sentenc...
1,33695244,use Completion Suggester match ngram query,use Completion Suggester match ngram query I...,Use Completion Suggester to match against all ...
2,33941091,chunk document test plagiarism,chunk document test plagiarism I build plagi...,Chunking documents to test for plagiarism I a...
3,34090734,use nltk regex pattern extract specific phrase...,use nltk regex pattern extract specific phrase...,How to use nltk regex pattern to extract a spe...
4,34318427,itextsharp : word break split textchunk word,itextsharp : word break split textchunk word ...,itextsharp: words are broken when splitting te...
...,...,...,...,...
8588,78981951,set random seed Chroma DB ?,set random seed Chroma DB ? I m experiment d...,How do I set the random seed for Chroma DB? I...
8589,78982153,send parameter directly LLM langchain,send parameter directly LLM langchain curren...,How to send a parameter directly to LLM in lan...
8590,78984423,Azure Document Intelligence Custom Classificat...,Azure Document Intelligence Custom Classificat...,Azure Document Intelligence Custom Classificat...
8591,78984512,protect Routes Edge - Runtime t3 Stack / Verce...,protect Routes Edge - Runtime t3 Stack / Verce...,Protecting Routes in an Edge-Runtime with T3 S...


# Pre-calculate Embeddings

In [4]:
from sentence_transformers import SentenceTransformer

## Load the sentence-embedding model
# Only the model is instantiated here; the documents are encoded in a later cell.
embedding_model = SentenceTransformer(model_name_or_path="multi-qa-MiniLM-L6-dot-v1")

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [5]:
## Lemmatize Original Text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pandas as pd
import re
from bs4 import BeautifulSoup

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import spacy

import nltk
# Fetch NLTK's 'punkt' resource.
# NOTE(review): nothing visible in this notebook calls word_tokenize or any
# other punkt consumer -- confirm whether this download is still needed.
nltk.download('punkt')
# Load spaCy's 'en_core_web_sm' pipeline; lemmatize_text() below runs every
# document through this module-level `nlp` object.
nlp =  spacy.load('en_core_web_sm')
import spacy
def lemmatize_text(text):
    """Return `text` with every token replaced by its spaCy lemma.

    The text is run through the module-level `nlp` pipeline and the lemmas of
    all tokens (no token is filtered out) are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the float NaN
        pandas produces for an empty CSV cell, or None) is treated as an
        empty document.

    Returns
    -------
    str
        Space-separated lemmas, or '' when `text` is not a string.
    """
    # Missing cells arrive as NaN/None; handing those to `nlp` raises, which
    # would abort the whole column-wide apply. Map them to an empty document.
    if not isinstance(text, str):
        return ''
    return ' '.join(token.lemma_ for token in nlp(text))


2024-10-10 12:26:47.143651: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-10 12:26:47.144976: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-10 12:26:47.161116: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-10 12:26:47.161134: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-10 12:26:47.161784: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [6]:
# Overwrite the OriginalText column with its lemmatized form, row by row.
preprocessed_df['OriginalText'] = [
    lemmatize_text(raw_text) for raw_text in preprocessed_df['OriginalText']
]
preprocessed_df['OriginalText']

0       how to extract chunk from BIO chunk sentence ?...
1       use Completion Suggester to match against all ...
2       chunk document to test for plagiarism   I be b...
3       how to use nltk regex pattern to extract a spe...
4       itextsharp : word be break when split textchun...
                              ...                        
8588    how do I set the random seed for Chroma DB ?  ...
8589    how to send a parameter directly to LLM in lan...
8590    Azure Document Intelligence Custom Classificat...
8591    Protecting Routes in an Edge - runtime with t3...
8592    yolov5 Class Imbalance and Overfitting Issues ...
Name: OriginalText, Length: 8593, dtype: object

# Dimensionality Reduction

In [7]:
from umap import UMAP

# UMAP reducer handed to BERTopic: 5 output components, Euclidean metric,
# and a fixed random_state so repeated runs use the same seed.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.1,
    metric='euclidean',
    random_state=42,
)
umap_model

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [8]:
from hdbscan import HDBSCAN

# HDBSCAN clusterer handed to BERTopic: clusters need at least 150 members
# and are selected with the 'leaf' method.
hdbscan_model = HDBSCAN(
    min_cluster_size=150,
    metric='euclidean',
    cluster_selection_method='leaf',
    prediction_data=True,
)
hdbscan_model

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer handed to BERTopic: unigrams and bigrams, English
# stop words removed, min_df=2.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

In [11]:
## Embedding Generation
# encode() is documented to take a string or a list of strings. Handing it
# the pandas Series only works while the Series index is the default
# 0..n-1 range, because encode() looks sentences up by integer position
# after sorting them by length. Passing a plain list removes that dependency.
embeddings_OriginalText = embedding_model.encode(
    list(preprocessed_df['OriginalText']),
    show_progress_bar=True,
)

Batches:   0%|          | 0/269 [00:00<?, ?it/s]

In [12]:
from bertopic import BERTopic
# Assemble the BERTopic pipeline from the embedding, UMAP, HDBSCAN and
# vectorizer objects built in the earlier cells; each topic is described
# by its top 20 words.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=20,
    verbose=True,
)

In [13]:
## Topics Generation
# Fit the model on the lemmatized text, supplying the embeddings computed
# in the previous step, then show the per-topic summary table.
topics, probs = topic_model.fit_transform(
    documents=preprocessed_df['OriginalText'],
    embeddings=embeddings_OriginalText,
)
topic_model.get_topic_info()

2024-10-10 12:28:58,935 - BERTopic - Reduced dimensionality
2024-10-10 12:28:59,368 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4344,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...
1,0,1782,0_api_openai_use_error,"[api, openai, use, error, code, response, try,...",[OpenAI API not read api key in my .env.local ...
2,1,606,1_gym_environment_openai gym_use,"[gym, environment, openai gym, use, action, tr...",[error in import environment OpenAI Gym / can ...
3,2,440,2_model_bert_huggingface_use,"[model, bert, huggingface, use, train, transfo...",[how Fine - tune my train model ( bert ) on an...
4,3,394,3_vector_document_chromadb_use,"[vector, document, chromadb, use, chroma, stor...",[how to retrieve id and metadata associate wit...
5,4,361,4_llama_gpu_model_use,"[llama, gpu, model, use, run, error, index, tr...",[unable for send multiple input use Llama CPP ...
6,5,258,5_whisper_audio_file_transcribe,"[whisper, audio, file, transcribe, use, audio ...",[I be use Whisper to transcribe and I be get t...
7,6,212,6_langchain_error_import_try,"[langchain, error, import, try, use, code, pyt...",[can not run simple intro langchain applicatio...
8,7,196,7_langchain_tool_use_agent,"[langchain, tool, use, agent, chain, prompt, a...","[LangChain , terminate a chain on specific too..."


In [14]:
import os

# Write the per-topic summary table to disk. The output directory is created
# first so the export does not fail when it is missing.
# NOTE(review): this is an absolute, machine-specific path; consider moving
# it to a configurable constant.
topics_info_csv = '/home/kha060/PhD/Developer Challenges LLM/Stack_Overflow/GeneratedTopics/LLM-topics_info-Using-SO-data.csv'
os.makedirs(os.path.dirname(topics_info_csv), exist_ok=True)
topic_model.get_topic_info().to_csv(topics_info_csv)

# Document Distribution

In [15]:
# Per-document topic assignment for the same texts the model was fitted on.
document_distribution = topic_model.get_document_info(
    docs=preprocessed_df['OriginalText'],
)
document_distribution

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,how to extract chunk from BIO chunk sentence ?...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
1,use Completion Suggester to match against all ...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
2,chunk document to test for plagiarism I be b...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
3,how to use nltk regex pattern to extract a spe...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
4,itextsharp : word be break when split textchun...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
...,...,...,...,...,...,...,...,...
8588,how do I set the random seed for Chroma DB ? ...,3,3_vector_document_chromadb_use,"[vector, document, chromadb, use, chroma, stor...",[how to retrieve id and metadata associate wit...,vector - document - chromadb - use - chroma - ...,1.000000,False
8589,how to send a parameter directly to LLM in lan...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
8590,Azure Document Intelligence Custom Classificat...,0,0_api_openai_use_error,"[api, openai, use, error, code, response, try,...",[OpenAI API not read api key in my .env.local ...,api - openai - use - error - code - response -...,0.898531,False
8591,Protecting Routes in an Edge - runtime with t3...,0,0_api_openai_use_error,"[api, openai, use, error, code, response, try,...",[OpenAI API not read api key in my .env.local ...,api - openai - use - error - code - response -...,0.927037,False


In [16]:
# Place the post columns and the per-document topic columns side by side;
# pandas aligns the two frames on their index.
concatenated_df = pd.concat(
    objs=[preprocessed_df, document_distribution],
    axis='columns',
)
concatenated_df

Unnamed: 0,Id,Title,Text,OriginalText,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,32333312,extract chunk BIO chunk sentence ? - python,extract chunk BIO chunk sentence ? - python ...,how to extract chunk from BIO chunk sentence ?...,how to extract chunk from BIO chunk sentence ?...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
1,33695244,use Completion Suggester match ngram query,use Completion Suggester match ngram query I...,use Completion Suggester to match against all ...,use Completion Suggester to match against all ...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
2,33941091,chunk document test plagiarism,chunk document test plagiarism I build plagi...,chunk document to test for plagiarism I be b...,chunk document to test for plagiarism I be b...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
3,34090734,use nltk regex pattern extract specific phrase...,use nltk regex pattern extract specific phrase...,how to use nltk regex pattern to extract a spe...,how to use nltk regex pattern to extract a spe...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
4,34318427,itextsharp : word break split textchunk word,itextsharp : word break split textchunk word ...,itextsharp : word be break when split textchun...,itextsharp : word be break when split textchun...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8588,78981951,set random seed Chroma DB ?,set random seed Chroma DB ? I m experiment d...,how do I set the random seed for Chroma DB ? ...,how do I set the random seed for Chroma DB ? ...,3,3_vector_document_chromadb_use,"[vector, document, chromadb, use, chroma, stor...",[how to retrieve id and metadata associate wit...,vector - document - chromadb - use - chroma - ...,1.000000,False
8589,78982153,send parameter directly LLM langchain,send parameter directly LLM langchain curren...,how to send a parameter directly to LLM in lan...,how to send a parameter directly to LLM in lan...,-1,-1_use_model_try_error,"[use, model, try, error, code, langchain, work...",[streamlit not take more than one prompt for c...,use - model - try - error - code - langchain -...,0.000000,False
8590,78984423,Azure Document Intelligence Custom Classificat...,Azure Document Intelligence Custom Classificat...,Azure Document Intelligence Custom Classificat...,Azure Document Intelligence Custom Classificat...,0,0_api_openai_use_error,"[api, openai, use, error, code, response, try,...",[OpenAI API not read api key in my .env.local ...,api - openai - use - error - code - response -...,0.898531,False
8591,78984512,protect Routes Edge - Runtime t3 Stack / Verce...,protect Routes Edge - Runtime t3 Stack / Verce...,Protecting Routes in an Edge - runtime with t3...,Protecting Routes in an Edge - runtime with t3...,0,0_api_openai_use_error,"[api, openai, use, error, code, response, try,...",[OpenAI API not read api key in my .env.local ...,api - openai - use - error - code - response -...,0.927037,False


In [17]:
# Topic ids present in the data, in order of first appearance.
topic_column = concatenated_df['Topic']
distinct_topics = topic_column.unique()
distinct_topics

array([-1,  1,  0,  6,  2,  3,  4,  5,  7])

In [18]:
# Write one CSV per topic id, each holding only the rows assigned to it.
for topic in distinct_topics:
    filename = f"/home/kha060/PhD/Developer Challenges LLM/Stack_Overflow/GeneratedTopics/{topic}_Topics_data.csv"
    belongs_to_topic = concatenated_df['Topic'] == topic
    df_topic = concatenated_df[belongs_to_topic]
    df_topic.to_csv(filename)