In [None]:
%%capture
# BERTopic + llama-cpp-python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
!pip install bertopic datasets

# DataMapPlot
!git clone https://github.com/TutteInstitute/datamapplot.git
!pip install datamapplot/.

# GPU-accelerated HDBSCAN + UMAP
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cupy-cuda12x -f https://pip.cupy.dev/aarch64

!wget https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf
# !wget https://huggingface.co/TheBloke/dolphin-2.7-mixtral-8x7b-GGUF/resolve/main/dolphin-2.7-mixtral-8x7b.Q3_K_M.gguf

In [None]:
# Pinned third-party dependencies, installed into the kernel's environment.
# boto3 and langchain are imported by the S3 / document-loading cell below.
# NOTE(review): the remaining packages are not referenced in the visible cells
# of this notebook — confirm they are still required.
%pip install boto3==1.18.62
%pip install google-api-python-client==2.112.0 
%pip install langchain==0.1.4 
%pip install python-dotenv==1.0.0
%pip install tiktoken==0.5.2
%pip install streamlit==1.36.0
%pip install tqdm==4.66.1
%pip install youtube-transcript-api==0.6.2

In [None]:
import os
import sys
import json
from langchain.docstore.document import Document

from datetime import datetime, timedelta

import boto3
from dateutil.tz import tzutc
from google.colab import userdata


# AWS credentials and bucket settings are read from the Colab secret store
# (`google.colab.userdata`) so they never appear in the notebook itself.
# Note that the secret named 'AWS_SECRET_ACCESS_KEY' is bound to the variable
# AWS_ACCESS_SECRET_KEY, which is the name the S3 helper class below expects.
(
    AWS_ACCESS_KEY_ID,
    AWS_ACCESS_SECRET_KEY,
    S3_BUCKET_NAME,
    AWS_REGION,
) = map(
    userdata.get,
    (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'S3_BUCKET_NAME',
        'AWS_REGION',
    ),
)


class S3Utils:
    """Convenience wrapper around a boto3 S3 client.

    Every method operates on the single bucket named by the module-level
    ``S3_BUCKET_NAME`` constant.
    """

    def __init__(self):
        # NOTE(review): AWS_REGION is read at module level but is not passed to
        # the client here — confirm whether ``region_name`` should be set.
        self.client = boto3.client(
            "s3",
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_ACCESS_SECRET_KEY,
        )

    def get_s3_objects(self, file_path):
        """
        Retrieves an object from the S3 bucket.

        Args:
            file_path (str): The key of the object in the S3 bucket.

        Returns:
            dict: The ``get_object`` response for the requested key.
        """
        return self.client.get_object(Bucket=S3_BUCKET_NAME, Key=file_path)

    def download_s3_file(self, local_file_name, s3_object_key):
        """
        Downloads one S3 object to a local file while drawing a progress bar.

        Args:
            local_file_name (str): Path of the local file to (over)write.
            s3_object_key (str): Key of the object in the S3 bucket.

        Returns:
            None
        """
        meta_data = self.client.head_object(Bucket=S3_BUCKET_NAME, Key=s3_object_key)
        total_length = int(meta_data.get("ContentLength", 0))
        downloaded = 0
        bar_width = 50

        def progress(chunk):
            # Called by boto3 with the number of bytes transferred since the
            # previous call.
            nonlocal downloaded
            downloaded += chunk
            if total_length > 0:
                done = min(bar_width, int(bar_width * downloaded / total_length))
            else:
                # Empty object (or unknown size): nothing to divide by, so show
                # a full bar instead of raising ZeroDivisionError.
                done = bar_width
            sys.stdout.write("\r[%s%s]" % ("=" * done, " " * (bar_width - done)))
            sys.stdout.flush()

        with open(local_file_name, "wb") as f:
            self.client.download_fileobj(
                S3_BUCKET_NAME, s3_object_key, f, Callback=progress
            )

        print(
            f"\nDownloaded {local_file_name} from S3 bucket {S3_BUCKET_NAME} Key {s3_object_key}"
        )

    def upload_file_to_s3(self, file_path, uploadto):
        """
        Uploads a local file to the S3 bucket.

        Args:
            file_path (str): The path of the local file to be uploaded.
            uploadto (str): The key under which the file is stored in the bucket.

        Returns:
            None
        """
        self.client.upload_file(
            Bucket=S3_BUCKET_NAME,
            Filename=file_path,
            Key=uploadto,
        )

    def upload_folder_to_s3(self, local_folder, s3_folder):
        """
        Uploads all files in a local folder (recursively) to an S3 folder.

        Args:
            local_folder (str): The path to the local folder containing the files to upload.
            s3_folder (str): The key prefix under which the files will be uploaded.

        Returns:
            None
        """
        for root, _dirs, files in os.walk(local_folder):
            for file in files:
                local_path = os.path.join(root, file)
                relative_path = os.path.relpath(local_path, local_folder)
                # S3 keys always use "/" as separator, whatever the local OS uses.
                s3_path = os.path.join(s3_folder, relative_path).replace(os.sep, "/")
                self.client.upload_file(local_path, S3_BUCKET_NAME, s3_path)

    def upload_file_text_to_s3(self, text, uploadto):
        """
        Uploads text content as an object in the S3 bucket.

        Args:
            text (str): The text content to be uploaded.
            uploadto (str): The key under which the content is stored in the bucket.

        Returns:
            None
        """
        self.client.put_object(Bucket=S3_BUCKET_NAME, Key=uploadto, Body=text)

    def download_folder_from_s3(self, s3_folder, local_folder):
        """
        Downloads every object under an S3 prefix into a local folder.

        The listing is paginated, so prefixes holding more objects than a
        single ``list_objects_v2`` response can return are downloaded completely.

        Args:
            s3_folder (str): The key prefix to download.
            local_folder (str): The local folder that receives the files; the
                key structure below ``s3_folder`` is recreated inside it.

        Returns:
            bool: True if at least one object was listed under the prefix,
            False otherwise.
        """
        paginator = self.client.get_paginator("list_objects_v2")
        found_any = False

        for page in paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=s3_folder):
            for obj in page.get("Contents", []):
                found_any = True
                s3_path = obj["Key"]

                # Keys ending in "/" are folder placeholders, not files; there
                # is nothing to write to disk for them.
                if s3_path.endswith("/"):
                    continue

                local_path = os.path.join(
                    local_folder, os.path.relpath(s3_path, s3_folder)
                )
                local_dir = os.path.dirname(local_path)

                # Create local directories if they don't exist. An empty
                # directory part means "current directory" and needs no mkdir.
                if local_dir:
                    os.makedirs(local_dir, exist_ok=True)

                self.client.download_file(S3_BUCKET_NAME, s3_path, local_path)

        return found_any

    def filterd_list_md5_checksum(self, directory_name):
        """
        Lists the ETags of recently modified ``.pdf`` / ``.txt`` objects.

        Only objects under ``directory_name`` whose ``LastModified`` timestamp
        is within the last five minutes and whose key ends in ``.pdf`` or
        ``.txt`` (case-insensitive) are considered.

        Args:
            directory_name (str): The key prefix to filter.

        Returns:
            list: The ETag of each matching object with the surrounding double
            quotes removed.
            NOTE(review): the ETag is treated as an MD5 checksum here — confirm
            that holds for the way these objects are uploaded.
        """
        condition_timestamp = datetime.now(tz=tzutc()) - timedelta(minutes=5)
        paginator = self.client.get_paginator("list_objects_v2")
        wanted_suffixes = (".pdf", ".txt")
        object_md5_checksums = []

        for page in paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=directory_name):
            for obj in page.get("Contents", []):
                is_recent = obj["LastModified"] > condition_timestamp
                if is_recent and obj["Key"].lower().endswith(wanted_suffixes):
                    object_md5_checksums.append(obj["ETag"].strip('"'))

        return object_md5_checksums


# Shared module-level S3 helper; instantiating it creates the boto3 client.
# It is used by the document download helper defined further down in this cell.
S3 = S3Utils()

def load_docs_from_jsonl(local_path):
    """Load ``Document`` objects from a JSON Lines file.

    Each non-blank line is parsed as one JSON object whose keys are passed to
    ``Document`` as keyword arguments. Blank lines (for example a trailing
    newline at the end of the file) are skipped instead of raising a JSON
    decoding error.

    Args:
        local_path (str): Path of the ``.jsonl`` file to read.

    Returns:
        list: One ``Document`` per non-blank line, in file order.
    """
    documents = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(local_path, "r", encoding="utf-8") as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                continue
            documents.append(Document(**json.loads(line)))
    return documents


def dowload_document(uuid, local_file_path):
    """Fetch a JSONL document dump from S3 and load it.

    The object key is ``<uuid>/<basename of local_file_path>``; the object is
    saved to ``local_file_path`` and then parsed with ``load_docs_from_jsonl``.

    Args:
        uuid (str): Key prefix under which the file is stored in the bucket.
        local_file_path (str): Where to save the file locally; its base name
            is also the object's file name in S3.

    Returns:
        list: The documents loaded from the downloaded file.
    """
    file_name = os.path.basename(local_file_path)
    object_key = f"{uuid}/{file_name}"
    S3.download_s3_file(local_file_path, object_key)
    return load_docs_from_jsonl(local_file_path)

In [None]:
# Identify the document dump to analyse: the S3 key prefix and the file name
# (which is also the local path the file is saved to).
DOCUMENT_UUID = '8adf1145-b055-4710-b26a-8f52fac88def'
DOCUMENT_FILE = 'Ludwig_2a1ddefc8bcb4653951633c58a01b663_documents.jsonl'

docs = dowload_document(DOCUMENT_UUID, DOCUMENT_FILE)

In [None]:
# The topic model works on plain strings, so keep only each document's text.
# `docs` is rebound from a list of Document objects to a list of strings;
# re-running this cell without re-running the download cell will fail.
texts = [document.page_content for document in docs]
docs = texts

In [None]:
from llama_cpp import Llama

# Load the quantized GGUF model downloaded by the setup cell through llama.cpp.
# NOTE(review): `stop` is supplied to the constructor rather than to the
# generation call — confirm the installed llama-cpp-python honours it here.
llama_settings = dict(
    model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=4096,
    stop=["Q:", "\n"],
)
llm = Llama(**llama_settings)

In [None]:
from bertopic.representation import KeyBERTInspired, LlamaCPP

# Prompt template for the LLM labeller; it contains the literal placeholders
# [DOCUMENTS] and [KEYWORDS].
prompt = """ Q:
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the above information, can you give a short label of the topic of at most 5 words?
A:
"""

# Two representations per topic: a KeyBERT-inspired one and an LLM-generated
# one. The "LLM" key is the one read back when the plot labels are built.
keybert_representation = KeyBERTInspired()
llm_representation = LlamaCPP(llm, prompt=prompt)

representation_model = {
    "KeyBERT": keybert_representation,
    "LLM": llm_representation,
}

## BERTopic

In [None]:
from sentence_transformers import SentenceTransformer
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
# Alternative imports (umap / hdbscan packages), kept for reference:
# from umap import UMAP
# from hdbscan import HDBSCAN

# Embed every document once up front; the same embeddings feed both the 2-D
# projection below and the topic model's fit.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Separate 2-D projection that is only used for the final scatter plot.
visual_reducer = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = visual_reducer.fit_transform(embeddings)

In [None]:
# Dimensionality-reduction and clustering sub-models handed to BERTopic.
# The UMAP settings match the 2-D visualisation projection except that this
# one reduces to 5 components.
umap_settings = dict(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_settings = dict(
    min_cluster_size=400,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

umap_model = UMAP(**umap_settings)
hdbscan_model = HDBSCAN(**hdbscan_settings)

## Training

In [None]:
from bertopic import BERTopic

# Sub-models prepared in the earlier cells.
sub_models = dict(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
)

# Hyperparameters of the topic model itself.
hyperparameters = dict(
    top_n_words=10,
    verbose=True,
)

topic_model = BERTopic(**sub_models, **hyperparameters)

# Fit on the raw texts, reusing the pre-computed embeddings instead of
# encoding the documents again.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
import datamapplot
import re


def _clean_llm_label(raw_label):
    """Turn a raw LLM answer into a compact, single-line plot label.

    Keeps only the first line, drops double quotes, collapses every run of
    non-word characters into one space and strips the result, so an answer
    that contains no word characters yields an empty string.
    """
    first_line = raw_label.split("\n")[0].replace('"', '')
    return re.sub(r'\W+', ' ', first_line).strip()


# One cleaned label per topic, in the order the topic model reports them.
# Labels that end up empty fall back to "Unlabelled".
llm_labels = [
    _clean_llm_label(label[0][0]) or "Unlabelled"
    for label in topic_model.get_topics(full=True)["LLM"].values()
]

# Create a label for each document. Topic -1 is always shown as "Unlabelled";
# for the other topics the index is shifted by `topic_model._outliers`.
# NOTE(review): this relies on a private BERTopic attribute — presumably it
# offsets for an outlier entry at the front of the label list; confirm.
all_labels = [
    llm_labels[topic + topic_model._outliers] if topic != -1 else "Unlabelled"
    for topic in topics
]

# Run the visualization
# NOTE(review): the title mentions ArXiv — confirm it matches the document set
# that was downloaded earlier in this notebook.
datamapplot.create_plot(
    reduced_embeddings,
    all_labels,
    label_font_size=11,
    title="ArXiv - BERTopic",
    sub_title="Topics labeled with `openhermes-2.5-mistral-7b`",
    label_wrap_width=20,
    use_medoids=True,
    logo_width=0.16
)