In [None]:
# Install required libraries
!pip install faiss-cpu
!pip install sentence-transformers
!pip install openai
!pip install openai langchain pandas
!pip install openai langchain langchain_community

In [2]:
import openai
from sentence_transformers import SentenceTransformer
import json
import os
import pickle
import numpy as np
from collections import defaultdict
import faiss
import regex as re
import math
from langchain.chat_models import ChatOpenAI

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the sentence-embedding model once; it is passed to the pipeline below.
# On first use the model files are fetched from the Hugging Face Hub.
embed_model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m-v1.5")
print("Setup complete!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/253 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/272k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/108 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Setup complete!


In [4]:
# Never embed the API key in the notebook: take it from the environment and,
# if it is not set, ask for it interactively without echoing the input.
# NOTE: the key that was previously committed in this cell must be treated as
# compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai_api_key = os.environ["OPENAI_API_KEY"]

In [8]:
def filter_comments(video_comments):
    filtered_comments = {}
    for video_id, comments in video_comments.items():
        filtered_comments[video_id] = [
            comment for comment in comments
            if is_valid_comment(comment)
        ]
    return filtered_comments

def is_valid_comment(comment):
    """
    Decide whether a comment should be kept for embedding and classification.

    A comment is rejected when it is not a string, is empty after trimming,
    consists only of punctuation/symbols or only of digits, contains a run of
    more than twenty identical characters, contains a link, or contains one of
    a few common spam phrases.

    :param comment: The raw comment text.
    :return: True if the comment should be kept, False otherwise.
    """
    # Data loaded from JSON can contain null or other non-text entries; treat
    # them as invalid instead of failing on .strip().
    if not isinstance(comment, str):
        return False

    comment = comment.strip()

    # Condition 1: Non-empty after trimming whitespace
    if not comment:
        return False

    # Condition 2: Not purely special characters and not purely digits
    if re.fullmatch(r"[^\w\s]+", comment) or re.fullmatch(r"\d+", comment):
        return False

    # Condition 3: No excessive repeated characters. The pattern needs one
    # character followed by at least 20 repeats, i.e. a run of 21 or more, so
    # ordinary elongations such as "loooool" are still accepted.
    if re.search(r"(.)\1{20,}", comment):
        return False

    # Condition 4: Not spam-like (links or common spam phrases)
    if re.search(r"(https?:\/\/|www\.)", comment):  # Links
        return False
    # NOTE(review): this is a substring match, so e.g. "subscribed" is rejected too.
    if re.search(r"(buy now|subscribe|click here|free money)", comment, re.IGNORECASE):
        return False

    return True


In [6]:
def compute_and_store_embeddings(comment_dict, embed_model, save_path="embeddings_progress.pkl", save_every=2000):
    """
    Embed every distinct comment and persist the results so work can be resumed.

    Previously saved embeddings are loaded from ``save_path`` and reused; only
    comments that are not yet present are encoded. Progress is written to disk
    every ``save_every`` new embeddings and also when the loop is aborted
    (e.g. by a keyboard interrupt), so an interrupted run does not lose the
    work done so far.

    :param comment_dict: Dictionary mapping a video ID to its list of comments.
    :param embed_model: Object exposing ``encode(text)`` that returns the embedding.
    :param save_path: Pickle file used to load and store the embeddings.
    :param save_every: Number of new embeddings between checkpoints; a falsy
                       value disables intermediate checkpoints (a final save
                       still happens).
    :return: Dictionary mapping each comment to its embedding.
    """
    def _checkpoint():
        # Write to a temporary file first and swap it in afterwards, so an
        # interruption in the middle of a write cannot corrupt an existing checkpoint.
        tmp_path = f"{save_path}.tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(embeddings, f)
        os.replace(tmp_path, save_path)

    # Load existing progress if available
    if os.path.exists(save_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # checkpoint files that this notebook wrote itself.
        with open(save_path, "rb") as f:
            embeddings = pickle.load(f)
        print(f"Loaded progress from {save_path}")
    else:
        embeddings = {}

    total_comments = sum(len(comments) for comments in comment_dict.values())
    processed_comments = len(embeddings)
    print(f"Total comments to process: {total_comments}, already processed: {processed_comments}")

    new_embeddings_computed = False  # True once at least one new embedding exists
    unsaved = 0  # New embeddings not yet written to disk

    try:
        for comments in comment_dict.values():
            for comment in comments:
                if comment in embeddings:
                    continue
                try:
                    embeddings[comment] = embed_model.encode(comment)
                except Exception as e:
                    # Best effort: report the failure and continue with the next comment.
                    print(f"Error processing '{comment}': {e}")
                    continue

                processed_comments += 1
                unsaved += 1
                new_embeddings_computed = True
                if processed_comments % 2000 == 0:
                    print(f"Processed {processed_comments}/{total_comments} comments")
                if save_every and unsaved >= save_every:
                    _checkpoint()
                    unsaved = 0
    finally:
        # Runs on normal completion and on abort alike, so nothing computed is lost.
        if unsaved:
            _checkpoint()

    if new_embeddings_computed:
        print(f"Saved updated embeddings to {save_path}")
    else:
        print("No new embeddings were computed. Skipping save.")

    print(f"Embedding computation complete. Total embeddings: {len(embeddings)}")
    return embeddings


In [7]:
def cluster_comments(video_comments, embeddings):
    """
    Clusters the video comments, using the square root of the number of comments as the cluster count.

    The embeddings are copied, L2-normalised and clustered with FAISS k-means on
    an inner-product index; every comment is then assigned to its nearest centroid.

    :param video_comments: List of comments.
    :param embeddings: List of embeddings corresponding to the comments (same order).
    :return: A defaultdict of clustered comments where keys are cluster indices (plain ints)
             and values are lists of comments. Empty when there is nothing to cluster.
    """
    clustered_comments = defaultdict(list)

    # np.array makes a copy, so the in-place normalisation below never touches the caller's data
    embeddings = np.array(embeddings).astype("float32")

    # Nothing to cluster: return early instead of failing on a missing second dimension
    if embeddings.size == 0:
        return clustered_comments

    # Normalize embeddings to unit length
    faiss.normalize_L2(embeddings)

    # Square-root heuristic for the number of clusters, with at least 1 cluster
    n_clusters = max(1, int(math.sqrt(len(embeddings))))

    dim = embeddings.shape[1]
    # The index is handed to train() empty: training resets it and leaves it holding
    # the centroids, so adding the data vectors beforehand would only be wasted work.
    flat_index = faiss.IndexFlatIP(dim)

    # Perform clustering
    clustering = faiss.Clustering(dim, n_clusters)
    clustering.train(embeddings, flat_index)

    # Assign comments to clusters: the nearest entry of the trained index is the comment's centroid
    _, cluster_indices = flat_index.search(embeddings, 1)
    for i, cluster_idx in enumerate(cluster_indices.flatten()):
        # int() turns the numpy integer into a plain int so the keys are JSON-serialisable
        clustered_comments[int(cluster_idx)].append(video_comments[i])

    return clustered_comments


In [9]:
def cluster_all_comments(embeddings_dict):
    """
    Clusters all comments, using the square root of the number of comments as the cluster count.

    The embeddings are copied, L2-normalised and clustered with FAISS k-means on
    an inner-product index; every comment is then assigned to its nearest centroid.

    :param embeddings_dict: Dictionary mapping comments to their embeddings.
    :return: A defaultdict of clusters where keys are cluster indices (plain ints)
             and values are lists of comments. Empty when there are no comments.
    """
    global_clusters = defaultdict(list)

    all_comments = list(embeddings_dict.keys())
    embeddings = np.array([embeddings_dict[comment] for comment in all_comments]).astype("float32")

    # Nothing to cluster: report it and return early instead of failing on a missing second dimension
    if embeddings.size == 0:
        print(f"Total clusters formed: {len(global_clusters)}")
        return global_clusters

    # Normalize embeddings to unit length
    faiss.normalize_L2(embeddings)

    # Square-root heuristic for the number of clusters, with at least 1 cluster
    n_clusters = max(1, int(math.sqrt(len(embeddings))))

    dim = embeddings.shape[1]
    # The index is handed to train() empty: training resets it and leaves it holding
    # the centroids, so adding the data vectors beforehand would only be wasted work.
    flat_index = faiss.IndexFlatIP(dim)

    # Perform global clustering
    clustering = faiss.Clustering(dim, n_clusters)
    clustering.train(embeddings, flat_index)

    # Assign comments to clusters: the nearest entry of the trained index is the comment's centroid
    _, cluster_indices = flat_index.search(embeddings, 1)
    for i, cluster_idx in enumerate(cluster_indices.flatten()):
        # int() turns the numpy integer into a plain int so the keys are JSON-serialisable
        global_clusters[int(cluster_idx)].append(all_comments[i])

    print(f"Total clusters formed: {len(global_clusters)}")
    return global_clusters


In [10]:
def save_classification(classification, save_path="classification.json"):
    """
    Write the classification results to a JSON file (4-space indentation).

    The data is serialised before the file is opened, so a serialisation error
    leaves a previously written file untouched instead of truncating it.

    :param classification: JSON-serialisable object holding the results.
    :param save_path: Path of the JSON file to (over)write.
    """
    payload = json.dumps(classification, indent=4)
    with open(save_path, "w") as f:
        f.write(payload)

In [11]:
# Make sure an OpenAI API key is available without hardcoding it in the notebook:
# take it from the environment and, if it is not set, ask for it interactively
# without echoing the input.
# NOTE: the key that was previously committed in this cell must be treated as
# compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai_api_key = os.environ["OPENAI_API_KEY"]

# Initialize the LangChain ChatOpenAI wrapper with the model and API key
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai_api_key)

# Define the classification function for a single representative comment
def classify_representative(representative_comment):
    """
    Ask the LLM to label one representative comment.

    :param representative_comment: The text of the representative comment.
    :return: The model's reply with surrounding whitespace removed (the prompt asks for
             the format "Main Category - Subcategory"), or "Error - Classification Failed"
             if the LLM call raises.
    """
    # System prompt: the taxonomy and the expected answer format
    system_message = """You are a helpful assistant. Your task is to classify YouTube music video comments into one of two main categories: Useful or Useless.
    Each category has specific subcategories:

    1. Useful Comments
       - Emotional Connection
       - Technical Appreciation
       - Artist Appreciation

    2. Useless Comments
       - Ambiguous Comments
       - Lightly Relevant Comments
       - Irrelevant Comments

    Please provide the main category and the subcategory for each comment in the format: "Main Category - Subcategory".
    """

    # The conversation is built outside the try block, so a non-string comment still raises here
    user_text = representative_comment.strip()
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_text},
    ]

    try:
        reply = llm.invoke(conversation)
        return reply.content.strip()
    except Exception as err:
        # Best effort: report the failure and return a sentinel label instead of raising
        print(f"Error classifying comment: {representative_comment}")
        print(f"Error: {err}")
        return "Error - Classification Failed"

  llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai_api_key)


In [12]:
def find_centroid_representative(cluster_embeddings, comments):
    """
    Pick the comment whose embedding lies closest to the mean embedding of its cluster.

    :param cluster_embeddings: The embeddings of the comments in the cluster, one row per comment.
    :param comments: The comments, in the same order as the rows of ``cluster_embeddings``.
    :return: The comment with the smallest Euclidean distance to the centroid
             (the first one if several are equally close).
    """
    # Mean vector of the cluster
    center = np.mean(cluster_embeddings, axis=0)

    # Row index with the smallest Euclidean distance to that mean
    nearest = np.argmin(np.linalg.norm(cluster_embeddings - center, axis=1))

    return comments[nearest]

def classify_clusters(global_clusters, embeddings_dict):
    """
    Label every comment with the classification of its cluster's representative.

    For each cluster the comment nearest to the centroid is classified once, and
    that label is copied to all comments in the cluster. The accumulated results
    are written out via ``save_classification`` after every cluster.

    :param global_clusters: A dictionary with cluster IDs as keys and lists of comments as values.
    :param embeddings_dict: A dictionary with comments as keys and their embeddings as values.
    :return: A dictionary with comments as keys and their classification as values.
    """
    labels_by_comment = {}
    for members in global_clusters.values():
        # Embeddings of this cluster, in the same order as its comments
        member_vectors = np.array([embeddings_dict[member] for member in members])

        # One classification per cluster, based on its representative comment
        representative = find_centroid_representative(member_vectors, members)
        label = classify_representative(representative)

        # Every comment in the cluster inherits the representative's label
        labels_by_comment.update(dict.fromkeys(members, label))

        # Persist the results gathered so far
        save_classification(labels_by_comment)

    return labels_by_comment


In [13]:
def map_comments_to_classification(global_clusters, classified_clusters, embeddings_dict, output_file="mapped_comments.json"):
    """
    Maps each original comment to its classification and saves the mapping as JSON.

    Besides the originally documented inputs (clusters holding integer indices,
    classifications stored as ``{"classification": label}`` per cluster ID), the
    structures produced elsewhere in this notebook are accepted as well: clusters
    holding the comment text itself, and classifications given as plain label
    strings keyed by cluster ID or by comment.

    :param global_clusters: A dictionary where keys are cluster IDs and values are lists of
                            comment indices (positions in ``embeddings_dict``) or comment strings.
    :param classified_clusters: A dictionary keyed by cluster ID (or by comment) whose values are
                                either ``{"classification": label}`` or the label string itself.
    :param embeddings_dict: A dictionary of comments (keys) and their embeddings (values).
    :param output_file: Path to the output JSON file where the mappings will be saved.
    """
    def _label_of(entry):
        # A label is stored either as {"classification": ...} or as the bare label string.
        if isinstance(entry, dict):
            return entry.get("classification", "Unclassified")
        return "Unclassified" if entry is None else entry

    # Comments in insertion order, used to resolve integer indices
    comments = list(embeddings_dict.keys())

    # Dictionary to store the final classification of each original comment
    mapped_comments = {}

    for cluster_id, members in global_clusters.items():
        has_cluster_label = cluster_id in classified_clusters
        cluster_classification = _label_of(classified_clusters.get(cluster_id))

        for member in members:
            # A member is either the comment text itself or an index into `comments`
            original_comment = member if isinstance(member, str) else comments[member]

            # Without a cluster-level entry, fall back to a per-comment entry if one exists
            if not has_cluster_label and original_comment in classified_clusters:
                label = _label_of(classified_clusters[original_comment])
            else:
                label = cluster_classification

            mapped_comments[original_comment] = {
                "classification": label
            }

    # Save the mapped comments to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(mapped_comments, f, indent=2)

    print(f"Mapped comments with classifications have been saved to {output_file}")


In [14]:
def process_video_comments(video_comments, embed_model, output_file="classified_comments.json"):
    """
    Run the whole pipeline: filter, embed, cluster, classify, and save the result.

    :param video_comments: Dictionary mapping a video ID to its list of comments.
    :param embed_model: Embedding model handed to ``compute_and_store_embeddings``.
    :param output_file: Path of the JSON file that receives the comment classifications.
    """
    # Step 0: drop empty or irrelevant comments
    print("Filtering comments...")
    kept_comments = filter_comments(video_comments)

    # Step 1: embed the remaining comments (progress is stored by the helper)
    print("Computing embeddings...")
    comment_embeddings = compute_and_store_embeddings(kept_comments, embed_model)

    # Step 2: cluster all comments together
    print("Clustering comments...")
    clusters = cluster_all_comments(comment_embeddings)

    # Step 3: classify each cluster through its representative comment
    print("Classifying clusters...")
    labels_by_comment = classify_clusters(clusters, comment_embeddings)

    # Step 4: write the final comment-to-classification mapping
    print("Saving results...")
    with open(output_file, "w") as out_file:
        json.dump(labels_by_comment, out_file, indent=2)
    print(f"Final results saved to {output_file}")


In [15]:
# Load the sampled comments. The encoding is given explicitly so that reading
# non-ASCII comments does not depend on the platform's default encoding.
with open("RandomSample_500.json", "r", encoding="utf-8") as f:
    video_comments = json.load(f)

# Run the full pipeline: filter, embed, cluster, classify, save
process_video_comments(video_comments, embed_model)

Filtering comments...
Computing embeddings...
Total comments to process: 500, already processed: 0
Saved updated embeddings to embeddings_progress.pkl
Embedding computation complete. Total embeddings: 498
Clustering comments...
Total clusters formed: 22
Classifying clusters...
Saving results...
Final results saved to classified_comments.json
