In [2]:
!pip install llm

Collecting llm
  Downloading llm-0.18-py3-none-any.whl.metadata (6.6 kB)
Collecting click-default-group>=1.2.3 (from llm)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting sqlite-utils>=3.37 (from llm)
  Downloading sqlite_utils-3.37-py3-none-any.whl.metadata (7.6 kB)
Collecting sqlite-migrate>=0.1a2 (from llm)
  Downloading sqlite_migrate-0.1b0-py3-none-any.whl.metadata (5.4 kB)
Collecting python-ulid (from llm)
  Downloading python_ulid-3.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting puremagic (from llm)
  Downloading puremagic-1.28-py3-none-any.whl.metadata (5.8 kB)
Collecting sqlite-fts4 (from sqlite-utils>=3.37->llm)
  Downloading sqlite_fts4-1.0.3-py3-none-any.whl.metadata (6.6 kB)
Downloading llm-0.18-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click_default_group-1.2.4-py2.py3-none-any.whl (4.1 kB)
Downloading sqlite_mi

In [3]:
import click
import json
import llm
import numpy as np
import sklearn.cluster
import sqlite_utils
import textwrap

# System prompt used for cluster summaries when no --prompt option is supplied.
DEFAULT_SUMMARY_PROMPT = "Short, concise title for this cluster of related documents."


@llm.hookimpl
def register_commands(cli):
    """
    Plugin hook that adds a ``cluster`` sub-command to the given click group.

    The command reads every stored embedding of one collection from a SQLite
    database, groups them with MiniBatchKMeans and echoes the clusters as JSON,
    optionally followed by a model-generated summary for each cluster.
    """

    @cli.command()
    @click.argument("collection")
    @click.argument("n", type=int)
    @click.option(
        "--truncate",
        type=int,
        default=100,
        help="Truncate content to this many characters - 0 for no truncation",
    )
    @click.option(
        "-d",
        "--database",
        type=click.Path(
            file_okay=True, allow_dash=False, dir_okay=False, writable=True
        ),
        envvar="LLM_EMBEDDINGS_DB",
        help="SQLite database file containing embeddings",
    )
    @click.option(
        "--summary", is_flag=True, help="Generate summary title for each cluster"
    )
    @click.option("-m", "--model", help="LLM model to use for the summary")
    @click.option("--prompt", help="Custom prompt to use for the summary")
    def cluster(collection, n, truncate, database, summary, model, prompt):
        """
        Generate clusters from embeddings in a collection

        Example usage, to create 10 clusters:

        \b
            llm cluster my_collection 10

        Outputs a JSON array of {"id": "cluster_id", "items": [list of items]}

        Pass --summary to generate a summary for each cluster, using the default
        language model or the model you specify with --model.
        """
        # Imported lazily so the CLI helpers are only loaded when the command runs.
        from llm.cli import get_default_model, get_key

        if n < 1:
            raise click.ClickException("Number of clusters must be at least 1")

        # Fall back to the embeddings database in the llm user directory.
        if database:
            db = sqlite_utils.Database(database)
        else:
            db = sqlite_utils.Database(llm.user_dir() / "embeddings.db")

        # Each row becomes (id, decoded embedding vector, content).
        rows = [
            (row[0], llm.decode(row[1]), row[2])
            for row in db.execute(
                """
            select id, embedding, content from embeddings
            where collection_id = (
                select id from collections where name = ?
            )
        """,
                [collection],
            ).fetchall()
        ]

        # Fail with a readable message instead of letting the clustering step
        # raise on an empty array or on fewer samples than clusters.
        if not rows:
            raise click.ClickException(
                "No embeddings found for collection: {}".format(collection)
            )
        if n > len(rows):
            raise click.ClickException(
                "Cannot create {} clusters from only {} embeddings".format(
                    n, len(rows)
                )
            )

        clustering_model = sklearn.cluster.MiniBatchKMeans(n_clusters=n, n_init="auto")
        to_cluster = np.array([vector for _, vector, _ in rows])
        clustering_model.fit(to_cluster)
        assignments = clustering_model.labels_

        def truncate_text(text):
            """Return text cut to ``truncate`` characters; None for empty text."""
            if not text:
                return None
            if truncate > 0:
                return text[:truncate]
            return text

        # assignments[i] is the cluster label of rows[i]; group items by label,
        # keeping the order in which each label is first seen.
        clusters = {}
        for (item_id, _, content), label in zip(rows, assignments):
            clusters.setdefault(str(label), []).append(
                {"id": str(item_id), "content": truncate_text(content)}
            )
        # Re-arrange into a list
        output_clusters = [{"id": k, "items": v} for k, v in clusters.items()]

        if not summary:
            click.echo(json.dumps(output_clusters, indent=4))
            return

        summary_model = llm.get_model(model or get_default_model())
        if summary_model.needs_key:
            summary_model.key = get_key(
                "", summary_model.needs_key, summary_model.key_env_var
            )
        system_prompt = prompt or DEFAULT_SUMMARY_PROMPT

        # The JSON array is written piece by piece so each cluster is echoed
        # before the (potentially slow) model call for its summary.
        last_index = len(output_clusters) - 1
        click.echo("[")
        for index, cluster_info in enumerate(output_clusters):
            click.echo("  {")
            click.echo('    "id": {},'.format(json.dumps(cluster_info["id"])))
            click.echo(
                '    "items": '
                + textwrap.indent(
                    json.dumps(cluster_info["items"], indent=2), "    "
                ).lstrip()
                + ","
            )
            prompt_content = "\n".join(
                [item["content"] for item in cluster_info["items"] if item["content"]]
            )
            # Only call the model when the cluster has some text to summarize.
            if prompt_content.strip():
                cluster_summary = summary_model.prompt(
                    prompt_content,
                    system=system_prompt,
                ).text()
            else:
                cluster_summary = None
            click.echo('    "summary": {}'.format(json.dumps(cluster_summary)))
            # No trailing comma after the final object, to keep the JSON valid.
            click.echo("  }" + ("" if index == last_index else ","))
        click.echo("]")

In [8]:
import numpy as np
import llm
from sklearn.cluster import KMeans



# Create a collection and embed the documents.
# NOTE(review): `documents` is not assigned in this cell or in any cell above it
# in the file; it is assigned in a later cell, so this cell depends on kernel
# state and will not survive Restart & Run All as ordered.
collection = llm.Collection("documents", model_id="sentence-transformers/all-MiniLM-L6-v2")
embeddings = []
valid_documents = []

for i, doc in enumerate(documents):
    try:
        # NOTE(review): presumably Collection.embed stores the vector and returns
        # None, in which case nothing is ever appended below - confirm against the
        # llm Python API documentation.
        embedding = collection.embed(f"doc_{i}", doc, store=True)
        if embedding is not None:
            embeddings.append(embedding)
            valid_documents.append(doc)
        else:
            print(f"Warning: Embedding for document {i} is None")
    except Exception as e:
        print(f"Error embedding document {i}: {str(e)}")

# Print the number of embeddings
print(f"Number of embeddings: {len(embeddings)}")

# Convert embeddings to numpy array
if embeddings:
    embeddings_array = np.array(embeddings)
    print(f"Shape of embeddings array: {embeddings_array.shape}")

    # Check for NaN values
    nan_count = np.isnan(embeddings_array).sum()
    print(f"Number of NaN values: {nan_count}")

    # Remove any rows containing NaN values, and drop the matching documents
    # too so that row k of embeddings_array still belongs to valid_documents[k]
    # when labels are zipped with documents further down.
    nan_row_mask = np.isnan(embeddings_array).any(axis=1)
    embeddings_array = embeddings_array[~nan_row_mask]
    valid_documents = [
        kept_doc
        for kept_doc, has_nan in zip(valid_documents, nan_row_mask)
        if not has_nan
    ]
    print(f"Shape after removing NaNs: {embeddings_array.shape}")

    # Check if we have any valid embeddings
    if embeddings_array.size > 0:
        # Perform K-means clustering
        num_clusters = min(3, len(embeddings_array))  # Ensure we don't have more clusters than data points
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(embeddings_array)

        # Print the clustering results
        for i, (doc, label) in enumerate(zip(valid_documents, cluster_labels)):
            print(f"Document {i}: Cluster {label}")
            print(f"Content: {doc}")
            print()

        # Use llm-cluster to generate summaries for each cluster
        try:
            # NOTE(review): no `cluster_embeddings` function is defined in the
            # plugin code shown earlier in this notebook (only a `cluster` CLI
            # command) - verify this import exists; a failure here is reported
            # by the except clause below rather than raised.
            from llm_cluster import cluster_embeddings
            cluster_results = cluster_embeddings(collection, num_clusters, summary=True)

            # Print the cluster summaries
            for cluster in cluster_results:
                print(f"Cluster {cluster['id']}:")
                print(f"Summary: {cluster['summary']}")
                print("Items:")
                for item in cluster['items']:
                    print(f"- {item['content']}")
                print()
        except Exception as e:
            print(f"An error occurred while generating cluster summaries: {str(e)}")
    else:
        print("No valid embeddings after removing NaNs. Cannot perform clustering.")
else:
    print("No valid embeddings. Cannot perform clustering.")

Number of embeddings: 0
No valid embeddings. Cannot perform clustering.


In [9]:
from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' sentence-transformers model directly; the cells
# below call model.encode(...) on it to produce the embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [10]:
# Sanity check: list every document with its index.
# NOTE(review): `documents` is not assigned in any cell above this one in the
# file, so this relies on a value already present in the kernel.
for i, doc in enumerate(documents):
    print(f"Document {i}: {doc}")

Document 0: Machine learning is a subset of artificial intelligence.
Document 1: Natural language processing deals with the interaction between computers and human language.
Document 2: Deep learning uses neural networks with multiple layers.
Document 3: Reinforcement learning is learning what to do to maximize a reward.
Document 4: Computer vision is the field of AI that trains computers to interpret visual information.
Document 5: Clustering is an unsupervised learning technique.
Document 6: Classification is a supervised learning task.
Document 7: Regression predicts continuous values.
Document 8: Neural networks are inspired by the human brain.
Document 9: Support vector machines are used for classification and regression tasks.


In [11]:
# Encode the whole document list in a single call and report the array shape
# (the recorded output shows one 384-dimensional row for each of the 10 documents).
embeddings = model.encode(documents)
print(f"Shape of embeddings: {embeddings.shape}")

Shape of embeddings: (10, 384)


In [12]:
# Encode the documents one at a time so that a failure on a single document is
# reported and skipped instead of aborting the whole run. This rebinds
# `embeddings` to a plain list of per-document vectors.
embeddings = []
for i, doc in enumerate(documents):
    try:
        embedding = model.encode(doc)
    except Exception as e:
        print(f"Error embedding document {i}: {str(e)}")
    else:
        embeddings.append(embedding)

# Stack the per-document vectors into one array.
embeddings_array = np.array(embeddings)

In [13]:
# Encode the documents in slices of `batch_size` and collect every resulting
# vector in one flat list. This overwrites `embeddings_array`.
batch_size = 10
all_embeddings = []
for i in range(0, len(documents), batch_size):
    batch = documents[i : i + batch_size]
    batch_embeddings = model.encode(batch)
    # Add the vectors of this batch one by one, not the batch as a single item.
    all_embeddings += list(batch_embeddings)

embeddings_array = np.array(all_embeddings)

In [15]:
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import csv

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample dataset: ten one-sentence documents defined inline.
# (No CSV file is read in this cell.)
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Natural language processing deals with the interaction between computers and human language.",
    "Deep learning uses neural networks with multiple layers.",
    "Reinforcement learning is learning what to do to maximize a reward.",
    "Computer vision is the field of AI that trains computers to interpret visual information.",
    "Clustering is an unsupervised learning technique.",
    "Classification is a supervised learning task.",
    "Regression predicts continuous values.",
    "Neural networks are inspired by the human brain.",
    "Support vector machines are used for classification and regression tasks."
]

# Generate one embedding per document, in the same order as `documents`
embeddings = model.encode(documents)

# Perform K-means clustering; random_state is fixed so repeated runs use the same seed
num_clusters = 3  # Adjust as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

# Print the clustering results: cluster_labels[i] is the cluster of documents[i]
for i, (doc, label) in enumerate(zip(documents, cluster_labels)):
    print(f"Document {i}: Cluster {label}")
    print(f"Content: {doc[:100]}...")  # First 100 characters; the "..." is appended even when nothing was cut
    print()

# Optional: Generate summaries for each cluster
# This part depends on how you want to summarize the clusters
# You might need to implement a custom summarization method

Document 0: Cluster 2
Content: Machine learning is a subset of artificial intelligence....

Document 1: Cluster 1
Content: Natural language processing deals with the interaction between computers and human language....

Document 2: Cluster 2
Content: Deep learning uses neural networks with multiple layers....

Document 3: Cluster 0
Content: Reinforcement learning is learning what to do to maximize a reward....

Document 4: Cluster 2
Content: Computer vision is the field of AI that trains computers to interpret visual information....

Document 5: Cluster 1
Content: Clustering is an unsupervised learning technique....

Document 6: Cluster 1
Content: Classification is a supervised learning task....

Document 7: Cluster 1
Content: Regression predicts continuous values....

Document 8: Cluster 2
Content: Neural networks are inspired by the human brain....

Document 9: Cluster 1
Content: Support vector machines are used for classification and regression tasks....



# Document Clustering Analysis

This Colab notebook clusters ten short statements about AI and machine learning: each statement is embedded with the `all-MiniLM-L6-v2` sentence-transformers model, and the embeddings are grouped with K-means (k = 3).

## Clustering Results

The documents were clustered into three groups:

### Cluster 0: Reinforcement Learning
- Document 3: Reinforcement learning is learning what to do to maximize a reward.

### Cluster 1: General ML Techniques and Tasks
- Document 1: Natural language processing deals with the interaction between computers and human language.
- Document 5: Clustering is an unsupervised learning technique.
- Document 6: Classification is a supervised learning task.
- Document 7: Regression predicts continuous values.
- Document 9: Support vector machines are used for classification and regression tasks.

### Cluster 2: Neural Networks and AI Subfields
- Document 0: Machine learning is a subset of artificial intelligence.
- Document 2: Deep learning uses neural networks with multiple layers.
- Document 4: Computer vision is the field of AI that trains computers to interpret visual information.
- Document 8: Neural networks are inspired by the human brain.

## Analysis of the Clustering

1. **Cluster 0 (Reinforcement Learning)**:
   - Contains only one document, suggesting that reinforcement learning is semantically distinct from other topics.

2. **Cluster 1 (General ML Techniques and Tasks)**:
   - Groups various machine learning techniques and tasks.
   - Includes both supervised (classification, regression) and unsupervised (clustering) learning methods.
   - Natural language processing is included, possibly due to its frequent use of these techniques.

3. **Cluster 2 (Neural Networks and AI Subfields)**:
   - Focuses on neural networks and broader AI concepts.
   - Includes documents mentioning neural networks, deep learning, and computer vision.
   - The general concept of machine learning as a subset of AI is also included.

## Observations

- The clustering algorithm (K-means with k = 3 and `random_state=42`) grouped these documents by the similarity of their sentence embeddings.
- Reinforcement learning was separated into its own cluster, highlighting its distinctiveness.
- General machine learning techniques and tasks are grouped together in Cluster 1.
- Neural network-related concepts and broader AI subfields are grouped in Cluster 2.

## Conclusion

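This clustering appears reasonable, as it has grouped related concepts together while separating distinct topics. The exact reasons for these groupings depend on the specific embeddings generated for each document and the nature of the clustering algorithm used. With only ten documents and a fixed number of clusters, the result should be read as an illustration rather than a robust finding: a different value of k or a different random seed may produce different groupings.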

