Testing Clustering Performance using synthetic numeric data

**Silhouette Score:**
- Range: Values range from -1 to +1.
- +1: Indicates the data point is well-matched to its own cluster and poorly matched to neighboring clusters.
- 0: Suggests the data point is on or very close to the decision boundary between two neighboring clusters.
- -1: Suggests the data point has likely been assigned to the wrong cluster (it is closer to a neighboring cluster than to its own).



**Davies-Bouldin Index:**
- Range: Values range from 0 to infinity; lower values indicate better clustering.
- 0: Represents the ideal scenario with perfectly separated and dissimilar clusters.

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Parameters for the synthetic dataset
num_samples = 500   # Total samples
num_features = 10   # Number of features (dimensionality of each sample)
num_clusters = 5    # Number of clusters

# Generate synthetic data: X is the feature matrix, y the index of the blob
# each sample was drawn from.
X, y = make_blobs(n_samples=num_samples, centers=num_clusters, n_features=num_features, random_state=42)

# Build the DataFrame exactly once. Re-creating it afterwards would silently
# drop 'true_cluster', and storing y under 'cluster' would confuse the ground
# truth with the predicted labels that are written to 'cluster' later on.
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(num_features)])
df['true_cluster'] = y  # Ground-truth blob label, kept for validation

In [None]:
# Fit KMeans on the raw feature matrix and store each sample's assigned cluster.
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(X)
df['cluster'] = kmeans.labels_

# Step 3: Evaluate clustering performance
silhouette_avg, davies_bouldin = (
    silhouette_score(X, df['cluster']),
    davies_bouldin_score(X, df['cluster']),
)

print(f'Silhouette Score: {silhouette_avg}')
print(f'Davies-Bouldin Index: {davies_bouldin}')

Silhouette Score: 0.7593905670910638
Davies-Bouldin Index: 0.34583710789535893


Testing Clustering Performance using textual synthetic data

In [None]:
!pip install pandas numpy openai



In [None]:
pip install openai==0.28



In [None]:
import openai
import pandas as pd
import numpy as np

# Set your OpenAI API key via the OPENAI_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook.
# Falls back to an empty string when the variable is not set.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [None]:
def generate_synthetic_texts(num_samples, topics):
    """Generate synthetic customer-issue texts, split evenly across topics.

    One chat-completion request (model 'gpt-4o-mini') is sent per text.

    Args:
        num_samples: Total number of texts requested. The count per topic is
            ``num_samples // len(topics)``; any remainder is dropped, so the
            result holds ``len(topics) * (num_samples // len(topics))`` texts.
        topics: Sequence of topic names inserted into the prompt.

    Returns:
        list[str]: Whitespace-stripped texts, grouped by topic in the order
        given by ``topics``. Empty when ``topics`` is empty.
    """
    # Without this guard an empty topic list raises ZeroDivisionError below.
    if len(topics) == 0:
        return []

    samples_per_topic = num_samples // len(topics)
    texts = []

    for topic in topics:
        # The prompt only depends on the topic, so build it once per topic.
        prompt = f"Generate a short text related to the topic '{topic}'. Generate the text in such a way that a customer is describing about an issue that is related to that topic."
        for _ in range(samples_per_topic):
            response = openai.ChatCompletion.create(
                model='gpt-4o-mini',
                messages=[{"role": "user", "content": prompt}]
            )
            texts.append(response.choices[0].message['content'].strip())

    return texts

In [None]:
# Issue categories the synthetic customer messages are generated for.
topics = [
    'Software Upgrade Assistance',
    'Database and Report Issues',
    'Technical Issue',
    'Email Delivery Problem',
    'Financial Reporting Discrepancies',
]
num_samples = 50

# Generate synthetic texts
synthetic_texts = generate_synthetic_texts(num_samples=num_samples, topics=topics)

In [None]:
def get_embeddings(texts, model="text-embedding-3-small"):
    """Embed a batch of texts with OpenAI's embedding endpoint.

    Args:
        texts: List of strings, sent to the API in a single request.
        model: Name of the embedding model. Defaults to
            "text-embedding-3-small", the model this function previously
            hard-coded.

    Returns:
        numpy.ndarray built from the 'embedding' entry of every item in the
        response's 'data' list, in the order the response lists them.
    """
    # Use OpenAI's API to get embeddings
    responses = openai.Embedding.create(
        model=model,
        input=texts
    )
    return np.array([item['embedding'] for item in responses['data']])

In [None]:
# The texts are grouped by topic, so the ground-truth label is each topic
# index repeated once per generated sample of that topic.
df = pd.DataFrame({'text': synthetic_texts})
df['true_cluster'] = np.repeat(np.arange(len(topics)), num_samples // len(topics))

embeddings = get_embeddings(list(df['text']))

In [None]:
# Ask KMeans for one cluster per topic and keep the fitted model around.
num_clusters = len(topics)
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
df['cluster'] = kmeans.labels_

In [None]:
# Score the predicted clusters directly in embedding space.
silhouette_avg, davies_bouldin = (
    silhouette_score(embeddings, df['cluster']),
    davies_bouldin_score(embeddings, df['cluster']),
)

print(f'Silhouette Score: {silhouette_avg}')
print(f'Davies-Bouldin Index: {davies_bouldin}')

Silhouette Score: 0.4399515106478122
Davies-Bouldin Index: 1.1030071312740273


In [None]:
import openai
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it in the notebook; falls back to an empty string when unset.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY', '')

def generate_synthetic_texts(num_samples, topics):
    """Generate synthetic customer-issue texts, split evenly across topics.

    One chat-completion request (model 'gpt-4o-mini') is sent per text.

    Args:
        num_samples: Total number of texts requested. The count per topic is
            ``num_samples // len(topics)``; any remainder is dropped, so the
            result holds ``len(topics) * (num_samples // len(topics))`` texts.
        topics: Sequence of topic names inserted into the prompt.

    Returns:
        list[str]: Whitespace-stripped texts, grouped by topic in the order
        given by ``topics``. Empty when ``topics`` is empty.
    """
    # Without this guard an empty topic list raises ZeroDivisionError below.
    if len(topics) == 0:
        return []

    samples_per_topic = num_samples // len(topics)
    texts = []

    for topic in topics:
        # The prompt only depends on the topic, so build it once per topic.
        prompt = f"Generate a short text related to the topic '{topic}'. Generate the text in such a way that a customer is describing about an issue that is related to that topic."
        for _ in range(samples_per_topic):
            response = openai.ChatCompletion.create(
                model='gpt-4o-mini',
                messages=[{"role": "user", "content": prompt}]
            )
            texts.append(response.choices[0].message['content'].strip())

    return texts

def get_embeddings(texts, model="text-embedding-3-small"):
    """Embed a batch of texts with OpenAI's embedding endpoint.

    Args:
        texts: List of strings, sent to the API in a single request.
        model: Name of the embedding model to use. Defaults to
            "text-embedding-3-small", the model this function previously
            hard-coded.

    Returns:
        numpy.ndarray built from the 'embedding' entry of every item in the
        response's 'data' list, in the order the response lists them.
    """
    # Use OpenAI's API to get embeddings
    responses = openai.Embedding.create(
        model=model,  # Use the desired embedding model
        input=texts
    )
    return np.array([item['embedding'] for item in responses['data']])

# Define Topics and Generate Data
topics = [
    'Software Upgrade Assistance',
    'Database and Report Issues',
    'Technical Issue',
    'Email Delivery Problem',
    'Financial Reporting Discrepancies',
]
num_samples = 50

# Generate synthetic texts
synthetic_texts = generate_synthetic_texts(num_samples=num_samples, topics=topics)

# Create DataFrame; the texts are grouped by topic, so the ground-truth label
# is each topic index repeated once per generated sample.
df = pd.DataFrame({'text': synthetic_texts})
df['true_cluster'] = np.repeat(np.arange(len(topics)), num_samples // len(topics))

# Get embeddings for the text data
embeddings = get_embeddings(list(df['text']))

# Cluster the Data Using KMeans (one cluster per topic)
num_clusters = len(topics)
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
df['cluster'] = kmeans.labels_

# Evaluate Clustering Performance
silhouette_avg, davies_bouldin = (
    silhouette_score(embeddings, df['cluster']),
    davies_bouldin_score(embeddings, df['cluster']),
)

print(f'Silhouette Score: {silhouette_avg}')
print(f'Davies-Bouldin Index: {davies_bouldin}')

Silhouette Score: 0.4273070698254832
Davies-Bouldin Index: 1.1459411652485025
                                                text  true_cluster  cluster
0  Customer support plays a vital role in enhanci...             0        2
1  Customer support is an essential component of ...             0        2
2  Customer support is the backbone of any succes...             0        2
3  Customer support plays a crucial role in ensur...             0        2
4  Customer support is crucial for ensuring the s...             0        2


In [None]:
import openai
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it in the notebook; falls back to an empty string when unset.
import os

openai.api_key = os.environ.get('OPENAI_API_KEY', '')


# Function to generate synthetic text data
def generate_synthetic_texts(num_samples, topics):
    """Generate synthetic customer-issue texts, split evenly across topics.

    One chat-completion request (model 'gpt-4o-mini') is sent per text.

    Args:
        num_samples: Total number of texts requested. The count per topic is
            ``num_samples // len(topics)``; any remainder is dropped, so the
            result holds ``len(topics) * (num_samples // len(topics))`` texts.
        topics: Sequence of topic names inserted into the prompt.

    Returns:
        list[str]: Whitespace-stripped texts, grouped by topic in the order
        given by ``topics``. Empty when ``topics`` is empty.
    """
    # Without this guard an empty topic list raises ZeroDivisionError below.
    if len(topics) == 0:
        return []

    samples_per_topic = num_samples // len(topics)
    texts = []

    for topic in topics:
        # The prompt only depends on the topic, so build it once per topic.
        prompt = f"Generate a short text related to the topic '{topic}'. Generate the text in such a way that a customer is describing about an issue that is related to that topic."
        for _ in range(samples_per_topic):
            response = openai.ChatCompletion.create(
                model='gpt-4o-mini',
                messages=[{"role": "user", "content": prompt}]
            )
            texts.append(response.choices[0].message['content'].strip())

    return texts

# Function to get embeddings from OpenAI
def get_openai_embeddings(texts, model="text-embedding-3-small"):
    """Embed ``texts`` with the given OpenAI embedding model.

    All texts are sent in a single request; the 'embedding' entry of each item
    in the response's 'data' list is collected into a NumPy array.
    """
    api_result = openai.Embedding.create(input=texts, model=model)
    vectors = []
    for item in api_result['data']:
        vectors.append(item['embedding'])
    return np.array(vectors)

# Function to evaluate clustering
def evaluate_clustering(embeddings, num_clusters):
    """Cluster ``embeddings`` with KMeans and score the resulting partition.

    Returns:
        tuple: ``(silhouette score, Davies-Bouldin index)`` computed on the
        same embeddings the model was fitted on.
    """
    model = KMeans(n_clusters=num_clusters, random_state=42)
    assignments = model.fit_predict(embeddings)
    return (
        silhouette_score(embeddings, assignments),
        davies_bouldin_score(embeddings, assignments),
    )

topics = ['Software Upgrade Assistance', 'Database and Report Issues', 'Technical Issue', 'Email Delivery Problem', 'Financial Reporting Discrepancies']
num_samples = 50
synthetic_texts = generate_synthetic_texts(num_samples, topics)

# Bag of Words Embedding
bow_vectorizer = CountVectorizer(stop_words='english')
bow_embeddings = bow_vectorizer.fit_transform(synthetic_texts).toarray()

# TF-IDF Embedding
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_embeddings = tfidf_vectorizer.fit_transform(synthetic_texts).toarray()

# OpenAI Embeddings (text-embedding-3-small / -3-large / -ada-002)
openai_embeddings_small = get_openai_embeddings(synthetic_texts, model="text-embedding-3-small")
openai_embeddings_large = get_openai_embeddings(synthetic_texts, model="text-embedding-3-large")
openai_embeddings_ada = get_openai_embeddings(synthetic_texts, model="text-embedding-ada-002")

# Display name -> embedding matrix, in the order the rows should be reported.
embeddings_by_method = {
    'Bag of Words': bow_embeddings,
    'TF-IDF': tfidf_embeddings,
    'OpenAI text-embedding-3-small': openai_embeddings_small,
    'OpenAI text-embedding-3-large': openai_embeddings_large,
    'OpenAI text-embedding-ada-002': openai_embeddings_ada,
}

# Score every representation with the same KMeans setup. Rows are collected
# as plain dicts and the DataFrame is built once, instead of growing it with
# repeated pd.concat calls.
result_rows = []
for method_name, method_embeddings in embeddings_by_method.items():
    silhouette_avg, davies_bouldin = evaluate_clustering(method_embeddings, len(topics))
    result_rows.append({'Embedding Method': method_name,
                        'Silhouette Score': silhouette_avg,
                        'Davies-Bouldin Index': davies_bouldin})

# Store results in a DataFrame
results = pd.DataFrame(result_rows, columns=['Embedding Method', 'Silhouette Score', 'Davies-Bouldin Index'])

# Display the results
print(results)

                Embedding Method  Silhouette Score  Davies-Bouldin Index
0                   Bag of Words          0.189550              1.669081
1                         TF-IDF          0.133768              2.008565
2  OpenAI text-embedding-3-small          0.323370              1.149484
3  OpenAI text-embedding-3-large          0.372534              1.034745
4  OpenAI text-embedding-ada-002          0.322881              1.143056


In [None]:
def append_results(algorithm, silhouette_avg, davies_bouldin):
    """Record one algorithm's scores in the global ``results`` DataFrame.

    Re-running an evaluation replaces that algorithm's existing row instead of
    appending a duplicate, so ``results`` does not keep growing when the
    evaluation cell is executed more than once. If ``results`` has not been
    defined yet it is created here.

    Args:
        algorithm: Display name of the clustering algorithm (used as the key).
        silhouette_avg: Silhouette score to store.
        davies_bouldin: Davies-Bouldin index to store.
    """
    global results
    new_row = {
        'Algorithm': algorithm,
        'Silhouette Score': silhouette_avg,
        'Davies-Bouldin Index': davies_bouldin
    }
    new_frame = pd.DataFrame(new_row, index=[0])

    try:
        previous = results
    except NameError:
        # No results table exists yet: start one instead of crashing.
        previous = None

    if previous is None:
        results = new_frame
    else:
        # Drop a stale row for the same algorithm. Rows without an
        # 'Algorithm' value (other result schemas) are left untouched.
        if 'Algorithm' in previous.columns:
            previous = previous[previous['Algorithm'] != algorithm]
        results = pd.concat([previous, new_frame], ignore_index=True)  # Use concat instead of append
    print(new_row)  # Print the results for the individual algorithm

In [None]:
def evaluate_kmeans(embeddings, num_clusters):
    """Cluster ``embeddings`` with KMeans and record both scores as 'KMeans'."""
    from sklearn.cluster import KMeans
    model = KMeans(random_state=42, n_clusters=num_clusters)
    assignments = model.fit_predict(embeddings)
    append_results(
        'KMeans',
        silhouette_score(embeddings, assignments),
        davies_bouldin_score(embeddings, assignments),
    )

def evaluate_dbscan(embeddings, eps=0.5, min_samples=5):
    """Cluster ``embeddings`` with DBSCAN and record both scores as 'DBSCAN'.

    Points labelled -1 (noise) are excluded before scoring; otherwise the
    metrics would treat the noise points as one more cluster. When fewer than
    two real clusters remain, or every remaining point is its own cluster, the
    scores are undefined and NaN is recorded. The previous -1 placeholder is
    not a possible Davies-Bouldin value and would have ranked as "best".

    Args:
        embeddings: Array-like of shape (n_samples, n_features).
        eps: DBSCAN neighbourhood radius (default 0.5, the former constant).
        min_samples: DBSCAN core-point threshold (default 5, the former constant).
    """
    import numpy as np
    from sklearn.cluster import DBSCAN

    points = np.asarray(embeddings)
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = np.asarray(dbscan.fit_predict(points))

    clustered = labels != -1  # Exclude noise
    num_clusters = len(set(labels[clustered].tolist()))
    num_clustered = int(clustered.sum())

    if 1 < num_clusters < num_clustered:
        silhouette_avg = silhouette_score(points[clustered], labels[clustered])
        davies_bouldin = davies_bouldin_score(points[clustered], labels[clustered])
    else:
        silhouette_avg = davies_bouldin = float('nan')
    append_results('DBSCAN', silhouette_avg, davies_bouldin)

def evaluate_agglomerative(embeddings, num_clusters):
    """Run agglomerative clustering and record both scores as 'Agglomerative'."""
    from sklearn.cluster import AgglomerativeClustering
    assignments = AgglomerativeClustering(n_clusters=num_clusters).fit_predict(embeddings)
    append_results(
        'Agglomerative',
        silhouette_score(embeddings, assignments),
        davies_bouldin_score(embeddings, assignments),
    )

def evaluate_gmm(embeddings, num_clusters, random_state=42):
    """Fit a Gaussian mixture and record both scores as 'Gaussian Mixture'.

    Args:
        embeddings: Array-like of shape (n_samples, n_features).
        num_clusters: Number of mixture components.
        random_state: Seed passed to GaussianMixture so repeated runs give the
            same labels. Defaults to 42, the seed the KMeans evaluation uses.
    """
    from sklearn.mixture import GaussianMixture
    # Without a fixed seed the scores change from one execution to the next.
    gmm = GaussianMixture(n_components=num_clusters, random_state=random_state)
    labels = gmm.fit_predict(embeddings)
    silhouette_avg = silhouette_score(embeddings, labels)
    davies_bouldin = davies_bouldin_score(embeddings, labels)
    append_results('Gaussian Mixture', silhouette_avg, davies_bouldin)

def evaluate_affinity(embeddings, random_state=42):
    """Run affinity propagation and record both scores as 'Affinity Propagation'.

    The number of clusters is chosen by the algorithm itself. If the labeling
    is degenerate (fewer than two distinct labels, or every sample in its own
    cluster) the scores are undefined and NaN is recorded instead of letting
    the metric functions raise.

    Args:
        embeddings: Array-like of shape (n_samples, n_features).
        random_state: Seed passed to AffinityPropagation for reproducible
            runs. Defaults to 42, the seed the KMeans evaluation uses.
    """
    from sklearn.cluster import AffinityPropagation
    affinity = AffinityPropagation(random_state=random_state)
    labels = affinity.fit_predict(embeddings)

    num_clusters = len(set(labels))
    if 1 < num_clusters < len(labels):
        silhouette_avg = silhouette_score(embeddings, labels)
        davies_bouldin = davies_bouldin_score(embeddings, labels)
    else:
        silhouette_avg = davies_bouldin = float('nan')
    append_results('Affinity Propagation', silhouette_avg, davies_bouldin)

def evaluate_spectral(embeddings, num_clusters, random_state=42):
    """Run spectral clustering and record both scores as 'Spectral Clustering'.

    Uses a nearest-neighbours affinity graph.

    Args:
        embeddings: Array-like of shape (n_samples, n_features).
        num_clusters: Number of clusters to extract.
        random_state: Seed passed to SpectralClustering so repeated runs give
            the same labels. Defaults to 42, the seed the KMeans evaluation uses.
    """
    from sklearn.cluster import SpectralClustering
    spectral = SpectralClustering(
        n_clusters=num_clusters,
        affinity='nearest_neighbors',
        random_state=random_state,
    )
    labels = spectral.fit_predict(embeddings)
    silhouette_avg = silhouette_score(embeddings, labels)
    davies_bouldin = davies_bouldin_score(embeddings, labels)
    append_results('Spectral Clustering', silhouette_avg, davies_bouldin)

In [None]:
# Evaluate each algorithm on the same embeddings. Algorithms that need a
# cluster count receive one cluster per topic; DBSCAN and affinity propagation
# determine the number of clusters themselves.

num_topics = len(topics)

evaluate_kmeans(embeddings, num_topics)
evaluate_dbscan(embeddings)
evaluate_agglomerative(embeddings, num_topics)
evaluate_gmm(embeddings, num_topics)
evaluate_affinity(embeddings)
evaluate_spectral(embeddings, num_topics)

{'Algorithm': 'KMeans', 'Silhouette Score': 0.4273070698254832, 'Davies-Bouldin Index': 1.1459411652485025}
{'Algorithm': 'DBSCAN', 'Silhouette Score': 0.2695234602830947, 'Davies-Bouldin Index': 1.4207716851993024}
{'Algorithm': 'Agglomerative', 'Silhouette Score': 0.4273070698254832, 'Davies-Bouldin Index': 1.1459411652485023}
{'Algorithm': 'Gaussian Mixture', 'Silhouette Score': 0.4273070698254832, 'Davies-Bouldin Index': 1.1459411652485023}
{'Algorithm': 'Affinity Propagation', 'Silhouette Score': 0.45128870630634693, 'Davies-Bouldin Index': 1.0313267068679823}
{'Algorithm': 'Spectral Clustering', 'Silhouette Score': 0.4102546921658252, 'Davies-Bouldin Index': 1.176047817519145}
Final Results:
               Algorithm  Silhouette Score  Davies-Bouldin Index
0                 kmeans          0.427307              1.145941
1                 dbscan          0.269523              1.420772
2          agglomerative          0.427307              1.145941
3                    gmm         

In [None]:
# Display the results DataFrame under its heading
print("Final Results:", results, sep="\n")

Final Results:
               Algorithm  Silhouette Score  Davies-Bouldin Index
0                 kmeans          0.427307              1.145941
1                 dbscan          0.269523              1.420772
2          agglomerative          0.427307              1.145941
3                    gmm          0.410255              1.176048
4               affinity          0.451289              1.031327
5                 KMeans          0.427307              1.145941
6                 DBSCAN          0.269523              1.420772
7          Agglomerative          0.427307              1.145941
8       Gaussian Mixture          0.427307              1.145941
9   Affinity Propagation          0.451289              1.031327
10                KMeans          0.427307              1.145941
11                DBSCAN          0.269523              1.420772
12         Agglomerative          0.427307              1.145941
13      Gaussian Mixture          0.408802              1.167911
14  Affini