<a href="https://colab.research.google.com/github/razvanantonescu/seo-keyword-clustering/blob/main/Keyword_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import csv
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Read keywords from text file (one keyword per line).
# The encoding is explicit so the file loads the same way on every platform
# instead of depending on the locale default; "utf-8-sig" additionally drops a
# leading byte-order mark so it does not end up glued to the first keyword.
with open("keywords.txt", "r", encoding="utf-8-sig") as f:
    # Trim surrounding whitespace and skip blank lines: an empty keyword has no
    # terms to vectorize and carries no information for clustering.
    keywords = [line.strip() for line in f.read().splitlines() if line.strip()]

# Fail early with a clear message instead of deep inside the vectorizer.
if not keywords:
    raise ValueError("keywords.txt contains no keywords (expected one keyword per line)")

# Create a Tf-idf representation of the keywords
vectorizer = TfidfVectorizer(stop_words='english') # Adding stop words can improve topic quality
X = vectorizer.fit_transform(keywords)

# Perform Affinity Propagation clustering.
# A fixed random_state keeps the clustering reproducible between runs.
af = AffinityPropagation(random_state=42).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

# Get the number of clusters found
n_clusters = len(cluster_centers_indices)

# scikit-learn documents that a non-converged fit may report no exemplars and
# label every sample -1. Grouping would then fail with an opaque IndexError (or
# silently file keywords under the last cluster), so stop with a clear message.
if n_clusters == 0 or (labels < 0).any():
    raise RuntimeError(
        "Affinity Propagation did not converge, so no usable clusters were found. "
        "Try increasing max_iter or damping, or adjusting preference."
    )

# Group keywords by cluster: clustered_keywords[k] holds the keywords labelled k,
# in their original order.
clustered_keywords = [[] for _ in range(n_clusters)]
for keyword, label in zip(keywords, labels):
    clustered_keywords[label].append(keyword)

# Apply NMF to each cluster to get topic names.
# topic_names[i] is the display name for cluster i: its top-weighted words, or a
# placeholder when the cluster has no keywords or no usable words.
n_top_words = 3  # You can adjust this to get more or fewer words for the topic name
topic_names = []
for i in range(n_clusters):
    if not clustered_keywords[i]:
        topic_names.append(f"Cluster {i+1} (No keywords)") # Handle clusters with no keywords
        continue

    cluster_text = [" ".join(clustered_keywords[i])] # Combine keywords in a cluster into a single "document"
    cluster_vectorizer = TfidfVectorizer(stop_words='english')
    try:
        cluster_tfidf = cluster_vectorizer.fit_transform(cluster_text)
    except ValueError:
        # The vectorizer raises ValueError when the vocabulary is empty (e.g. the
        # cluster contains only stop words); treat that as "no words to model"
        # instead of aborting the whole run.
        cluster_tfidf = None

    if cluster_tfidf is None or cluster_tfidf.shape[1] == 0: # Ensure there are words to model
        topic_names.append(f"Cluster {i+1} (Empty)") # Handle empty clusters
        continue

    # n_components=1 to get one topic per cluster
    nmf = NMF(n_components=1, random_state=42, max_iter=300, alpha_W=0.00001, l1_ratio=0.5).fit(cluster_tfidf)
    feature_names = cluster_vectorizer.get_feature_names_out()
    # argsort is ascending, so the reversed tail slice yields the n_top_words
    # highest-weighted features, strongest first (fewer if the cluster has fewer words).
    top_words_indices = nmf.components_[0].argsort()[:-n_top_words - 1:-1]
    topic_words = [feature_names[word_idx] for word_idx in top_words_indices]
    topic_names.append(" ".join(topic_words))


# Write the clusters to a csv file with meaningful names: one row per keyword,
# preceded by a header row. The encoding is explicit so non-ASCII keywords are
# written identically regardless of the platform's default encoding.
with open("clusters_with_topics.csv", "w", newline="", encoding="utf-8") as f: # Keep the file open throughout writing
    writer = csv.writer(f)
    writer.writerow(["Cluster Name", "Keyword"])
    # clustered_keywords already holds each cluster's keywords in input order,
    # so there is no need to rescan labels once per cluster.
    for topic_name, cluster_keywords in zip(topic_names, clustered_keywords):
        if cluster_keywords:
            writer.writerows([topic_name, keyword] for keyword in cluster_keywords)
        else:
            writer.writerow([topic_name, "No keywords in this cluster"])
print("Keyword clustering with topic names complete! Check 'clusters_with_topics.csv'")



Keyword clustering with topic names complete! Check 'clusters_with_topics.csv'




In [None]:
!pip install scikit-learn