In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import duckdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
import umap
import nltk
import re
from sentence_transformers import SentenceTransformer

# Import from local modules
import sys
sys.path.append('../src')

from cleaning import minimal_clean, clean_text, tokenize, lemmatize_tokens

# Download necessary NLTK resources
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

# Path for 1 million English sample questions
load_dotenv()
sample_path = os.getenv("DATA_SAMPLE")
full_data_path = os.getenv("DATA_PARQUET")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Clustering on question content (English)

### Prepare sample dataset

In [410]:
dataset = pd.read_parquet(sample_path)

# Drop duplicate questions
questions_raw = dataset.drop_duplicates(subset='question_content', keep='first')

# Random sample of rows, reproducible with random_state
questions = questions_raw.sample(n=100000, random_state=42)


In [529]:
questions_raw.shape

(737315, 24)

In [486]:
questions['question_topic']

725489         bird
302816        maize
388150         bean
49371          kale
293923       cattle
            ...    
441257         crop
566298         None
676477         None
787574          pig
656249    vegetable
Name: question_topic, Length: 100000, dtype: object

In [492]:
pd.unique(questions_raw['question_topic'])

array([None, 'plant', 'maize', 'cattle', 'bean', 'goat', 'duck',
       'pumpkin', 'animal', 'grass', 'chicken', 'rabbit', 'tomato',
       'melon', 'peanut', 'crop', 'coffee', 'paw-paw', 'wheat', 'millet',
       'poultry', 'onion', 'potato', 'apple', 'pig', 'bird', 'eucalyptus',
       'fish', 'aubergine', 'kale', 'livestock', 'dog', 'pigeon',
       'cabbage', 'banana', 'french-bean', 'sheep', 'avocado', 'bee',
       'cassava', 'capsicum', 'rice', 'pear', 'tea', 'cotton',
       'macademia', 'plantain', 'tobacco', 'yam', 'tree', 'taro',
       'radish', 'watermelon', 'greens', 'sweet-potato', 'vegetable',
       'olive', 'passion-fruit', 'napier-grass', 'pea', 'orange',
       'sunflower', 'carrot', 'lemon', 'miraa', 'cat', 'soya', 'lettuce',
       'ginger', 'cowpea', 'sugar-cane', 'pineapple', 'jackfruit',
       'mango', 'sisal', 'spinach', 'okra', 'mushroom', 'garlic',
       'turkey', 'peach', 'collard-greens', 'cereal', 'corriander',
       'butternut-squash', 'bamboo', 'beet

In [450]:
questions_raw.shape

(737315, 24)

In [534]:
for question in questions['question_content'].head(10):
    print(question)
    print("-----")

Q how can I do so as my birds can multiply  faster  as I use the traditional method  ? Or is it advisable immediately after hatching I separate the chicks and mother?
-----
How can l control maize stalk ?
-----
Q, Wht Iz Da Cost Ov 1 Kg Of Beans
-----
I have ready sukuma wiki for in siaya. Connect me with the buyers
-----
which is the best option between buying either of this grade cows(not pedigrees):18month old heifer, mature heifer,incalf heifer or lactating dairy cow?
-----
I Know to plant banana plantation
-----
What is the best soil to plant tobacco
-----
The best acaricide to spray cows is ALMATICKS
-----
Which disease for hens is characterized by crying eyes and wounds around the head
-----
Q HOW MENY TYPES OF MILET DO WE HAVE? ASK ME ABOUT TOMATO.
-----


In [414]:
# Case-insensitive filtering
filtered_questions = questions.loc[
    questions['question_content'].str.contains("asks", case=False, na=False)
]

# Get the first N questions that contain "asks"
N = 30
sample_questions = filtered_questions.head(N)

# Print them
for question in sample_questions['question_content']:
    print(question)
    print("-----")


QA farmer asks: Whats is the use of pigs fichtes in our farm? Reply Q348 followed by your response.

optout stop 6333
-----
Edward asks: I would like to know the pineapple disease Reply Q39 followed by your response.

optout stop 6333
-----
A farmer asks: Am planing 2grow millet in a swampy area. How can i grow it Reply Q571 followed by your response.
-----
Joseph asks: where can i get/purchase a good pedigree dairy expectant heifer?..plse assist.thx. Reply Q14 followed by your response. at nandi county
-----
Emma Asks:what Is Cover Crops? Reply P7 Followed By Your Response.
-----
A farmer asks: which breed is the best of cow and where can i get Reply Q182 followed by your response.
-----
Beatrice asks: How Can One Determine The Age Of A Tree Plant? Reply Q816 followed by your response.
-----
Q A farmer asks: how do we plant passion fruits Reply Q477 followed by your response.

optout stop 6333
-----
Joel asks: HOW MANY MONTH THAT IS SUPPOSSED TO HARVEST CABBAGES Reply Q293 followed by

### Text Processing

In [415]:
# Import from local modules

def process_text(text: str) -> str:
    cleaned = clean_text(text)
    tokens = tokenize(cleaned)
    lemmas = lemmatize_tokens(tokens)
    return " ".join(lemmas)

questions['processed_text'] = questions['question_content'].apply(process_text)

### Vectorization

In [416]:
# Use unigrams only since questions are short
vectorizer = TfidfVectorizer(
    max_features=5000,       # limit vocabulary size to reduce memory usage
    min_df=5,                # ignore words appearing in fewer than 5 questions
    max_df=0.7,              # ignore very common words
    ngram_range=(1,2)        # unigrams to bigrams
)

tfidf_matrix = vectorizer.fit_transform(questions['processed_text'])
print(tfidf_matrix.shape)  # (num_questions, 5000)


(100000, 5000)


In [417]:


n_components = 200  # start with 100 latent dimensions
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)


In [418]:
svd.explained_variance_ratio_.sum()


0.3625956688759838

### K-means Clustering

In [419]:
n_clusters = 10  # start with ~10 clusters, adjust after exploration
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(svd_matrix)

questions['cluster'] = labels


In [420]:
# Get the cluster centers in the reduced SVD space
centers = kmeans.cluster_centers_

# Project centers back to TF-IDF space to identify top words
# (This uses the SVD components to approximate the original feature space)
terms = vectorizer.get_feature_names_out()
components = svd.components_

for i in range(n_clusters):
    # approximate cluster center in original space
    center_tfidf = centers[i].dot(components)

    top_indices = center_tfidf.argsort()[::-1][:15]
    top_terms = [terms[idx] for idx in top_indices]

    print(f"\nCluster {i}:")
    print(", ".join(top_terms))



Cluster 0:
hen, egg, lay, hen lay, lay egg, num_token, many, laying, give, many egg, layer, problem, hatch, cock, day

Cluster 1:
cow, milk, dairy, dairy cow, give, feed, best, problem, heat, num_token, cause, produce, sign, birth, cow produce

Cluster 2:
animal, cause, disease, feed, dairy, dairy animal, num_token, farm, use, best, give, animal feed, tick, like, production

Cluster 3:
one, many, acre, plant, chick, one acre, maize, num_token, much, old, young one, day, young, one best, get

Cluster 4:
season, plant, best, rainy, rainy season, dry season, plant season, dry, best season, maize, crop, planting, tomato, type, season plant

Cluster 5:
take, long, mature, long take, take mature, many, take long, month, day, many month, many day, ready, harvest, harvested, birth

Cluster 6:
maize, best, num_token, crop, get, tomato, use, type, bean, much, chicken, control, price, poultry, farm

Cluster 7:
buy, num_token, want, want buy, get, much, money, maize, num_token buy, buy maize, see

In [421]:
# print sample questions from a cluster to manually inspect clustering / cleaning
samplecleaning = questions.loc[questions['cluster'] == 0, ['question_content','processed_text']].sample(30, random_state=42)
for a, q in samplecleaning.iterrows():
    print(a)
    print(q['question_content'])
    print(q['processed_text'])
    print("-"*80)


160140
Q how many eggs does a egg hen give a day. Not local ones
q many egg egg hen give day local one
--------------------------------------------------------------------------------
24987
I   have   kienyegi  hens   i  have  feed  them  for  4months  but  their  havenot start  layers  eggs  what  is  the  problem,?
kienyegi hen feed num_tokenmonths havenot start layer egg problem
--------------------------------------------------------------------------------
620021
: My hen has a problem hanging feathers as if imevaa koti medicine n gn
hen problem hanging feather imevaa koti medicine n gn
--------------------------------------------------------------------------------
684808
Q. Which drug can be given to hens with swollen heads in apoultry house.
q drug given hen swollen head apoultry house
--------------------------------------------------------------------------------
75507
Hens And Rabbits Which Is The Best To Keep
hen rabbit best keep
--------------------------------------------

In [None]:
# test k number
## Previously tested 5 to 20, the silhouette scores for all were terrible
k_values = range(5, 6) 

X = svd_matrix

# Store metrics
results = []

for k in k_values:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X)
    
    # Compute metrics
    sil_score = silhouette_score(X, labels)
    db_score = davies_bouldin_score(X, labels)  # X.toarray() needed for DB score
    
    results.append({
        "k": k,
        "silhouette": sil_score,
        "davies_bouldin": db_score
    })
    
    print(f"\n=== k = {k} ===")
    print(f"Silhouette: {sil_score:.4f}, Davies-Bouldin: {db_score:.4f}")
    

# Optional: put metrics into a DataFrame for easier plotting
metrics_df = pd.DataFrame(results)
print("\nSummary metrics per k:")
print(metrics_df)



=== k = 5 ===
Silhouette: 0.0083, Davies-Bouldin: 4.2739

Summary metrics per k:
   k  silhouette  davies_bouldin
0  5     0.00834          4.2739


### HDBSCAN on TF-IDF + SVD

In [None]:
## Commented out HDBSCAN clustering to prevent long runtime
# hdb = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom')
# hdb_labels = hdb.fit_predict(svd_matrix)


In [None]:
len(hdb_labels)

100000

In [None]:
import pickle

# Save
with open('hdbscan_model.pkl', 'wb') as f:
    pickle.dump(hdb, f)

In [None]:
# Load tomorrow
with open('hdbscan_model.pkl', 'rb') as f:
    hdb = pickle.load(f)

In [425]:
import pandas as pd

df = pd.DataFrame({'cluster': hdb_labels})
print(df['cluster'].value_counts())


cluster
-1      81715
 379     2178
 363      442
 370      374
 368      282
        ...  
 143       15
 106       15
 76        15
 192       15
 49        15
Name: count, Length: 382, dtype: int64


### HDBSCAN on dense embeddings

In [426]:
# Apply separate cleaning function that just normalizes spacing and removes very common "Q" boilerplate
questions['Q_basic_clean'] = questions['question_content'].apply(minimal_clean)

In [427]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # smaller, faster for your dataset

texts = questions['Q_basic_clean'].tolist()
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)  # CPU batching

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 1563/1563 [03:01<00:00,  8.60it/s]


In [530]:
# # UMAP (noise ratio 31.38% with 180 clusters)
# umap_model = umap.UMAP(
#     n_neighbors=30,      
#     n_components=5,      
#     metric='cosine',    
#     random_state=42
# )
# umap_embeddings = umap_model.fit_transform(embeddings)

# # HDBSCAN
# clusterer = HDBSCAN(
#     min_cluster_size=100,   
#     min_samples=10,       
#     metric='euclidean',   
#     cluster_selection_method='eom'  
# )

# cluster_labels = clusterer.fit_predict(umap_embeddings)
# questions['cluster'] = cluster_labels

# # Check proportion of observations in noise cluster (-1)
# noise_ratio = (cluster_labels == -1).mean()
# print(f"Noise ratio: {noise_ratio:.2%}")

# UMAP
umap_model = umap.UMAP(
    n_neighbors=60,      
    n_components=10,      
    metric='cosine',    
    random_state=42
)
umap_embeddings = umap_model.fit_transform(embeddings)

# HDBSCAN
clusterer = HDBSCAN(
    min_cluster_size=250,   
    min_samples=10,       
    metric='euclidean',   
    cluster_selection_method='eom'  
)

cluster_labels = clusterer.fit_predict(umap_embeddings)
questions['cluster'] = cluster_labels

# Check proportion of observations in noise cluster (-1)
noise_ratio = (cluster_labels == -1).mean()
print(f"Noise ratio: {noise_ratio:.2%}")

Noise ratio: 26.93%


In [531]:
len(pd.unique(questions['cluster']))

65

In [532]:
# Filter out noise points
mask = cluster_labels != -1
embeddings_nonnoise = umap_embeddings[mask]
labels_nonnoise = cluster_labels[mask]

# Compute cluster coherence (mean pairwise cosine similarity per cluster)
cluster_ids = np.unique(labels_nonnoise)
cluster_coherence = {}

for cid in cluster_ids:
    idx = labels_nonnoise == cid
    cluster_emb = embeddings_nonnoise[idx]
    
    if cluster_emb.shape[0] == 1:
        cluster_coherence[cid] = 1.0
    else:
        sim_matrix = cosine_similarity(cluster_emb)
        n = sim_matrix.shape[0]
        upper_tri = sim_matrix[np.triu_indices(n, k=1)]
        cluster_coherence[cid] = upper_tri.mean()

# Convert to DataFrame
coherence_df = pd.DataFrame({
    'cluster': list(cluster_coherence.keys()),
    'coherence': list(cluster_coherence.values()),
    'size': [np.sum(labels_nonnoise == cid) for cid in cluster_ids]
}).sort_values('coherence', ascending=True)

# Prints least coherent clusters
print(coherence_df.head(10))

# weighted coherence
total_points = coherence_df['size'].sum()
weighted_coherence = (coherence_df['coherence'] * coherence_df['size']).sum() / total_points
print(f"Weighted dataset-level coherence: {weighted_coherence:.6f}")



    cluster  coherence   size
38       38   0.997099   9602
25       25   0.997287  12903
26       26   0.998470   5364
42       42   0.999130   1216
8         8   0.999146   3431
53       53   0.999201   1446
31       31   0.999286    588
5         5   0.999303   1866
2         2   0.999326   2570
27       27   0.999400   1546
Weighted dataset-level coherence: 0.998775


In [None]:
top_n = 20  # number of clusters to inspect
examples_per_cluster = 5  # number of questions to show per cluster

# Get clusters sorted by size
cluster_counts = questions['cluster'].value_counts()
top_clusters = cluster_counts.head(top_n).index.tolist()

for cluster_id in top_clusters:
    print(f"\n--- Cluster {cluster_id} (size={cluster_counts[cluster_id]}) ---")
    cluster_questions = questions[questions['cluster'] == cluster_id]['Q_basic_clean'].head(examples_per_cluster)
    for q in cluster_questions:
        print("-", q)



--- Cluster -1 (size=26932) ---
- What is the best soil to plant tobacco
- who know the the market of cow peas and what is the cost of 1kg?
- give me the difference btn sugar cane and maize?
- what good animal saveif at hot area
- CAN I GET MAIZE MARKET

--- Cluster 25 (size=12903) ---
- how can I do so as my birds can multiply faster as I use the traditional method ? Or is it advisable immediately after hatching I separate the chicks and mother?
- Which disease for hens is characterized by crying eyes and wounds around the head
- mr.Kameno keep chickens,without feather in the neck.How do you call,such chicken in vernacular?
- I,ASK. my chicken needs to be'deworm which medicine did i. used
- What Medicine A farmer should use to prevent newcatle disease in poultry

--- Cluster 38 (size=9602) ---
- which is the best option between buying either of this grade cows(not pedigrees):18month old heifer, mature heifer,incalf heifer or lactating dairy cow?
- The best acaricide to spray cows is 