In [9]:
import pandas as pd
from dotenv import load_dotenv
import os
import duckdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')


load_dotenv()
# Path for 1 million English sample questions
sample_path = os.getenv("DATA_SAMPLE")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Clustering on question content (English)

In [10]:
dataset = pd.read_parquet(sample_path)

# Drop duplicate questions
questions_raw = dataset.drop_duplicates(subset='question_content', keep='first')

# Random sample of rows, reproducible with random_state
questions = questions_raw.sample(n=100000, random_state=42)


### Text Processing

In [11]:
# Cleans common starting formatting for questions
def strip_prefixes(text):
    text = text.lower()  # lowercase for consistent matching

    # Remove leading numeric IDs
    text = re.sub(r'^\s*\d+\s+', '', text)

    # Remove multiple nested "Name asks:" or "A farmer asks:" patterns
    # This removes repeated patterns at the start of the string
    pattern_asks = re.compile(r'^(?:[a-z\s]+ asks:)+\s*', flags=re.IGNORECASE)
    text = pattern_asks.sub('', text)

    # Remove Q, Qn, Qn:, Q:, Q followed immediately by a word (like Qwhat)
    text = re.sub(r'^q[n]?\s*[:,-]?\s*', '', text)
    text = re.sub(r'^q(?=[a-z])', '', text)  # removes Q directly followed by a word

    # Remove "Reply Q", "Reply Qn", "Reply followed" boilerplate
    text = re.sub(r'^reply\s+q[n]?\s*[:,-]?\s*', '', text)
    text = re.sub(r'^reply\s+followed\s*', '', text)

    # Remove any remaining leading/trailing whitespace
    text = text.strip()

    return text

def clean_text(text):
    # run prefix stripper FIRST
    text = strip_prefixes(text)

    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\bq\b', '', text) # removes starting 'q' for questions
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text

questions['cleaned_text'] = questions['question_content'].apply(clean_text)


In [12]:
# Remove stop words and tokenize
stop_words = set(stopwords.words('english'))

def tokenize(text):
    return [word for word in text.split() if word not in stop_words]

questions['tokens'] = questions['cleaned_text'].apply(tokenize)

In [13]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

questions['tokens'] = questions['tokens'].apply(lemmatize_tokens)


In [14]:
questions['processed_text'] = questions['tokens'].apply(lambda tokens: ' '.join(tokens))

### Vectorization

In [15]:
# Use unigrams only since questions are short
vectorizer = TfidfVectorizer(
    max_features=5000,       # limit vocabulary size to reduce memory usage
    min_df=5,                # ignore words appearing in fewer than 5 questions
    max_df=0.7,              # ignore very common words
    ngram_range=(1,1)        # unigrams
)

tfidf_matrix = vectorizer.fit_transform(questions['processed_text'])
print(tfidf_matrix.shape)  # (num_questions, 5000)


(100000, 5000)


In [16]:
from sklearn.decomposition import TruncatedSVD

n_components = 100  # start with 100 latent dimensions
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

print(svd_matrix.shape)  # (num_questions, 100)


(100000, 100)


In [17]:
import numpy as np

terms = np.array(vectorizer.get_feature_names_out())
for i, comp in enumerate(svd.components_):
    top_terms = terms[np.argsort(comp)[-10:]]  # top 10 words for this component
    print(f"Component {i}: {', '.join(top_terms)}")


Component 0: crop, good, season, seed, bean, tomato, type, best, maize, plant
Component 1: hole, right, rain, iwant, want, bean, carrot, season, banana, plant
Component 2: control, much, kg, fertilizer, seed, bean, planting, best, price, maize
Component 3: way, breed, planting, variety, soil, season, crop, type, tomato, best
Component 4: followed, reply, want, good, feed, dairy, plant, milk, get, cow
Component 5: bean, get, good, use, response, followed, type, reply, tomato, crop
Component 6: maize, response, followed, reply, long, take, grow, type, cow, crop
Component 7: bean, cause, control, use, optout, stop, best, response, followed, reply
Component 8: response, followed, chick, reply, bean, seed, market, best, crop, get
Component 9: price, many, soya, good, fertilizer, planting, banana, use, type, bean
Component 10: many, stop, optout, maize, seed, good, response, followed, reply, type
Component 11: followed, reply, good, milk, season, price, crop, tomato, cow, bean
Component 12: 

In [18]:
svd.explained_variance_ratio_.sum()


0.34433005617053897

In [19]:
from sklearn.cluster import KMeans

n_clusters = 10  # start with ~10 clusters, adjust after exploration
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(svd_matrix)

questions['cluster'] = labels


In [20]:
import numpy as np

# Get the cluster centers in the reduced SVD space
centers = kmeans.cluster_centers_

# Project centers back to TF-IDF space to identify top words
# (This uses the SVD components to approximate the original feature space)
terms = vectorizer.get_feature_names_out()
components = svd.components_

for i in range(n_clusters):
    # approximate cluster center in original space
    center_tfidf = centers[i].dot(components)

    top_indices = center_tfidf.argsort()[::-1][:15]
    top_terms = [terms[idx] for idx in top_indices]

    print(f"\nCluster {i}:")
    print(", ".join(top_terms))



Cluster 0:
hen, egg, lay, many, laying, chicken, give, layer, day, eat, take, hatch, kienyeji, eating, long

Cluster 1:
plant, maize, bean, tomato, want, season, type, good, seed, crop, potato, use, many, time, cabbage

Cluster 2:
banana, plantation, plant, wilt, bacteria, planting, control, use, type, best, good, disease, leaf, many, spacing

Cluster 3:
best, plant, maize, type, variety, planting, season, tomato, seed, way, fertilizer, crop, breed, bean, soil

Cluster 4:
much, kg, cost, maize, hw, per, one, bean, money, get, kilo, tomato, chick, kilogram, need

Cluster 5:
cow, milk, dairy, give, feed, heat, best, problem, produce, cause, birth, sign, many, salt, pregnant

Cluster 6:
crop, tomato, get, use, take, type, long, bean, animal, chicken, chick, many, good, pig, grow

Cluster 7:
reply, followed, response, optout, stop, plant, maize, best, cow, use, crop, bean, tomato, type, asks

Cluster 8:
maize, price, planting, seed, bean, type, use, control, good, farm, many, crop, fertil

In [25]:
questions.loc[questions['cluster'] == 7, 'question_content'].sample(30, random_state=42)


616256    Emma asks: pliz guyz is anyone with an idea on...
681152    Vincent asks: Does cabbages require any sprayi...
259659    Charles asks: How many varieties of banana so ...
919146    Stephen asks: Ama asks: what is the gestation ...
347713    A farmer asks: sugest a reason why i prefer to...
166604    Shamim asks: So how many meters shd be use whe...
594161    Q Aggrey asks: How comes that Gulu land does n...
84518     Josephine asks: What is the spacing of plantin...
866718    Nicholas asks: A farmer asks: What is the diff...
88892     Shivan asks: WHAT SHOULD I DO? ALL MY HENS&COC...
892303    A farmer asks: How many varieties of rabbits d...
433146    Q A farmer asks: Bt Hw Many Kg Can On One Hole...
237085    A farmer asks: what do we call the female shee...
833786    Pamela asks: can i get loan. Reply Q326 follow...
979736    Q which is the best medicine of chickens that ...
36965     Joel asks: my cabbeges r about 2 b ready bt le...
393370    Mary asks: What is aspergillas

In [26]:
# junk_terms = ["reply", "followed", "optout", "stop"]
junk_terms = ["Dady Wa Penny"]
mask = questions['question_content'].str.contains(
    '|'.join(junk_terms),
    case=False,
    na=False
)

junk_subset = questions[mask]
print(junk_subset.shape)


(2, 28)


In [27]:
for a in junk_subset['question_content']:
    print(a)

Dady Wa Penny asks: Hello wefarm,which type of plant can a poor man deep in the village plant on ahalf an acre and it can give good harvest and some mone
Dady Wa Penny asks: my calf dificat blood what causes that. Gd 9t Reply Q471 followed by your response.

optout stop 6333


In [28]:
for question in junk_subset.sample(30, random_state=42)['question_content']:
    print(question)


ValueError: Cannot take a larger sample than population when 'replace=False'