In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Clustering on question content (English)

In [4]:
dataset = pd.read_csv('data/farmers_1mil_eng_sample.csv', index_col = 0)
# Drop duplicates for questions
questions_raw = dataset.drop_duplicates(subset='question_content', keep='first')

In [5]:
# Random sample of rows, reproducible with random_state
questions = questions_raw.sample(n=100000, random_state=42)


### Text Processing

In [6]:
# Cleans common starting formatting for questions
def strip_prefixes(text):
    text = text.lower()  # lowercase for consistent matching

    # Remove leading numeric IDs
    text = re.sub(r'^\s*\d+\s+', '', text)

    # Remove multiple nested "Name asks:" or "A farmer asks:" patterns
    # This removes repeated patterns at the start of the string
    pattern_asks = re.compile(r'^(?:[a-z\s]+ asks:)+\s*', flags=re.IGNORECASE)
    text = pattern_asks.sub('', text)

    # Remove Q, Qn, Qn:, Q:, Q followed immediately by a word (like Qwhat)
    text = re.sub(r'^q[n]?\s*[:,-]?\s*', '', text)
    text = re.sub(r'^q(?=[a-z])', '', text)  # removes Q directly followed by a word

    # Remove "Reply Q", "Reply Qn", "Reply followed" boilerplate
    text = re.sub(r'^reply\s+q[n]?\s*[:,-]?\s*', '', text)
    text = re.sub(r'^reply\s+followed\s*', '', text)

    # Remove any remaining leading/trailing whitespace
    text = text.strip()

    return text

def clean_text(text):
    # run prefix stripper FIRST
    text = strip_prefixes(text)

    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\bq\b', '', text) # removes starting 'q' for questions
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text

questions['cleaned_text'] = questions['question_content'].apply(clean_text)


In [7]:
# Remove stop words and tokenize
stop_words = set(stopwords.words('english'))

def tokenize(text):
    return [word for word in text.split() if word not in stop_words]

questions['tokens'] = questions['cleaned_text'].apply(tokenize)

In [8]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

questions['tokens'] = questions['tokens'].apply(lemmatize_tokens)


In [9]:
questions['processed_text'] = questions['tokens'].apply(lambda tokens: ' '.join(tokens))

### Vectorization

In [10]:
# Use unigrams only since questions are short
vectorizer = TfidfVectorizer(
    max_features=5000,       # limit vocabulary size to reduce memory usage
    min_df=5,                # ignore words appearing in fewer than 5 questions
    max_df=0.7,              # ignore very common words
    ngram_range=(1,1)        # unigrams
)

tfidf_matrix = vectorizer.fit_transform(questions['processed_text'])
print(tfidf_matrix.shape)  # (num_questions, 5000)


(100000, 5000)


In [11]:
from sklearn.decomposition import TruncatedSVD

n_components = 100  # start with 100 latent dimensions
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

print(svd_matrix.shape)  # (num_questions, 100)


(100000, 100)


In [12]:
import numpy as np

terms = np.array(vectorizer.get_feature_names_out())
for i, comp in enumerate(svd.components_):
    top_terms = terms[np.argsort(comp)[-10:]]  # top 10 words for this component
    print(f"Component {i}: {', '.join(top_terms)}")


Component 0: crop, good, season, seed, bean, tomato, type, best, maize, plant
Component 1: tree, rain, right, onion, cassava, carrot, season, want, banana, plant
Component 2: pest, control, kg, fertilizer, best, bean, seed, planting, price, maize
Component 3: way, breed, planting, soil, variety, season, crop, type, tomato, best
Component 4: response, followed, reply, feed, good, dairy, milk, plant, get, cow
Component 5: breed, feed, dairy, maize, milk, take, long, plant, best, cow
Component 6: want, kg, one, price, good, much, market, seed, tomato, get
Component 7: well, give, season, many, soil, grow, cow, good, crop, type
Component 8: spray, disease, chemical, cause, control, type, maize, use, cow, tomato
Component 9: take, optout, cabbage, soil, seed, response, followed, reply, good, type
Component 10: chick, control, much, many, chicken, egg, hen, banana, use, bean
Component 11: soya, season, tomato, planting, kg, crop, price, much, cow, bean
Component 12: fertilizer, wilt, long, g

In [13]:
svd.explained_variance_ratio_.sum()


np.float64(0.3435559339775962)

In [14]:
from sklearn.cluster import KMeans

n_clusters = 10  # start with ~10 clusters, adjust after exploration
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(svd_matrix)

questions['cluster'] = labels


In [15]:
import numpy as np

# Get the cluster centers in the reduced SVD space
centers = kmeans.cluster_centers_

# Project centers back to TF-IDF space to identify top words
# (This uses the SVD components to approximate the original feature space)
terms = vectorizer.get_feature_names_out()
components = svd.components_

for i in range(n_clusters):
    # approximate cluster center in original space
    center_tfidf = centers[i].dot(components)

    top_indices = center_tfidf.argsort()[::-1][:15]
    top_terms = [terms[idx] for idx in top_indices]

    print(f"\nCluster {i}:")
    print(", ".join(top_terms))



Cluster 0:
cow, maize, get, type, much, good, tomato, cause, disease, chicken, give, bean, chick, animal, one

Cluster 1:
take, long, mature, many, harvest, day, month, harvested, ready, birth, maize, cow, grow, tomato, give

Cluster 2:
crop, plant, best, rotation, season, grow, type, grown, maize, pest, soil, area, well, production, good

Cluster 3:
hen, egg, lay, many, laying, chicken, eating, day, hatch, give, problem, cause, layer, eat, keep

Cluster 4:
best, maize, plant, type, tomato, variety, seed, planting, season, breed, way, fertilizer, cow, bean, time

Cluster 5:
plant, maize, tomato, bean, want, banana, season, good, type, seed, potato, many, one, time, cabbage

Cluster 6:
reply, followed, response, optout, stop, plant, maize, best, cow, type, use, tomato, asks, crop, bean

Cluster 7:
use, chemical, spray, maize, fertilizer, medicine, method, type, plant, tomato, manure, bean, best, control, treat

Cluster 8:
tree, plant, mango, seedling, many, coffee, dry, garden, suitabl

In [16]:
questions.loc[questions['cluster'] == 6, 'question_content'].sample(30, random_state=42)


521258    Fred asks: When onion seedlings are destroyed ...
991463    A farmer asks: Kibet asks: What is the price o...
183745    Ronald asks: The advise about the drying of my...
315347    Mutebi asks,' in case afarmer follow all the r...
980547    Twinamatsiko asks: which maize varieties are h...
123305    A farmer asks: My maize has been seriously att...
891384    A farmer asks: what is the price of ayellow be...
269092    Betty asks: Why should legumes seeds be inocul...
664666    Farida asks: how is avocado trees crafted Repl...
279639    Joram asks: when is spraying maize crops with ...
153240    Matsiko asks: Which drug is given to a cow saf...
476727    Fancy asks: What is the botanical name of maiz...
418681    James asks: What is science tecnology? Reply Q...
482286    Amon asks: Mention 4 methods of controlling so...
886358    Q sodom asks: My maize plantation has earthwor...
704857    <6333>,  A farmer asks: How Long Does It Take ...
746521    Mark asks: but wat is de uze o

In [29]:
# junk_terms = ["reply", "followed", "optout", "stop"]
junk_terms = ["Dady Wa Penny"]
mask = questions['question_content'].str.contains(
    '|'.join(junk_terms),
    case=False,
    na=False
)

junk_subset = questions[mask]
print(junk_subset.shape)


(1, 28)


In [30]:
for a in junk_subset['question_content']:
    print(a)

Dady Wa Penny asks: How many months does a broiler take to be sold at 40000 0757184764Reply Q542 followed by your response.

optout stop 6333


In [25]:
for question in junk_subset.sample(30, random_state=42)['question_content']:
    print(question)


ValueError: Cannot take a larger sample than population when 'replace=False'