In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import duckdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import nltk

# Import from local modules
import sys
sys.path.append('../src')

from cleaning import clean_text, tokenize, lemmatize_tokens

# Download necessary NLTK resources
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

# Path for 1 million English sample questions
load_dotenv()
sample_path = os.getenv("DATA_SAMPLE")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Clustering on question content (English)

### Prepare sample dataset

In [10]:
dataset = pd.read_parquet(sample_path)

# Drop duplicate questions
questions_raw = dataset.drop_duplicates(subset='question_content', keep='first')

# Random sample of rows, reproducible with random_state
questions = questions_raw.sample(n=100000, random_state=42)


### Text Processing

In [11]:
def process_text(text: str) -> str:
    cleaned = clean_text(text)
    tokens = tokenize(cleaned)
    lemmas = lemmatize_tokens(tokens)
    return " ".join(lemmas)

questions['processed_text'] = questions['question_content'].apply(process_text)

### Vectorization

In [12]:
# Use unigrams only since questions are short
vectorizer = TfidfVectorizer(
    max_features=5000,       # limit vocabulary size to reduce memory usage
    min_df=5,                # ignore words appearing in fewer than 5 questions
    max_df=0.7,              # ignore very common words
    ngram_range=(1,3)        # unigrams to bigrams
)

tfidf_matrix = vectorizer.fit_transform(questions['processed_text'])
print(tfidf_matrix.shape)  # (num_questions, 5000)


(100000, 5000)


In [13]:
import numpy as np

scalar = np.array(5)
vector = np.array([1, 2, 3])
matrix = np.array([[1, 2], [3, 4]])
tensor3 = np.random.randn(2, 3, 4)

print(scalar.ndim)   # 0
print(vector.ndim)   # 1
print(matrix.ndim)   # 2
print(tensor3.ndim)  # 3
print(tensor3)
print(matrix)


0
1
2
3
[[[-0.88479356 -2.18800026  0.05385855 -0.74653956]
  [-2.92511831  0.21169566  0.17527344  0.26825658]
  [-0.32001963 -0.9959532  -0.76953611  0.88065008]]

 [[ 0.98609026 -0.15849071  0.94032672 -1.4289246 ]
  [ 2.85920125  0.68471709  0.23028841 -0.75983649]
  [ 1.03438812  0.11245826 -1.75458845  0.22032017]]]
[[1 2]
 [3 4]]


In [14]:
from sklearn.decomposition import TruncatedSVD

n_components = 100  # start with 100 latent dimensions
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

print(svd_matrix.shape)  # (num_questions, 100)


(100000, 100)


In [None]:


terms = np.array(vectorizer.get_feature_names_out())
for i, comp in enumerate(svd.components_):
    top_terms = terms[np.argsort(comp)[-10:]]  # top 10 words for this component
    print(f"Component {i}: {', '.join(top_terms)}")


Component 0: reply qnum_token, followed, qnum_token, type, tomato, reply, num_token, best, maize, plant
Component 1: season, seed, num_token, bean, type, tomato, plant maize, best, maize, plant
Component 2: qnum_token, response, followed response, qnum_token followed response, followed, reply qnum_token followed, reply qnum_token, qnum_token followed, plant maize, plant
Component 3: seed, maize seed, planting, bean, plant maize, price maize, crop, best, price, maize
Component 4: rotation, type crop, use, season, grow, soil, type, tomato, best, crop
Component 5: animal, crop, dairy cow, dairy, cause, plant, milk, maize, best, cow
Component 6: grow, rotation, many, plant, num_token, maize, cow, long, take, crop
Component 7: good, dairy, plant, best, get market, crop, seed, market, cow, get
Component 8: want, hen, disease, banana, poultry, control, animal, cause, use, num_token
Component 9: maize, disease, crop, much, cow, control, get, cause, use, tomato
Component 10: long, poultry, get,

In [16]:
svd.explained_variance_ratio_.sum()


0.2634808542488822

In [17]:
from sklearn.cluster import KMeans

n_clusters = 10  # start with ~10 clusters, adjust after exploration
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(svd_matrix)
# labels = kmeans.fit_predict(tfidf_matrix)

questions['cluster'] = labels


In [24]:
components

array([[ 6.43422339e-04,  5.46400547e-04,  7.90791975e-04, ...,
         1.88775162e-03,  1.11616951e-03,  8.77849127e-04],
       [ 2.61079688e-04,  2.15373044e-04,  3.98619747e-04, ...,
         9.86103551e-05,  7.52462333e-05,  4.96317860e-04],
       [-2.82451286e-04, -7.19259350e-04, -5.53282083e-04, ...,
        -4.54766501e-03, -2.89555505e-03, -5.16261928e-06],
       ...,
       [ 2.52454786e-04, -1.33521869e-03,  2.18140461e-04, ...,
         2.90958454e-03,  3.58260714e-03,  2.43563603e-03],
       [-2.21014677e-05, -2.00575098e-04, -5.92249639e-04, ...,
        -2.17844988e-03, -1.48048529e-03,  2.67350469e-03],
       [-7.64164683e-04,  1.35971949e-03,  8.91159368e-05, ...,
         3.34814461e-03,  2.75808046e-03,  1.72182639e-04]])

In [18]:
import numpy as np

# Get the cluster centers in the reduced SVD space
centers = kmeans.cluster_centers_

# Project centers back to TF-IDF space to identify top words
# (This uses the SVD components to approximate the original feature space)
terms = vectorizer.get_feature_names_out()
components = svd.components_

for i in range(n_clusters):
    # approximate cluster center in original space
    center_tfidf = centers[i].dot(components)

    top_indices = center_tfidf.argsort()[::-1][:15]
    top_terms = [terms[idx] for idx in top_indices]

    print(f"\nCluster {i}:")
    print(", ".join(top_terms))



Cluster 0:
egg, lay, hen, lay egg, egg plant, hen lay, chicken, layer, many, plant, laying, many egg, hen lay egg, eating, num_token

Cluster 1:
plant, best, tomato, crop, num_token, bean, use, banana, type, animal, chicken, take, cause, control, good

Cluster 2:
milk, cow, production, produce, milk production, cow produce, increase, dairy, produce milk, milk cow, goat, give, high, breed, calf

Cluster 3:
maize, plant, plant maize, best, maize plant, seed, price, price maize, type, planting, num_token, bean, maize seed, type maize, use

Cluster 4:
one, many, acre, plant, young, one acre, chick, young one, maize, num_token, old, much, day, best, one best

Cluster 5:
get, market, get market, seed, num_token, good, chick, get good, plant, seedling, get seed, want, county, much, maize

Cluster 6:
rabbit, market, urine, market rabbit, breed, get, rabbit urine, type rabbit, best, feed, disease, num_token, breed rabbit, young, food

Cluster 7:
cow, dairy, dairy cow, heat, feed, problem, best

In [19]:
terms = vectorizer.get_feature_names_out()
centers = kmeans.cluster_centers_

for i in range(n_clusters):
    top_indices = centers[i].argsort()[::-1][:15]  # top 15 words in TF-IDF space
    top_terms = [terms[idx] for idx in top_indices]
    print(f"\nCluster {i}:")
    print(", ".join(top_terms))



Cluster 0:
advisable plant, acalf, access, _how, acre, advisable num_token, acre farm, advisable plant maize, acow, afarmer control, abortion, advice give, afemale, afamer, administer

Cluster 1:
_how, abanana plant, abt, achicken, _what, abortion, acalf, acre maize, according, absoluta, achieve, ad, advice best, across, agoat

Cluster 2:
advisable use, abaut, advisable feed, advisable num_token, _how, actelic, advantage, advice give, able, administer, afemale, acre produce, afarm, adding, advisable plant

Cluster 3:
abanana, _how, _what, abaut, abig, absoluta, acre, acow, acaricide, access, actelic, advantage, act, ad, add

Cluster 4:
adairy cow, aday, _how, activity, acidic, acre piece land, adding, access, acalf, acrop, acre maize, _what, adairy, actelic, adisease

Cluster 5:
able, _how, absoluta, acaricide, abt, _what, acow, ad, aday, abig, advantage using, acidic, acid, adisease, advisable

Cluster 6:
adairy, advantage using, action, _how, administered, administer, advice best, a

In [20]:
questions.loc[questions['cluster'] == 7, 'question_content'].sample(30, random_state=42)


55671     Q If yes, why is it atvaseble to feed cows wit...
896503    I HAVE FARM OF ANIMAL I HAVE MANY COWS AND GOA...
787196    Q my cow was brown but its colour at the backl...
811648    Q,what Are The X-stic Of A Good Cow Which Can ...
513092                Q.what cause stunted growth in a cow?
99457            Q.my cow irinating blood.is there medicine
99856     Q how will I know expectant cow is well, which...
626529    Q: what kind of medicine do I give my cow whos...
670092                    Q what is gestation period of cow
474226    Q: my cow gave birth yesterday, and upto now i...
768977                       QWhat can i done to an old cow
492417         Q which type of feeds is best in dairy cow ,
364553    Hw can i know dat te dairy cow is suffering 4r...
912056    Q which type of medicine can i use to treat my...
47446     How should the discharge of a pregnant cow loo...
886100                      Q.How much is asack of cow peas
874609    Q my cow normally when its on 

In [21]:
from collections import Counter

# flatten into a single stream of tokens
all_tokens = [tok for toks in questions['tokens'] for tok in toks]

# look for tokens with ≤4 characters
unit_like = [tok for tok in all_tokens if len(tok) <= 2]

freq = Counter(unit_like)
freq.most_common(50)


KeyError: 'tokens'

In [None]:
units = {"kg","g","mg","ml","l","m","cm","km","ha","acre"}

freq = Counter([tok for tok in all_tokens if tok in units])
freq.most_common(50)

[('l', 1138),
 ('acre', 932),
 ('kg', 847),
 ('g', 163),
 ('cm', 47),
 ('ha', 20),
 ('ml', 19),
 ('km', 2)]

In [None]:
unit_like

['l',
 'iz',
 'da',
 'ov',
 'kg',
 'u',
 'z',
 'u',
 'z',
 'da',
 'ca',
 'kl',
 'u',
 'kg',
 'hw',
 'da',
 'pa',
 'hi',
 'ov',
 'b',
 'nt',
 'u',
 'hw',
 'te',
 'n',
 'te',
 'b',
 'iz',
 'mi',
 'ua',
 'na',
 'ur',
 'sh',
 'b',
 'te',
 'u',
 'nw',
 'ur',
 'c',
 'hw',
 'e',
 'e',
 'hw',
 'u',
 'e',
 'kg',
 'hw',
 'hw',
 'hw',
 'qi',
 'da',
 'u',
 'hw',
 'im',
 'co',
 'kg',
 'n',
 'u',
 'l',
 'n',
 'u',
 'u',
 'gd',
 'u',
 'te',
 'ö',
 'bt',
 'c',
 'go',
 'da',
 'z',
 'u',
 'u',
 'z',
 'dc',
 'l',
 'n',
 'wl',
 'te',
 'kj',
 'kg',
 'iz',
 'e',
 'u',
 'de',
 'mi',
 'ug',
 'mi',
 'wz',
 'yr',
 'u',
 'hw',
 'de',
 'de',
 'ov',
 'de',
 'hv',
 'n',
 'da',
 'u',
 'ov',
 'ov',
 'hw',
 'b',
 'kg',
 'g',
 'dy',
 'en',
 'en',
 'go',
 'us',
 'u',
 'iz',
 'wy',
 'u',
 'hw',
 'hw',
 'go',
 'u',
 'e',
 'us',
 'kg',
 'tn',
 'fo',
 'u',
 'z',
 'hw',
 'u',
 'cm',
 'l',
 'kg',
 'kg',
 'eg',
 'ft',
 'de',
 'de',
 'hw',
 'ft',
 'ni',
 'u',
 'za',
 'u',
 'za',
 'u',
 'ai',
 'e',
 'ph',
 'z',
 'u',
 'av',
 'ni

In [None]:
junk_terms = ["reply", "followed", "optout", "stop"]
# junk_terms = ["followed"]
mask = questions['question_content'].str.contains(
    '|'.join(junk_terms),
    case=False,
    na=False
)

junk_subset = questions[mask]
print(junk_subset.shape)


(3059, 28)


In [None]:
for a in junk_subset['question_content'].head(50):
    print(a)

QA farmer asks: Whats is the use of pigs fichtes in our farm? Reply Q348 followed by your response.

optout stop 6333
Now i dont have any poutry now which question am i going to reply
Edward asks: I would like to know the pineapple disease Reply Q39 followed by your response.

optout stop 6333
A farmer asks: Am planing 2grow millet in a swampy area. How can i grow it Reply Q571 followed by your response.
Joseph asks: where can i get/purchase a good pedigree dairy expectant heifer?..plse assist.thx. Reply Q14 followed by your response. at nandi county
Emma Asks:what Is Cover Crops? Reply P7 Followed By Your Response.
A farmer asks: which breed is the best of cow and where can i get Reply Q182 followed by your response.
Beatrice asks: How Can One Determine The Age Of A Tree Plant? Reply Q816 followed by your response.
Q A farmer asks: how do we plant passion fruits Reply Q477 followed by your response.

optout stop 6333
REPLY  Q  Can  l  use  antibiotic  to  treat  salmonellosis
Joel ask

In [None]:
for question in junk_subset.sample(30, random_state=42)['question_content']:
    print(question)


Robert asks: Somebody came to me looking for kienyeji 2days old chicks for sale andRobert asks: Somebody came to me looking for kienyeji 2days old chicks for sale and i have them, but the problem is price,how much should i sell a? Reply
Q. I have hens that hava stop laying what is solution? but l feed with commercil food,
Michael asks: how can you control marole in poultry? Reply Q1159 followed by your response.
Q I want to plant onions how is the procedure Reply Q45 followed by your response.
Q How to stop land slaind
Nicholas asks: A farmer asks: What is the difference btn food crops & cash crops? Reply Q56 followed by your response.
Q _my cow usualy eat polythene paperc. clothes. what wl u give it in order tw stop
Q it rained heavy on 23rd 24th Oct but now no signs do I continue planting or stop
Qn: If I want to plant on two arcers of land. I want to know the quantity of maize am supposed to buy and the expected yield if all goes well 

optout stop 6333
Muhumuza asks: at what stage 