In [113]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import duckdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
import umap
import nltk
import re
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns

# Import from local modules
import sys
sys.path.append('../src')
import processing_and_visualization
import importlib
importlib.reload(processing_and_visualization)

from cleaning import minimal_clean
from clustering_analysis import cluster_with_umap_hdbscan, recluster_noise, print_cluster_examples, summarize_clusters, metacluster_preview
from processing_and_visualization import quick_save_file, save_topic_files, save_question_clusters

# Download necessary NLTK resources
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

# Path for 1 million English sample questions
load_dotenv()
coretopics_path = os.getenv("DATA_CORETOPICS")
coretopics_embeddings_path = os.getenv("DATA_CORETOPICS_EMBEDDINGS")
data_dir = os.getenv("DATA_DIR")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruggb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# load data
questions = pd.read_parquet(coretopics_path)

In [6]:
# Apply separate cleaning function that just normalizes spacing and removes very common "Q" boilerplate
questions['Q_basic_clean'] = questions['question_content'].apply(minimal_clean)

In [7]:
# Check if embeddings file exists

if os.path.exists(coretopics_embeddings_path):
    print("Embedding file exists — loading from disk.")
    data = np.load(coretopics_embeddings_path, allow_pickle=True)
    embeddings = data["embeddings"]
    question_ids = data["question_ids"]  # shouldn't be necessary, but saved just in case of a mix-up

else:
    print("Embedding file not found — generating embeddings.")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    texts = questions['Q_basic_clean'].tolist()
    question_ids = questions['question_id'].to_numpy()

    embeddings = model.encode(
        texts,
        batch_size=64,
        show_progress_bar=True
    )
# Save embeddings to disk along with question IDs in case of mixup
    np.savez_compressed(coretopics_embeddings_path, embeddings=embeddings, question_ids=question_ids)
    print("Saved embeddings to disk.")

questions['embedding'] = list(embeddings)

Embedding file exists — loading from disk.


In [40]:
questions['question_topic'].value_counts()

question_topic
chicken    251160
cattle     238816
maize      184029
tomato     126708
Name: count, dtype: int64

### "Chicken" questions analysis

In [None]:
chicken_df, chicken_umap, chicken_clusterer = cluster_with_umap_hdbscan(
    df=questions[questions['question_topic'] == 'chicken'],
    sample_size=1_000_000,
    umap_params={
        "n_neighbors": 30,
        "n_components": 5,
        "metric": "cosine",
    },
    hdbscan_params={
        "min_cluster_size": 750,
        "min_samples": 10,
        "metric": "euclidean"
    }
)


Finished clustering in 711.0 seconds
Noise ratio: 27.89%
Clusters found: 77
Approx. silhouette score (excluding noise): 0.551


In [10]:
print_cluster_examples(chicken_df, exclude_noise=False)


--- Cluster -1 (size=70059) ---
- How can i start with one hen which is a layer to lay fertilized eggs by a cock to grow into a big hen farm project?
- What enables a hen 2 lay an egg wiz 2 yorks?
- what makes an egg break inside the hen?
- Is my hen normal when it 'family plans' 4 3 yearz?
- inform me of grade Chicken and their feeds, how to get, where and the cost

--- Cluster 0 (size=11062) ---
- which vacines do we give to day old chicks and hw often do i vacinate them?
- #I have vaccinated my indigenous chicken against newcastle diseases but all over asudden, they are dying one by one,wat could be the problem?
- -i want to chicken vaccination medicines and what they prevent
- I request to know which vaccines I give to my kienyeji chicken
- wot is th best vaccine to a one day old chick

--- Cluster 7 (size=7428) ---
- -How long do broilers take to be slaugthered/ ready for meat?
- How Long Can A Small Chick Take In Order To Be A Layer Or A Broiler?
- what do you prefer to give to 

In [12]:
## Runs HDBSCAN again on the noise cluster to extract more clusters
# Adjust paramaters for smaller subset of data
hdbscan_params_noise = {
    "min_cluster_size": 200,
    "min_samples": 10,
    "metric": "euclidean"
}

chicken_noise_labels_shifted, chicken_noise_clusterer = recluster_noise(
    umap_embeddings=chicken_umap,
    labels=chicken_df["cluster"].to_numpy(),
    hdbscan_params=hdbscan_params_noise
)

Reclustered noise points: 70059
New noise ratio: 49.05%
Clusters found in noise: 57


In [13]:
## Preview new clusters obtained from noise cluster for quick quality check

# Select only the original noise rows
chicken_noise_mask = chicken_df['cluster'] == -1
chicken_noise_df = chicken_df.loc[chicken_noise_mask, ['Q_basic_clean']].copy()

# Assign the new noise cluster labels (length matches noise_df)
chicken_noise_df['new_cluster'] = chicken_noise_labels_shifted

print_cluster_examples(chicken_noise_df, text_column='Q_basic_clean', cluster_column='new_cluster',examples_per_cluster=5)


--- Cluster 105 (size=3973) ---
- can l get the contact of the chick farmer
- Wefarm.A farmer what to know the price of one chick each
- HOW CAN I TREAT MY CHICKS FROM DAY ONE TO THREE MONTHS?
- how can i keep my chick healthy
- how can i keep my chicks healthy?

--- Cluster 108 (size=3071) ---
- HOW WILL I TAKE CARE OF MY CHICKEN
- what are disadvantages of brooting a chicken at a high temperature and low humitity
- AS ME ABOUT CHICKENS
- How can I keep local chicken in big numbers
- #Q what do you mean by a chicken contracted farmer,

--- Cluster 106 (size=2975) ---
- Which breed of chicken is the best between broilers ,layers and kienyeji?
- I would like to keep kenbruw chicken were to get them here in Nakuru?
- Iwant to start poultry which one is best between layers,broiler and kienyeji
- Hello wefarm,I want to start farming chicken(kienyenji),how many chicks should I start with?
- UESTION,5:My kienyeji kuku can not lay eggs anymore they want to hatch on there eggs, which by now I

In [None]:
# Integrates new noise cluster labels back into the main dataframe
chicken_df.loc[chicken_df["cluster"] == -1, "cluster"] = chicken_noise_labels_shifted

In [42]:
chicken_df

Unnamed: 0,question_id,question_user_id,question_language,question_content,question_topic,question_sent,response_id,response_user_id,response_language,response_content,...,response_user_status,response_user_country_code,response_user_gender,response_user_dob,response_user_created_at,Q_basic_clean,embedding,cluster,meta_label,meta_label_title
0,4107104,574194,eng,Q How can i start with one hen which is a laye...,chicken,2017-12-08 19:04:04+00,4215434,574940,eng,Q1 they should answer as hurry as possible.,...,zombie,ug,,,2017-12-08 18:33:04+00,How can i start with one hen which is a layer ...,"[-0.003550073, -0.0744741, -0.031413864, -0.04...",-1,-1.0,Uncategorized
1,4107163,421585,eng,QHOW MUCH WILL I SALE CHICKS OF ONE MONTH,chicken,2017-12-08 19:09:12+00,4111387,336159,eng,Q20 sell for 300 each,...,zombie,ke,,,2017-09-07 06:43:38+00,HOW MUCH WILL I SALE CHICKS OF ONE MONTH,"[0.035664577, -0.0932317, -0.050379522, 0.0219...",68,1.0,Chick Care & Raising
2,4107198,574940,eng,"Q my hens are ready 2 lay,how many eggs shoul...",chicken,2017-12-08 19:13:47+00,4107991,477842,eng,Q8 12 Eggs,...,destroyed,ug,,,2017-10-30 18:08:58+00,"my hens are ready 2 lay,how many eggs should i...","[0.06327512, -0.09324382, -0.024945298, -0.043...",40,6.0,Eggs & Reproduction
3,4107224,361345,eng,where can l get kienyeji chick,chicken,2017-12-08 19:17:12+00,4107935,104186,eng,Q72 WESTERN,...,zombie,ke,male,1984-06-02,2016-10-06 04:55:22+00,where can l get kienyeji chick,"[-0.061376136, -0.08051571, 0.0023512514, -0.0...",67,1.0,Chick Care & Raising
4,4107239,574940,eng,Q What enables a hen 2 lay an egg wiz 2 yorks?,chicken,2017-12-08 19:18:57+00,4110884,470960,eng,Q15 Two Yolks Released In Oviduct,...,destroyed,ug,,,2017-10-25 17:51:08+00,What enables a hen 2 lay an egg wiz 2 yorks?,"[0.026525354, -0.042083185, -0.019407695, 0.04...",132,6.0,Eggs & Reproduction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800702,59086804,2482785,eng,q how many grams of growers mash and layers ma...,chicken,2021-10-14 04:08:37.686431+00,59086823,1124772,eng,q1341.50grams,...,live,ke,,,2018-08-23 18:09:02.308435+00,how many grams of growers mash and layers mash...,"[-0.025691262, -0.058828868, -0.019506676, -0....",-1,-1.0,Uncategorized
800705,59086916,3372507,eng,"Q has asked how to make broiler feed,i got the...",chicken,2021-10-14 06:38:58.383532+00,59086919,944466,eng,Q1325..am green about rationing,...,live,ke,,,2018-06-27 14:44:12.699012+00,"has asked how to make broiler feed,i got the c...","[-0.007124627, -0.10409252, -0.07471126, -0.00...",7,8.0,"Business, Markets, & Starting Poultry Projects"
800706,59086999,3509967,eng,Q what is the best tool can i use in cutting h...,chicken,2021-10-14 08:05:01.29044+00,59087003,2495321,eng,Q 343 u put the fire on only the lower beak,...,live,ke,,2005-05-12,2019-09-18 10:14:13.098818+00,what is the best tool can i use in cutting hen...,"[-0.024665521, 0.0027263055, 0.011872305, -0.0...",93,7.0,Housing & Equipment
800707,59087033,812660,eng,Which medicine can be administerd to chicks th...,chicken,2021-10-14 08:37:57.402273+00,59087910,3562750,eng,Q35 i dont know,...,live,ke,,,2021-02-08 05:48:50.697456+00,Which medicine can be administerd to chicks th...,"[0.03021913, -0.05749866, -0.008422105, 0.0237...",26,3.0,Pests & Disease


In [41]:
chicken_summary = summarize_clusters(chicken_df, 
                        text_col='Q_basic_clean', 
                        cluster_col='cluster', 
                        top_n_words=5, 
                        meta_col='meta_label',
                        sample_questions=5, 
                        random_samples=True, 
                        preview=True,
                        extra_stop_words=('chicken', 'chickens', 'hens', 'hen', 'chiken'))


=== Cluster -1 (size=34364) ===
Keywords: eggs, chicks, lay, chick, asks
Meta: -1.0
Sample questions:
  - How much money should i use to buy 100 hens
  - why Artificial Chickens Dont Atch Yet they mate bt those natural chickens atch and th same tym they mate
  - ,I need 20 layers(rainbow roosters) breed, where can I get and the cost?
  - Dickson asks: how can care for 1day chicken and what feeds Reply Q319 followed by your response. optout stop 6333
  - What Is The Meaning Of Broiler Mash

=== Cluster 0 (size=11062) ===
Keywords: vaccine, chicks, vaccinate, vaccination, vaccines
Meta: 1.0
Sample questions:
  - for one day old chicks what is the best vaccination used in starting?
  - what is the vaccination program of chicks from arching to big hens
  - which vaccine do we give chicken suffering from coccidiosis
  - APPROPRIATE/BEST VACCINES TO CONTROL DISEASES AGAINS CHICKEN.
  - My hens their droppings contain blood wat disease could and wat vaccine should I use in my farm

=== Clust

In [104]:
## ChatGPT meta cluster assignment 
cluster_to_meta = {
    **dict.fromkeys([0, 2, 36, 41, 43, 49, 59, 66, 67, 68, 70, 85, 105, 108, 118, 120], 1),  # Chick Care & Raising
    **dict.fromkeys([1, 8, 20, 21, 37, 40, 57, 58, 61, 62, 69, 78, 80, 81, 86, 88, 89, 91, 94, 98, 125], 2),  # Nutrition & Feeding
    **dict.fromkeys([3, 4, 5, 6, 9, 10, 11, 12, 13, 26, 30, 31, 33, 34, 47, 50, 51, 79, 84, 87, 92, 95, 100, 101, 102, 103, 109, 110, 111, 112, 114, 115, 116], 3),  # Pests & Disease
    **dict.fromkeys([23, 24, 25, 27, 28, 29, 32, 35, 44, 77], 4),  # Adult Chicken Health & Behavior
    **dict.fromkeys([15, 22, 42, 46, 53, 74, 75, 96, 97, 130, 104], 5),  # Breeds & Genetics
    **dict.fromkeys([16, 17, 38, 40, 52, 54, 63, 64, 65, 71, 72, 73, 76, 83, 99, 107, 113, 122, 124, 128, 129, 131, 132, 133], 6),  # Eggs & Reproduction
    **dict.fromkeys([39, 90, 93], 7),  # Housing & Equipment
    **dict.fromkeys([7, 14, 45, 48, 55, 56, 60, 106, 121, 123, 126, 127], 8),  # Business & Poultry Projects
    **dict.fromkeys([18, 19, 117], 9),  # Seasonal & Environmental Effects
    **dict.fromkeys([82, 119], -1),  # Uncategorized
}
chicken_df['meta_label'] = chicken_df['cluster'].map(cluster_to_meta)

In [105]:
meta_titles = {
    -1: "Uncategorized",
    1: "Chick Care & Raising",
    2: "Nutrition & Feeding",
    3: "Pests & Disease",
    4: "Adult Chicken Health & Behavior",
    5: "Breeds & Genetics",
    6: "Eggs & Reproduction",
    7: "Housing & Equipment",
    8: "Business, Markets, & Starting Poultry Projects",
    9: "Seasonal & Environmental Effects"
}
chicken_df.loc[chicken_df['cluster'] == -1, 'meta_label'] = -1
chicken_df['meta_label_title'] = chicken_df['meta_label'].map(meta_titles)


In [108]:
## Save final chicken files before visualizing
save_topic_files(data_dir = data_dir, 
                 df = chicken_df,
                 umap_embedding = chicken_umap,
                 clusterer = chicken_clusterer, 
                 topic = 'chicken')

Saved chicken_hdbscan_model.pkl
Saved chicken_umap_embedding.pkl
Saved chicken_clustered_df.parquet


In [107]:
chicken_df['meta_label_title'].value_counts()

meta_label_title
Chick Care & Raising                              48791
Pests & Disease                                   44513
Uncategorized                                     34902
Eggs & Reproduction                               33351
Nutrition & Feeding                               30683
Business, Markets, & Starting Poultry Projects    21647
Adult Chicken Health & Behavior                   16908
Breeds & Genetics                                 13232
Housing & Equipment                                4710
Seasonal & Environmental Effects                   2423
Name: count, dtype: int64

In [None]:
# Make sure to rerun summarize_clusters to access updated meta labels
metacluster_preview(chicken_summary, 
                    metacluster_num=1, 
                    meta_titles=meta_titles)


Previewing Meta-cluster 1: Chick Care & Raising

=== Cluster 0 (size=11062) ===
Keywords: vaccine, chicks, vaccinate, vaccination, vaccines
Sample questions:
  - for one day old chicks what is the best vaccination used in starting?
  - what is the vaccination program of chicks from arching to big hens
  - which vaccine do we give chicken suffering from coccidiosis
  - APPROPRIATE/BEST VACCINES TO CONTROL DISEASES AGAINS CHICKEN.
  - My hens their droppings contain blood wat disease could and wat vaccine should I use in my farm

=== Cluster 2 (size=5247) ===
Keywords: chiks, chics, best, start, check
Sample questions:
  - WHERE CAN I GET YOUR CHIKEN IN TAITA TAVETA COUNTY
  - what would i feed my chiken to have egs shells hard?
  - which medicine can i use to treat my 1 month old chic has fever spot on their mounth n legs
  - How do i care chibks
  - WHAT ARE CHIKEN DISEASES? SYIM PTOMS AND TREATMEMT

=== Cluster 36 (size=1377) ===
Keywords: rear, rearing, start, want, best
Sample quest

In [106]:
chicken_df.loc[chicken_df['meta_label'].isna(), 'cluster'].value_counts()

Series([], Name: count, dtype: int64)

In [109]:
umap_2d = umap.UMAP(
    n_components=2,
    n_neighbors=50,
    min_dist=0.1,
    metric="cosine",
    random_state=42
)

embedding_2d_chicken = umap_2d.fit_transform(list(chicken_df['embedding']))

quick_save_file(data_dir, "chicken_umap_2d_embedding.npy", embedding_2d_chicken)

Saved chicken_umap_2d_embedding.npy


### "Maize" questions analysis


In [68]:
maize_df, maize_umap, maize_clusterer = cluster_with_umap_hdbscan(
    df=questions[questions['question_topic'] == 'maize'],
    sample_size=1_000_000,
    umap_params={
        "n_neighbors": 30,
        "n_components": 5,
        "metric": "cosine",
        "random_state": 42
    },
    hdbscan_params={
        "min_cluster_size": 500,
        "min_samples": 10,
        "metric": "euclidean"
    }
)

Finished clustering in 401.9 seconds
Noise ratio: 29.11%
Clusters found: 83
Approx. silhouette score (excluding noise): 0.572


In [69]:
maize_summary = summarize_clusters(maize_df, 
                        text_col='Q_basic_clean', 
                        cluster_col='cluster', 
                        top_n_words=5, 
                        #meta_col='meta_label',
                        sample_questions=5, 
                        random_samples=True, 
                        preview=True,
                        extra_stop_words=('maize', 'corn', 'maiz', 'meize'))


=== Cluster -1 (size=53569) ===
Keywords: best, plant, type, planting, good
Sample questions:
  - WHEN TO PUT FARTALIZER TO MY MAIZE PLANTS AND WHICH ONE?
  - which type of a maize seed produces the best in a semi arid area
  - How many kilograms of cotton seed should add in 50kg of maize bland
  - which type of maize grows well in the swamp?
  - Which problems associated witth Delays in land preparation and planting of maize.

=== Cluster 0 (size=5997) ===
Keywords: worms, army, worm, control, armyworm
Sample questions:
  - UE:What is the best insecticide to kill armyworms on a maize farm?
  - my maize attack by some worms which medicine cant i apply?
  - how can fowl worms be controlled in maize plantations?
  - how can I control army worms from my maize?
  - Which is the best pesticide to control arm worm in maize

=== Cluster 1 (size=728) ===
Keywords: rats, monkeys, control, monkey, destroying
Sample questions:
  - what can I do to prevent monkeys from destroying my young maize w

In [70]:
## Runs HDBSCAN again on the noise cluster to extract more clusters
# Adjust paramaters for smaller subset of data
hdbscan_params_noise = {
    "min_cluster_size": 200,
    "min_samples": 10,
    "metric": "euclidean"
}

maize_noise_labels_shifted, maize_noise_clusterer = recluster_noise(
    umap_embeddings=maize_umap,
    labels=maize_df["cluster"].to_numpy(),
    hdbscan_params=hdbscan_params_noise
)

Reclustered noise points: 53569
New noise ratio: 52.89%
Clusters found in noise: 58


In [71]:
## Preview new clusters obtained from noise cluster for quick quality check

# Select only the original noise rows
maize_noise_mask = maize_df['cluster'] == -1
maize_noise_df = maize_df.loc[maize_noise_mask, ['Q_basic_clean']].copy()

# Assign the new noise cluster labels (length matches noise_df)
maize_noise_df['new_cluster'] = maize_noise_labels_shifted

print_cluster_examples(maize_noise_df, text_column='Q_basic_clean', cluster_column='new_cluster',examples_per_cluster=5)


--- Cluster 133 (size=1388) ---
- Which is the best maize crop to plant?
- where can i find gud maize seeds 4 planting.
- Which type of maize is good for farming?
- what alternative variety of maize can i use if do not get H 520
- #For planting maize, and saving the money then i buy the maize to the farmer when they havest which is the best?

--- Cluster 134 (size=1370) ---
- which maize variety can be grown in january
- when ti plant maize?
- when should one stop planting beans&maize at this season?
- I am in Endebess Kitale can i plant maize or to wait upto other days
- WHAT IS IMPORTANT OF SUNFLOWER TO MAIZE,WHEN PLANTED TOGETHER?

--- Cluster 90 (size=1154) ---
- how many times do farmes has to apply sulphate to maize
- ,how long is sulphate applied to maize after it has matured
- ,at what stage must a farmes apply sulphate to maize
- is it right to topdress maize with NPK?
- # what are the positives and negatives of using lime in planting maize

--- Cluster 112 (size=962) ---
- w

In [72]:
# Integrates new noise cluster labels back into the main dataframe
maize_df.loc[maize_df["cluster"] == -1, "cluster"] = maize_noise_labels_shifted

In [88]:
maize_summary = summarize_clusters(maize_df, 
                        text_col='Q_basic_clean', 
                        cluster_col='cluster', 
                        top_n_words=5, 
                        meta_col='meta_label',
                        sample_questions=5, 
                        random_samples=True, 
                        preview=True,
                        extra_stop_words=('maize', 'corn', 'maiz', 'meize'))


=== Cluster -1 (size=28332) ===
Keywords: plant, best, type, planting, use
Meta: -1.0
Sample questions:
  - Am gerald from iganga how much is kilogram of maize in mbale
  - Why Kamwegye Most Farmer Are Growing Maize
  - HOW much money did you maize cost each.
  - How many cups of maize hasks(chachu) is apig supposed to take per meal?
  - I mean will i be allawed tm borrow maize seeds then i pay slowlly? pollitely

=== Cluster 0 (size=5997) ===
Keywords: worms, army, worm, control, armyworm
Meta: 1.0
Sample questions:
  - UE:What is the best insecticide to kill armyworms on a maize farm?
  - my maize attack by some worms which medicine cant i apply?
  - how can fowl worms be controlled in maize plantations?
  - how can I control army worms from my maize?
  - Which is the best pesticide to control arm worm in maize

=== Cluster 1 (size=728) ===
Keywords: rats, monkeys, control, monkey, destroying
Meta: 1.0
Sample questions:
  - what can I do to prevent monkeys from destroying my young m

In [82]:
# Formatted meta cluster assignment by ChatGPT-- check for missing assignments
cluster_to_meta_maize = {
    **dict.fromkeys([0, 1, 2, 15, 22, 26, 27, 29, 42, 47, 48, 50, 52, 53, 58, 83, 85, 87, 93, 97, 98, 107, 113, 114, 115, 116, 117, 122, 128], 1),  # Pests & Disease
    **dict.fromkeys([3, 4, 6, 7, 8, 10, 13, 14, 16, 17, 18, 19, 20, 51, 54, 55, 56, 77, 79, 84, 88, 90, 103, 104, 118, 120, 123, 127, 129, 134, 136, 138, 139], 2),  # Fertilizer, Soil, & Planting Practices
    **dict.fromkeys([12, 21, 23, 24, 25, 38, 39, 40, 41, 43, 46, 60, 65, 68, 69, 70, 71, 76, 78, 80, 81, 82, 91, 94, 95, 99, 100, 101, 102, 105, 106, 121, 124, 126, 133], 3),  # Seed Varieties & Regional Adaptation
    **dict.fromkeys([31, 33, 59, 63, 64, 66, 67, 72, 74, 75, 125, 131, 132, 135], 4),  # Yield & Farm Output
    **dict.fromkeys([9, 11, 28, 30, 32, 34, 35, 36, 37, 92, 96, 108, 109, 110, 111, 112, 119, 137], 5),  # Market & Price Information
    **dict.fromkeys([44, 45, 86, 89, 130], 6),  # Animal Feed / Alternative Uses
    **dict.fromkeys([49, 57, 61, 62, 140], 7),  # Wefarm / Miscellaneous
    **dict.fromkeys([5, 73], -1),  # Uncategorized / Noise
}
maize_df['meta_label'] = maize_df['cluster'].map(cluster_to_meta_maize)


In [91]:
meta_titles_maize = {
    -1: "Uncategorized / Noise",
    1: "Pests & Disease",                  # armyworms, caterpillars, weevils, rats/monkeys, smut, yellowing
    2: "Fertilizer, Soil, & Planting Practices",  # soil types, manure, fertilizers, DAP/NPK, spacing, seeds per hole, top dressing
    3: "Seed Varieties & Regional Adaptation",    # general seeds, hybrids, region-specific/adaptation, climate
    4: "Yield & Farm Output",              # expected harvest per acre, bags produced, kgs per acre
    5: "Market & Price Information",       # maize price per kg/bag, selling markets, economic considerations
    6: "Animal Feed / Alternative Uses",   # feeding livestock, maize byproducts
    7: "Wefarm Platform / Miscellaneous"   # platform questions, unclear/miscellaneous queries
}
maize_df.loc[maize_df['cluster'] == -1, 'meta_label'] = -1
maize_df['meta_label_title'] = maize_df['meta_label'].map(meta_titles_maize)

In [92]:
# Check for missing assignments in compact list
maize_df.loc[maize_df['meta_label_title'].isna(), 'cluster'].value_counts()

Series([], Name: count, dtype: int64)

In [95]:
## Save final maize files before visualizing
save_topic_files(data_dir = data_dir, 
                 df = maize_df,
                 umap_embedding = maize_umap,
                 clusterer = maize_clusterer, 
                 topic = 'maize')

Saved maize_hdbscan_model.pkl
Saved maize_umap_embedding.pkl
Saved maize_clustered_df.parquet


In [93]:
maize_df['meta_label_title'].value_counts()

meta_label_title
Fertilizer, Soil, & Planting Practices    39205
Pests & Disease                           33916
Seed Varieties & Regional Adaptation      33155
Uncategorized / Noise                     32416
Market & Price Information                24726
Yield & Farm Output                       14431
Wefarm Platform / Miscellaneous            3610
Animal Feed / Alternative Uses             2570
Name: count, dtype: int64

In [99]:
# Make sure to rerun summarize_clusters to access updated meta labels
metacluster_preview(maize_summary, 
                    metacluster_num=5, 
                    meta_titles=meta_titles_maize)

Previewing Meta-cluster 5: Market & Price Information

=== Cluster 9 (size=696) ===
Keywords: kampala, price, kg, current, da
Sample questions:
  - What is the cost of maize in kampala and kamuli
  - E the price of maize in kampala
  - A farmer asks: What is the price of maize in Kampala?
  - How Much Is New Maize In Kampala?
  - market 4 new maize in kampala

=== Cluster 11 (size=1244) ===
Keywords: uganda, price, types, western, current
Sample questions:
  - Which maize seeds can do well in Western Uganda--A4001 Plus
  - Joan asks: asking price for maize in uganda Reply Q84
  - Is It True That Uganda Produce More Maize Compared To Other East Africa Countries
  - How Do Maize Help In Food Staff Of Uganda
  - which type of maize is of good quality in uganda?

=== Cluster 28 (size=955) ===
Keywords: akg, akilo, akilogram, hw, price
Sample questions:
  - how much does akg of maize per now cost ,my maize needs spraing which fatilaize shuold iuse . and the inscent are eating adevience me a

In [110]:
umap_2d = umap.UMAP(
    n_components=2,
    n_neighbors=50,
    min_dist=0.1,
    metric="cosine",
    random_state=42
)

embedding_2d_maize = umap_2d.fit_transform(list(maize_df['embedding']))

quick_save_file(data_dir, "maize_umap_2d_embedding.npy", embedding_2d_maize)

Saved maize_umap_2d_embedding.npy


In [None]:
# Save question clusters with 2D embeddings locally for merging with full dataset later
# save_question_clusters(df = chicken_df,
#                        embedding_2d = embedding_2d_chicken,
#                        topic = "chicken")


# save_question_clusters(df = maize_df,
#                        embedding_2d = embedding_2d_maize,
#                        topic = "maize")

Saved ../data/question_clusters_chicken.parquet (251160 rows)
Saved ../data/question_clusters_maize.parquet (184029 rows)
