In [34]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import joblib
import duckdb
import sys
from pathlib import Path

# Make the project's local modules importable. The path is relative to the
# notebook's working directory. The membership check keeps sys.path from
# accumulating a duplicate entry every time this cell is re-run.
src_dir = '../src'
if src_dir not in sys.path:
    sys.path.append(src_dir)
import clustering_analysis
import processing_and_visualization

# Reload the local modules so edits made to them after the kernel started
# are picked up without a restart.
import importlib
importlib.reload(clustering_analysis)
importlib.reload(processing_and_visualization)

# Import the helpers only after the reload so the names bind to the fresh code.
from clustering_analysis import print_cluster_examples, summarize_clusters, metacluster_preview
from processing_and_visualization import load_clustered_questions

# Data locations come from the environment (load_dotenv() is called first so
# values from a .env file, if present, are available). os.getenv returns None
# for a variable that is not set.
load_dotenv()
csv_path = os.getenv("DATA_CSV")
parquet_path = os.getenv("DATA_PARQUET")

## Final Analysis of Question Clustering
This notebook reconstructs the clustered question datasets for the *unlabeled*, *chicken*, and *maize* subsets using the provided cluster label files (<20 MB) and the original full dataset. It also includes tools for inspecting and visualizing the cluster structure.

Together, these pieces allow users to explore the final clustering results without rerunning the full UMAP + HDBSCAN pipeline.

**Note:** Cluster labels are specific to each subset and should not be mixed across datasets.

In [40]:
# Rebuild one clustered-question dataframe per subset by passing the full
# dataset path and that subset's cluster-label file to load_clustered_questions.
cluster_dir = Path('../data')
unlabeled_path = cluster_dir / 'question_clusters_unlabeled.parquet'
chicken_path = cluster_dir / 'question_clusters_chicken.parquet'
maize_path = cluster_dir / 'question_clusters_maize.parquet'

# The unlabeled subset is loaded with topic=None; the other two pass their topic name.
unlabeled_df = load_clustered_questions(parquet_path, unlabeled_path, topic=None)
chicken_df = load_clustered_questions(parquet_path, chicken_path, topic="chicken")
maize_df = load_clustered_questions(parquet_path, maize_path, topic="maize")

In [None]:
import warnings

# Map meta_label integers to descriptive titles.
# Each subset has its own dictionary; the same integer means different things
# in different subsets (e.g. 1 is "Soil & Fertilizer" for unlabeled but
# "Pests & Disease" for maize).
unlabeled_meta_titles = {
    -1: "Uncategorized",
    1: "Soil & Fertilizer",
    2: "Pests & Disease",
    3: "Animal Husbandry",
    4: "Planting & Growth",
    5: "Markets",
    6: "Finance & Loans",
    7: "Farming Equipment & Materials",
    8: "Weather & Environment",
    9: "Wefarm Platform",
    10: "Personal Communication",
}

chicken_meta_titles = {
    -1: "Uncategorized",
    1: "Chick Care & Raising",
    2: "Nutrition & Feeding",
    3: "Pests & Disease",
    4: "Adult Chicken Health & Behavior",
    5: "Breeds & Genetics",
    6: "Eggs & Reproduction",
    7: "Housing & Equipment",
    8: "Business, Markets, & Starting Poultry Projects",
    9: "Seasonal & Environmental Effects",
}

maize_meta_titles = {
    -1: "Uncategorized / Noise",
    1: "Pests & Disease",
    2: "Fertilizer, Soil, & Planting Practices",
    3: "Seed Varieties & Regional Adaptation",
    4: "Yield & Farm Output",
    5: "Market & Price Information",
    6: "Animal Feed / Alternative Uses",
    7: "Wefarm Platform / Miscellaneous",
}

# Add a `meta_label_titles` column to each dataframe using its own dictionary.
for subset_name, subset_df, subset_titles in (
    ("unlabeled", unlabeled_df, unlabeled_meta_titles),
    ("chicken", chicken_df, chicken_meta_titles),
    ("maize", maize_df, maize_meta_titles),
):
    # Series.map yields NaN for any value that is not a key of the dict, and
    # value_counts() drops NaN by default, so a label without a title would
    # silently vanish from the summaries. Warn about such labels instead.
    unmapped = set(subset_df['meta_label'].dropna().unique()) - set(subset_titles)
    if unmapped:
        warnings.warn(
            f"{subset_name}: no title defined for meta_label values "
            f"{sorted(unmapped, key=str)}"
        )
    subset_df['meta_label_titles'] = subset_df['meta_label'].map(subset_titles)

In [45]:
# Number of maize rows per meta-cluster title, most frequent first.
# Rows whose title is NaN are not counted (value_counts drops NaN by default).
maize_df['meta_label_titles'].value_counts()

meta_label_titles
Fertilizer, Soil, & Planting Practices    39205
Pests & Disease                           33916
Seed Varieties & Regional Adaptation      33155
Uncategorized / Noise                     32416
Market & Price Information                24726
Yield & Farm Output                       14431
Wefarm Platform / Miscellaneous            3610
Animal Feed / Alternative Uses             2570
Name: count, dtype: int64

In [None]:
meta_label_title
Chick Care & Raising                              48791
Pests & Disease                                   44513
Uncategorized                                     34902
Eggs & Reproduction                               33351
Nutrition & Feeding                               30683
Business, Markets, & Starting Poultry Projects    21647
Adult Chicken Health & Behavior                   16908
Breeds & Genetics                                 13232
Housing & Equipment                                4710
Seasonal & Environmental Effects                   2423
Name: count, dtype: int64