# Predicting question topic from weather forecasts

This is the cleaned up code used to train a question topic prediction model. The first cell loads the data; the filepath and file names should be updated as needed.

The data in the original analysis and training was loaded without compression. If the datasets are too large, you may need to do additional steps here to load it all.

In [2]:
#------------------#
# Loading the data #
#------------------#

import os
import numpy as np
import pandas as pd

# Replace this string with the filepath to the datasets on your machine!
filepath = r'C:\\Users\\quinn\\Downloads\\'

# If for some reason the datasets have been renamed, update the following strings (We Farm dataset, Uganda weather, Tanzania weather, and Kenya weather,
# respectively)
topics = 'DataKit_Fall_2025.csv'
uganda = 'UGA_weather_data.csv'
tanzania = 'TZA_weather_data.csv'
kenya = 'KEN_weather_data.csv'

# Load the datasets
df_topics = pd.read_csv(filepath+topics)
df_ug_w = pd.read_csv(filepath+uganda)
df_tz_w = pd.read_csv(filepath+tanzania)
df_ke_w = pd.read_csv(filepath+kenya)

---

In [4]:
#------------------------------------#
# Set up the dataframe for the model #
#------------------------------------#

# Some questions have multiple rows; remove the duplicates
df_topics = df_topics.drop_duplicates(subset=['question_content'])

# Convert the timestamps to YYYY-MM format
df_topics['question_sent'] = pd.to_datetime(df_topics['question_sent'],format='mixed').dt.strftime("%Y-%m")


# We only need to keep the question asker's topic, country code, and timestamp columns from the We Farm dataset
# Drop the unneeded columns and remove any remaining rows with NA values
keep_columns = ['question_topic','question_user_country_code','question_sent']
df_topics = df_topics[keep_columns].dropna()

# Now, split up country-specific dataframes
df_ug_topics = df_topics[df_topics.question_user_country_code == 'ug']
df_tz_topics = df_topics[df_topics.question_user_country_code == 'tz']
df_ke_topics = df_topics[df_topics.question_user_country_code == 'ke']

# Merge the question topics with the weather data, matching on the year and month of the data
df_ug_merged = pd.merge(df_ug_topics,df_ug_w,how='inner',left_on='question_sent',right_on='index')
df_tz_merged = pd.merge(df_tz_topics,df_tz_w,how='inner',left_on='question_sent',right_on='index')
df_ke_merged = pd.merge(df_ke_topics,df_ke_w,how='inner',left_on='question_sent',right_on='index')

# Put all of our rows together into one dataframe again
df_model = pd.concat([df_ug_merged,df_tz_merged,df_ke_merged])

# Retain only the columns used in the model and interpolate missing weather data
features = ['avg_max_temp','precipitation','relative_humidity','avg_min_temp']
labels = ['question_topic']
df_model = df_model[features+labels].interpolate(method='linear')

# Lastly, retain only the question topics that will be used in the model
topic_list = ['bean', 'cattle', 'chicken', 'coffee', 'goat', 'maize', 'pig', 'potato', 'poultry', 'rabbit', 'tomato']
df_model = df_model[df_model['question_topic'].isin(topic_list)]

  df_model = df_model[features+labels].interpolate(method='linear')


---

In [6]:
#--------------------#
# Training the model #
#--------------------#

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


X = df_model[features]
y = df_model['question_topic']


# Scale the data
col_x = ColumnTransformer([
    ], 
    remainder=StandardScaler())

col_x.fit(X);
X_transformed = col_x.transform(X[features])

# Seed specified for the sake of reproducability
X_train, X_test, y_train, y_test = train_test_split(X[features], y, test_size=0.2, random_state=42)


# 'balanced' class weighting due to the imbalance of classes
rf_pipe = Pipeline([('xform', col_x),
                    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced'))])

# Parameters for the model (determined based on previous grid searches):
parameters = {
    'rf__n_estimators': [100],
    'rf__max_features': [None],
    'rf__max_depth': [9],
    'rf__min_samples_split': [2],
    'rf__min_samples_leaf': [1]
}

# This model will be saved as rf_gs2:
rf_gs = GridSearchCV(rf_pipe, 
                     cv=5, 
                     param_grid=parameters,
                    )

rf_gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('xform',
                                        ColumnTransformer(remainder=StandardScaler(),
                                                          transformers=[])),
                                       ('rf',
                                        RandomForestClassifier(class_weight='balanced',
                                                               random_state=42))]),
             param_grid={'rf__max_depth': [9], 'rf__max_features': [None],
                         'rf__min_samples_leaf': [1],
                         'rf__min_samples_split': [2],
                         'rf__n_estimators': [100]})

In [7]:
#-------------------#
# Model performance #
#-------------------#

from sklearn.metrics import f1_score

y_pred = rf_gs.predict(X_test)

# Due to the imbalanced classes, just looking at F1 score here
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-score: {f1:.2f}")

F1-score: 0.19


---

In [9]:
#------------------#
# Export the model #
#------------------#

import pickle

# Model to be pickled
model = rf_gs

# Pickling the model
with open('QTOPICmodel.pkl', 'wb') as f:
    pickle.dump(model, f)