In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, pearsonr
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

In [2]:
df = pd.read_csv('/Users/ibrahimyucel/Downloads/data-kind/data-sources/UGA/UGA_proper_for_analyzing.csv')

print("="*80)
print("DATASET OVERVIEW")
print("="*80)
print(f"Total rows: {len(df):,}")
print(f"Total columns: {len(df.columns)}")
print(f"\nDate range: {df['question_year'].min()}-{df['question_month'].min():02d} to {df['question_year'].max()}-{df['question_month'].max():02d}")
print(f"\nColumns:")
for col in df.columns:
    print(f"  - {col}")

# Basic info
print("\n" + "="*80)
print("DATA TYPES AND MISSING VALUES")
print("="*80)
print(df.info())

DATASET OVERVIEW
Total rows: 1,268,716
Total columns: 20

Date range: 2017-01 to 2021-12

Columns:
  - question_id
  - question_content
  - question_topic
  - response_content
  - response_topic
  - question_year
  - question_month
  - question_day
  - question_hour
  - response_year
  - response_month
  - response_day
  - response_hour
  - question_year_month
  - avg_max_temp
  - precipitation
  - relative_humidity
  - avg_min_temp
  - avg_rx1day
  - avg_days_r20mm

DATA TYPES AND MISSING VALUES
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1268716 entries, 0 to 1268715
Data columns (total 20 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   question_id          1268716 non-null  int64  
 1   question_content     1268716 non-null  object 
 2   question_topic       1268716 non-null  object 
 3   response_content     1268716 non-null  object 
 4   response_topic       1268716 non-null  object 
 5   question_year   

In [3]:
def get_uganda_season(month):
    if month in [12, 1, 2]:
        return 'short dry'
    elif month in [3, 4, 5]:
        return 'long rains'
    elif month in [6, 7, 8]:
        return 'long dry'
    elif month in [9, 10, 11]:
        return 'short rains'
    else:
        return 'unknown'

df["season"] = df["question_month"].apply(get_uganda_season)

In [4]:
df["period_of_day"] = df["question_hour"].apply(
    lambda x: (
        "night" if (22 <= x <= 23 or 0 <= x <= 5)
        else "morning" if 6 <= x <= 10
        else "afternoon" if 11 <= x <= 16
        else "evening"
    )
)

In [5]:
print("\n" + "="*80)
print("WEATHER DATA STRUCTURE ANALYSIS")
print("="*80)

# Verify that weather values are constant within year-month
weather_cols = ['avg_max_temp', 'precipitation', 'relative_humidity', 
                'avg_min_temp', 'avg_rx1day', 'avg_days_r20mm']

# Group by year-month and check uniqueness
weather_monthly = df.groupby(['question_year', 'question_month'])[weather_cols].agg(['nunique', 'mean', 'std'])
print("\nNumber of unique values per year-month for each weather variable:")
print(weather_monthly['avg_max_temp']['nunique'].describe())

# Show sample of weather data by year-month
monthly_weather = df.groupby(['question_year', 'question_month'])[weather_cols].first().reset_index()
monthly_weather['year_month'] = monthly_weather['question_year'].astype(str) + '-' + monthly_weather['question_month'].astype(str).str.zfill(2)
print(f"\nUnique year-month periods: {monthly_weather['year_month'].nunique()}")
print("\nSample of monthly weather data:")
print(monthly_weather.head(10))

# Create master monthly weather dataset
df_monthly_weather = monthly_weather.copy()
print(f"\nCreated monthly weather dataset with {len(df_monthly_weather)} rows")


WEATHER DATA STRUCTURE ANALYSIS

Number of unique values per year-month for each weather variable:
count    47.0
mean      1.0
std       0.0
min       1.0
25%       1.0
50%       1.0
75%       1.0
max       1.0
Name: nunique, dtype: float64

Unique year-month periods: 47

Sample of monthly weather data:
   question_year  question_month  avg_max_temp  precipitation  \
0           2017              11         27.44         147.35   
1           2017              12         29.82          45.54   
2           2018               1         29.77          39.79   
3           2018               2         31.37          57.41   
4           2018               3         27.60         184.87   
5           2018               4         26.26         285.07   
6           2018               5         26.68         193.38   
7           2018               6         26.53         118.20   
8           2018               7         27.03          62.66   
9           2018               8         26.

In [6]:
print("\n" + "="*80)
print("QUESTION TOPICS DISTRIBUTION")
print("="*80)

# Extract and count topics
from ast import literal_eval

def safe_eval(x):
    try:
        return literal_eval(x) if isinstance(x, str) else x
    except:
        return []

df['topics_list'] = df['question_topic'].apply(safe_eval)

# Get all individual topics
all_topics = []
for topics in df['topics_list']:
    if isinstance(topics, list):
        all_topics.extend([t for t in topics if t != 'nan'])

topic_counts = pd.Series(all_topics).value_counts()
print(f"\nTotal unique topics: {len(topic_counts)}")
print(f"\nTop 30 most frequent topics:")
print(topic_counts.head(30))

# Add dominant topic column (first topic in list)
df['dominant_topic'] = df['topics_list'].apply(
    lambda x: x[0] if isinstance(x, list) and len(x) > 0 and x[0] != 'nan' else 'unknown'
)

print(f"\nDominant topics distribution:")
print(df['dominant_topic'].value_counts().head(20))


QUESTION TOPICS DISTRIBUTION

Total unique topics: 145

Top 30 most frequent topics:
maize            90367
plant            88600
chicken          76549
tomato           71583
pig              69690
cattle           63286
banana           59485
bean             48549
crop             47816
coffee           40537
poultry          31659
animal           29767
goat             27137
rabbit           26217
cabbage          25428
watermelon       18691
potato           18578
cassava          16959
rice             16516
passion-fruit    15485
onion            14211
peanut           13240
bird             11082
wheat            10687
bee              10574
aubergine         9584
tree              9218
fish              7585
millet            6648
sugar-cane        6610
Name: count, dtype: int64

Dominant topics distribution:
dominant_topic
unknown       344592
maize          78925
chicken        71284
pig            65252
tomato         64612
cattle         58673
plant          51618
banan

In [7]:
print("\n" + "="*80)
print("TEMPORAL PATTERNS ANALYSIS")
print("="*80)

# Questions per month
df['year_month'] = df['question_year'].astype(str) + '-' + df['question_month'].astype(str).str.zfill(2)
monthly_counts = df.groupby('year_month').size().reset_index(name='question_count')
monthly_counts = monthly_counts.merge(df_monthly_weather, left_on='year_month', right_on='year_month', how='left')

print("\nMonthly question volumes:")
print(monthly_counts[['year_month', 'question_count', 'precipitation', 'avg_max_temp']].head(15))

# Questions by season
season_counts = df.groupby('season').size().reset_index(name='count')
print("\n" + "="*80)
print("Questions by Season:")
print(season_counts)


TEMPORAL PATTERNS ANALYSIS

Monthly question volumes:
   year_month  question_count  precipitation  avg_max_temp
0     2017-11            2476         147.35         27.44
1     2017-12            7222          45.54         29.82
2     2018-01           11031          39.79         29.77
3     2018-02            5601          57.41         31.37
4     2018-03            7799         184.87         27.60
5     2018-04           12396         285.07         26.26
6     2018-05           11890         193.38         26.68
7     2018-06           12283         118.20         26.53
8     2018-07           24628          62.66         27.03
9     2018-08           38991         143.20         26.92
10    2018-09           44555         103.76         28.10
11    2018-10           38554         170.27         27.94
12    2018-11           53139         107.29         28.65
13    2018-12           50191         104.09         28.37
14    2019-01           29459          49.25         30.36



In [8]:
print("\n" + "="*80)
print("DEFINING WEATHER CONDITION CATEGORIES")
print("="*80)

# Analyze weather distributions to define thresholds
print("\nWeather variable distributions:")
print(df_monthly_weather[weather_cols].describe())

# Define weather categories based on quartiles
df_monthly_weather['temp_category'] = pd.cut(
    df_monthly_weather['avg_max_temp'],
    bins=[0, df_monthly_weather['avg_max_temp'].quantile(0.33), 
          df_monthly_weather['avg_max_temp'].quantile(0.67), 100],
    labels=['Cool', 'Moderate', 'Hot']
)

df_monthly_weather['rain_category'] = pd.cut(
    df_monthly_weather['precipitation'],
    bins=[0, df_monthly_weather['precipitation'].quantile(0.33),
          df_monthly_weather['precipitation'].quantile(0.67), 1000],
    labels=['Low Rainfall', 'Moderate Rainfall', 'High Rainfall']
)

df_monthly_weather['humidity_category'] = pd.cut(
    df_monthly_weather['relative_humidity'],
    bins=[0, df_monthly_weather['relative_humidity'].quantile(0.33),
          df_monthly_weather['relative_humidity'].quantile(0.67), 100],
    labels=['Low Humidity', 'Moderate Humidity', 'High Humidity']
)

# Extreme weather events
df_monthly_weather['extreme_rain'] = df_monthly_weather['precipitation'] > df_monthly_weather['precipitation'].quantile(0.90)
df_monthly_weather['drought'] = df_monthly_weather['precipitation'] < df_monthly_weather['precipitation'].quantile(0.10)
df_monthly_weather['heatwave'] = df_monthly_weather['avg_max_temp'] > df_monthly_weather['avg_max_temp'].quantile(0.90)

print("\nWeather categories created:")
print(df_monthly_weather[['year_month', 'temp_category', 'rain_category', 'humidity_category', 
                          'extreme_rain', 'drought', 'heatwave']].head(10))

print("\nExtreme weather event counts:")
print(f"Extreme rain months: {df_monthly_weather['extreme_rain'].sum()}")
print(f"Drought months: {df_monthly_weather['drought'].sum()}")
print(f"Heatwave months: {df_monthly_weather['heatwave'].sum()}")


DEFINING WEATHER CONDITION CATEGORIES

Weather variable distributions:
       avg_max_temp  precipitation  relative_humidity  avg_min_temp  \
count     47.000000      47.000000          47.000000     47.000000   
mean      27.998723     134.860851          69.872766     19.370426   
std        1.401624      61.874615           8.225809      0.611612   
min       26.260000      39.090000          51.660000     18.440000   
25%       26.945000      84.100000          66.130000     18.930000   
50%       27.600000     143.200000          73.190000     19.240000   
75%       28.785000     179.640000          76.145000     19.690000   
max       31.370000     285.070000          80.060000     20.800000   

       avg_rx1day  avg_days_r20mm  
count   47.000000       47.000000  
mean    20.568936        0.946383  
std      7.198345        0.672013  
min      6.610000        0.090000  
25%     15.445000        0.355000  
50%     20.790000        0.730000  
75%     25.670000        1.385000  


In [9]:
print("\n" + "="*80)
print("ENRICHING QUESTION DATA WITH WEATHER CATEGORIES")
print("="*80)

# Merge weather categories back to main dataset
df = df.merge(
    df_monthly_weather[['year_month', 'temp_category', 'rain_category', 'humidity_category', 
                        'extreme_rain', 'drought', 'heatwave'] + weather_cols],
    on='year_month',
    how='left',
    suffixes=('', '_monthly')
)

print(f"\nDataset now has {len(df)} rows and {len(df.columns)} columns")
print("\nSample of enriched data:")
print(df[['year_month', 'dominant_topic', 'temp_category', 'rain_category', 'season']].head(10))


ENRICHING QUESTION DATA WITH WEATHER CATEGORIES

Dataset now has 1268716 rows and 37 columns

Sample of enriched data:
  year_month dominant_topic temp_category      rain_category       season
0    2017-11        unknown      Moderate  Moderate Rainfall  short rains
1    2017-11        poultry      Moderate  Moderate Rainfall  short rains
2    2017-11         tomato      Moderate  Moderate Rainfall  short rains
3    2017-11         coffee      Moderate  Moderate Rainfall  short rains
4    2017-11         coffee      Moderate  Moderate Rainfall  short rains
5    2017-11        unknown      Moderate  Moderate Rainfall  short rains
6    2017-11        unknown      Moderate  Moderate Rainfall  short rains
7    2017-11            pig      Moderate  Moderate Rainfall  short rains
8    2017-11         tomato      Moderate  Moderate Rainfall  short rains
9    2017-11        cassava      Moderate  Moderate Rainfall  short rains


In [10]:
print("\n" + "="*80)
print("TOPIC DISTRIBUTION BY WEATHER CONDITIONS")
print("="*80)

# Analyze top topics by rainfall category
print("\nTop 10 topics during DIFFERENT RAINFALL CONDITIONS:")
print("\n" + "-"*80)
for rain_cat in ['Low Rainfall', 'Moderate Rainfall', 'High Rainfall']:
    rain_df = df[df['rain_category'] == rain_cat]
    print(f"\n{rain_cat.upper()} (n={len(rain_df):,}):")
    topic_dist = rain_df['dominant_topic'].value_counts().head(10)
    for topic, count in topic_dist.items():
        pct = (count / len(rain_df)) * 100
        print(f"  {topic:20s}: {count:6,} ({pct:5.2f}%)")

# Analyze top topics by temperature category
print("\n" + "="*80)
print("\nTop 10 topics during DIFFERENT TEMPERATURE CONDITIONS:")
print("-"*80)
for temp_cat in ['Cool', 'Moderate', 'Hot']:
    temp_df = df[df['temp_category'] == temp_cat]
    print(f"\n{temp_cat.upper()} (n={len(temp_df):,}):")
    topic_dist = temp_df['dominant_topic'].value_counts().head(10)
    for topic, count in topic_dist.items():
        pct = (count / len(temp_df)) * 100
        print(f"  {topic:20s}: {count:6,} ({pct:5.2f}%)")


TOPIC DISTRIBUTION BY WEATHER CONDITIONS

Top 10 topics during DIFFERENT RAINFALL CONDITIONS:

--------------------------------------------------------------------------------

LOW RAINFALL (n=339,246):
  unknown             : 93,197 (27.47%)
  maize               : 23,942 ( 7.06%)
  chicken             : 19,781 ( 5.83%)
  pig                 : 18,204 ( 5.37%)
  cattle              : 15,814 ( 4.66%)
  tomato              : 14,875 ( 4.38%)
  plant               : 13,487 ( 3.98%)
  banana              : 12,975 ( 3.82%)
  crop                : 10,756 ( 3.17%)
  bean                : 10,348 ( 3.05%)

MODERATE RAINFALL (n=467,252):
  unknown             : 126,057 (26.98%)
  chicken             : 26,637 ( 5.70%)
  tomato              : 25,802 ( 5.52%)
  maize               : 25,550 ( 5.47%)
  pig                 : 23,724 ( 5.08%)
  cattle              : 21,346 ( 4.57%)
  banana              : 20,814 ( 4.45%)
  plant               : 18,580 ( 3.98%)
  crop                : 15,121 ( 3.24%)
  b

In [11]:
print("\n" + "="*80)
print("PEST AND DISEASE QUESTION PATTERNS")
print("="*80)

# Define pest/disease related keywords
pest_disease_keywords = [
    'pest', 'disease', 'worm', 'aphid', 'blight', 'rot', 'virus', 'fungus',
    'tick', 'mite', 'fly', 'beetle', 'caterpillar', 'weevil', 'thrip',
    'mildew', 'wilt', 'leaf', 'spray', 'chemical', 'insecticide', 'fungicide'
]

def has_pest_disease_content(row):
    """Check if question content contains pest/disease keywords"""
    content = str(row['question_content']).lower()
    topic = str(row['dominant_topic']).lower()
    return any(keyword in content or keyword in topic for keyword in pest_disease_keywords)

df['is_pest_disease'] = df.apply(has_pest_disease_content, axis=1)

print(f"\nTotal pest/disease related questions: {df['is_pest_disease'].sum():,} ({(df['is_pest_disease'].sum()/len(df))*100:.2f}%)")

# Analyze pest/disease questions by rainfall
pest_by_rain = df.groupby('rain_category')['is_pest_disease'].agg(['sum', 'count'])
pest_by_rain['percentage'] = (pest_by_rain['sum'] / pest_by_rain['count']) * 100
print("\nPest/Disease Questions by Rainfall Category:")
print(pest_by_rain)

# Analyze pest/disease questions by humidity
pest_by_humidity = df.groupby('humidity_category')['is_pest_disease'].agg(['sum', 'count'])
pest_by_humidity['percentage'] = (pest_by_humidity['sum'] / pest_by_humidity['count']) * 100
print("\nPest/Disease Questions by Humidity Category:")
print(pest_by_humidity)

# Extreme weather impact on pest/disease questions
print("\nPest/Disease Questions during Extreme Weather:")
print(f"During extreme rain: {df[df['extreme_rain']]['is_pest_disease'].mean()*100:.2f}%")
print(f"During drought: {df[df['drought']]['is_pest_disease'].mean()*100:.2f}%")
print(f"During heatwave: {df[df['heatwave']]['is_pest_disease'].mean()*100:.2f}%")
print(f"Normal conditions: {df[~(df['extreme_rain'] | df['drought'] | df['heatwave'])]['is_pest_disease'].mean()*100:.2f}%")



PEST AND DISEASE QUESTION PATTERNS

Total pest/disease related questions: 157,541 (12.42%)

Pest/Disease Questions by Rainfall Category:
                     sum   count  percentage
rain_category                               
Low Rainfall       38318  339246   11.295048
Moderate Rainfall  58301  467252   12.477421
High Rainfall      60922  462218   13.180361

Pest/Disease Questions by Humidity Category:
                     sum   count  percentage
humidity_category                           
Low Humidity       40754  356516   11.431184
Moderate Humidity  58380  459422   12.707271
High Humidity      58407  452778   12.899699

Pest/Disease Questions during Extreme Weather:
During extreme rain: 13.28%
During drought: 11.13%
During heatwave: 10.77%
Normal conditions: 12.46%


In [12]:
print("\n" + "="*80)
print("CROP-SPECIFIC WEATHER SENSITIVITY ANALYSIS")
print("="*80)

# Define major crop categories
major_crops = ['maize', 'tomato', 'cabbage', 'bean', 'potato', 'cattle', 'chicken', 
               'onion', 'kale', 'banana', 'coffee', 'tea']

# Analyze question volume changes for each crop across weather conditions
crop_weather_analysis = []

for crop in major_crops:
    crop_df = df[df['dominant_topic'] == crop]
    if len(crop_df) < 100:  # Skip crops with too few questions
        continue
    
    # Calculate proportion of questions in each rainfall category
    rain_dist = crop_df.groupby('rain_category').size()
    total_crop_questions = len(crop_df)
    
    # Calculate overall rainfall distribution
    overall_rain_dist = df.groupby('rain_category').size()
    
    analysis = {
        'crop': crop,
        'total_questions': total_crop_questions,
        'low_rain_pct': (rain_dist.get('Low Rainfall', 0) / total_crop_questions) * 100,
        'moderate_rain_pct': (rain_dist.get('Moderate Rainfall', 0) / total_crop_questions) * 100,
        'high_rain_pct': (rain_dist.get('High Rainfall', 0) / total_crop_questions) * 100,
    }
    
    crop_weather_analysis.append(analysis)

crop_analysis_df = pd.DataFrame(crop_weather_analysis)
crop_analysis_df = crop_analysis_df.sort_values('total_questions', ascending=False)

print("\nCrop Question Distribution by Rainfall:")
print(crop_analysis_df.to_string(index=False))

# Identify crops most sensitive to rainfall changes
crop_analysis_df['rain_sensitivity'] = crop_analysis_df[['low_rain_pct', 'high_rain_pct']].std(axis=1)
print("\nCrops Most Sensitive to Rainfall (by std deviation):")
print(crop_analysis_df.sort_values('rain_sensitivity', ascending=False)[['crop', 'rain_sensitivity', 'total_questions']].head(10))



CROP-SPECIFIC WEATHER SENSITIVITY ANALYSIS

Crop Question Distribution by Rainfall:
   crop  total_questions  low_rain_pct  moderate_rain_pct  high_rain_pct
  maize            78925     30.335128          32.372506      37.292366
chicken            71284     27.749565          37.367432      34.883003
 tomato            64612     23.022039          39.933758      37.044202
 cattle            58673     26.952772          36.381300      36.665928
 banana            51119     25.381952          40.716759      33.901289
   bean            40014     25.860949          34.582896      39.556155
 coffee            35499     25.583819          37.178512      37.237669
cabbage            22504     22.204941          39.108603      38.686456
 potato            15436     27.371081          34.860067      37.768852
  onion            12473     23.025736          37.905877      39.068388
    tea             2004     25.249501          37.375250      37.375250
   kale              693     20.057720 

In [13]:
print("\n" + "="*80)
print("MONTHLY TIME SERIES CORRELATIONS")
print("="*80)

# Aggregate questions by month with weather data
monthly_aggregated = df.groupby('year_month').agg({
    'question_id': 'count',
    'is_pest_disease': 'sum',
    'avg_max_temp': 'first',
    'precipitation': 'first',
    'relative_humidity': 'first',
    'avg_rx1day': 'first',
    'avg_days_r20mm': 'first'
}).reset_index()

monthly_aggregated.columns = ['year_month', 'total_questions', 'pest_disease_questions',
                               'avg_max_temp', 'precipitation', 'relative_humidity',
                               'avg_rx1day', 'avg_days_r20mm']

monthly_aggregated['pest_disease_rate'] = (monthly_aggregated['pest_disease_questions'] / 
                                            monthly_aggregated['total_questions']) * 100

print("\nMonthly aggregated data:")
print(monthly_aggregated.head(10))

# Calculate correlations
print("\n" + "="*80)
print("CORRELATION ANALYSIS: Weather vs Question Volume")
print("="*80)

correlations = {}
for weather_var in ['avg_max_temp', 'precipitation', 'relative_humidity', 'avg_rx1day', 'avg_days_r20mm']:
    # Remove NaN values
    valid_data = monthly_aggregated[[weather_var, 'total_questions']].dropna()
    if len(valid_data) > 0:
        corr, p_value = pearsonr(valid_data[weather_var], valid_data['total_questions'])
        correlations[weather_var] = {'correlation': corr, 'p_value': p_value}

print("\nWeather Variable vs Total Question Volume:")
for var, stats in correlations.items():
    significance = "***" if stats['p_value'] < 0.001 else "**" if stats['p_value'] < 0.01 else "*" if stats['p_value'] < 0.05 else ""
    print(f"  {var:25s}: r = {stats['correlation']:7.4f}, p = {stats['p_value']:.4f} {significance}")

# Correlations with pest/disease rate
print("\nWeather Variable vs Pest/Disease Question Rate:")
pest_correlations = {}
for weather_var in ['avg_max_temp', 'precipitation', 'relative_humidity', 'avg_rx1day', 'avg_days_r20mm']:
    valid_data = monthly_aggregated[[weather_var, 'pest_disease_rate']].dropna()
    if len(valid_data) > 0:
        corr, p_value = pearsonr(valid_data[weather_var], valid_data['pest_disease_rate'])
        pest_correlations[weather_var] = {'correlation': corr, 'p_value': p_value}
        significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else ""
        print(f"  {weather_var:25s}: r = {corr:7.4f}, p = {p_value:.4f} {significance}")



MONTHLY TIME SERIES CORRELATIONS

Monthly aggregated data:
  year_month  total_questions  pest_disease_questions  avg_max_temp  \
0    2017-11             2476                     412         27.44   
1    2017-12             7222                    1045         29.82   
2    2018-01            11031                    1336         29.77   
3    2018-02             5601                     602         31.37   
4    2018-03             7799                    1155         27.60   
5    2018-04            12396                    1877         26.26   
6    2018-05            11890                    1902         26.68   
7    2018-06            12283                    1724         26.53   
8    2018-07            24628                    3340         27.03   
9    2018-08            38991                    5392         26.92   

   precipitation  relative_humidity  avg_rx1day  avg_days_r20mm  \
0         147.35              71.43       21.13            0.90   
1          45.54        

In [14]:
print("\n" + "="*80)
print("LAG EFFECT ANALYSIS: Post-Weather Event Question Patterns")
print("="*80)

# Sort by date
monthly_aggregated_sorted = monthly_aggregated.sort_values('year_month').copy()

# Create lagged weather variables (1-3 months lag)
for lag in [1, 2, 3]:
    for weather_var in ['precipitation', 'avg_max_temp', 'relative_humidity']:
        monthly_aggregated_sorted[f'{weather_var}_lag{lag}'] = monthly_aggregated_sorted[weather_var].shift(lag)

# Calculate correlations with lagged variables
print("\nLagged Weather Correlations with Pest/Disease Questions:")
print("\nVariable                      Current Month  1-Month Lag  2-Month Lag  3-Month Lag")
print("-" * 85)

for weather_var in ['precipitation', 'avg_max_temp', 'relative_humidity']:
    correlations_row = [weather_var[:23]]
    
    # Current month
    valid_data = monthly_aggregated_sorted[[weather_var, 'pest_disease_rate']].dropna()
    if len(valid_data) > 0:
        corr, p_val = pearsonr(valid_data[weather_var], valid_data['pest_disease_rate'])
        sig = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
        correlations_row.append(f"{corr:6.3f}{sig:3s}")
    else:
        correlations_row.append("  N/A   ")
    
    # Lagged months
    for lag in [1, 2, 3]:
        lag_var = f'{weather_var}_lag{lag}'
        valid_data = monthly_aggregated_sorted[[lag_var, 'pest_disease_rate']].dropna()
        if len(valid_data) > 5:  # Need enough data points
            corr, p_val = pearsonr(valid_data[lag_var], valid_data['pest_disease_rate'])
            sig = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
            correlations_row.append(f"{corr:6.3f}{sig:3s}")
        else:
            correlations_row.append("  N/A   ")
    
    print(f"{correlations_row[0]:25s} {correlations_row[1]:13s} {correlations_row[2]:13s} {correlations_row[3]:13s} {correlations_row[4]:13s}")

print("\n* p < 0.05, ** p < 0.01, *** p < 0.001")


LAG EFFECT ANALYSIS: Post-Weather Event Question Patterns

Lagged Weather Correlations with Pest/Disease Questions:

Variable                      Current Month  1-Month Lag  2-Month Lag  3-Month Lag
-------------------------------------------------------------------------------------
precipitation              0.458**       0.439**       0.135        -0.138       
avg_max_temp              -0.291*       -0.144         0.037         0.153       
relative_humidity          0.379**       0.297*        0.012        -0.210       

* p < 0.05, ** p < 0.01, *** p < 0.001


In [15]:
print("\n" + "="*80)
print("LIVESTOCK VS CROPS: WEATHER SENSITIVITY COMPARISON")
print("="*80)

# Define livestock and crop categories
livestock_topics = ['cattle', 'chicken', 'pig', 'goat', 'sheep', 'rabbit', 'poultry']
crop_topics = ['maize', 'tomato', 'cabbage', 'bean', 'potato', 'onion', 'kale', 'coffee', 'tea']

df['category'] = 'other'
df.loc[df['dominant_topic'].isin(livestock_topics), 'category'] = 'livestock'
df.loc[df['dominant_topic'].isin(crop_topics), 'category'] = 'crop'

# Compare monthly patterns
category_monthly = df[df['category'].isin(['livestock', 'crop'])].groupby(['year_month', 'category']).size().unstack(fill_value=0)
category_monthly = category_monthly.merge(monthly_weather, on='year_month', how='left')

print("\nQuestion volume by category:")
print(f"Total Livestock questions: {(df['category'] == 'livestock').sum():,}")
print(f"Total Crop questions: {(df['category'] == 'crop').sum():,}")

# Weather sensitivity by category
print("\nRainfall Sensitivity:")
for category in ['livestock', 'crop']:
    cat_df = df[df['category'] == category]
    rain_dist = cat_df.groupby('rain_category').size()
    print(f"\n{category.capitalize()}:")
    for rain_cat in ['Low Rainfall', 'Moderate Rainfall', 'High Rainfall']:
        count = rain_dist.get(rain_cat, 0)
        pct = (count / len(cat_df)) * 100
        print(f"  {rain_cat:20s}: {pct:5.2f}%")

# Pest/disease rates by category
print("\nPest/Disease Question Rates:")
for category in ['livestock', 'crop']:
    cat_df = df[df['category'] == category]
    rate = cat_df['is_pest_disease'].mean() * 100
    print(f"{category.capitalize():15s}: {rate:5.2f}%")


LIVESTOCK VS CROPS: WEATHER SENSITIVITY COMPARISON

Question volume by category:
Total Livestock questions: 276,633
Total Crop questions: 272,160

Rainfall Sensitivity:

Livestock:
  Low Rainfall        : 27.42%
  Moderate Rainfall   : 36.88%
  High Rainfall       : 35.69%

Crop:
  Low Rainfall        : 26.08%
  Moderate Rainfall   : 36.12%
  High Rainfall       : 37.79%

Pest/Disease Question Rates:
Livestock      :  9.25%
Crop           : 16.89%


In [16]:
print("\n" + "="*80)
print("STATISTICAL TESTS: CHI-SQUARE INDEPENDENCE TESTS")
print("="*80)

# Test 1: Rain category vs Pest/Disease questions
print("\nTest 1: Rainfall Category vs Pest/Disease Questions")
contingency_rain_pest = pd.crosstab(df['rain_category'], df['is_pest_disease'])
chi2, p_value, dof, expected = chi2_contingency(contingency_rain_pest)
print(f"Chi-square statistic: {chi2:.4f}")
print(f"P-value: {p_value:.6f}")
print(f"Degrees of freedom: {dof}")
print("Interpretation:", "SIGNIFICANT relationship" if p_value < 0.05 else "No significant relationship")

# Test 2: Temperature category vs Pest/Disease questions
print("\nTest 2: Temperature Category vs Pest/Disease Questions")
contingency_temp_pest = pd.crosstab(df['temp_category'], df['is_pest_disease'])
chi2, p_value, dof, expected = chi2_contingency(contingency_temp_pest)
print(f"Chi-square statistic: {chi2:.4f}")
print(f"P-value: {p_value:.6f}")
print(f"Degrees of freedom: {dof}")
print("Interpretation:", "SIGNIFICANT relationship" if p_value < 0.05 else "No significant relationship")

# Test 3: Extreme rain vs topic distribution (top 10 topics)
print("\nTest 3: Extreme Rain vs Top Topic Distribution")
top_topics = df['dominant_topic'].value_counts().head(10).index
df_top_topics = df[df['dominant_topic'].isin(top_topics)]
contingency_extreme_topic = pd.crosstab(df_top_topics['extreme_rain'], df_top_topics['dominant_topic'])
chi2, p_value, dof, expected = chi2_contingency(contingency_extreme_topic)
print(f"Chi-square statistic: {chi2:.4f}")
print(f"P-value: {p_value:.6f}")
print(f"Degrees of freedom: {dof}")
print("Interpretation:", "SIGNIFICANT relationship" if p_value < 0.05 else "No significant relationship")



STATISTICAL TESTS: CHI-SQUARE INDEPENDENCE TESTS

Test 1: Rainfall Category vs Pest/Disease Questions
Chi-square statistic: 641.8895
P-value: 0.000000
Degrees of freedom: 2
Interpretation: SIGNIFICANT relationship

Test 2: Temperature Category vs Pest/Disease Questions
Chi-square statistic: 122.0222
P-value: 0.000000
Degrees of freedom: 2
Interpretation: SIGNIFICANT relationship

Test 3: Extreme Rain vs Top Topic Distribution
Chi-square statistic: 895.6259
P-value: 0.000000
Degrees of freedom: 9
Interpretation: SIGNIFICANT relationship
