In [1]:
import pandas as pd
# Load the merged dataset CSV into `df`; later cells read from this frame and add columns to it.
df = pd.read_csv('../dataset/final/merged_dataset_final_all_pp.csv')

In [2]:
import pandas as pd

# Dataset-derived cut-offs used by the rule-based risk scorers in the following cells.

# Upper-quartile (75th percentile) cut-offs:
#   tp7_threshold   -> top 25% of 7-day cumulative rainfall
#   tp_mm_threshold -> top 25% of same-day rainfall ("sudden rainfall")
tp7_threshold, tp_mm_threshold = (
    df[col_name].quantile(0.75) for col_name in ('tp_7d_cum', 'tp_mm')
)

# Lower-quartile (25th percentile) cut-offs:
#   tp_lag3_threshold -> lowest 25% of 3-day-lagged rainfall (dry period)
#   low_n_threshold   -> low total nitrogen
#   low_ph_threshold  -> acidic soil
# NOTE(review): if at least a quarter of the tp_lag3 values are 0, tp_lag3_threshold is 0,
# so `tp_lag3 < tp_lag3_threshold` can never be true and `tp_lag3 > tp_lag3_threshold`
# fires on any rain at all — verify against the data.
tp_lag3_threshold, low_n_threshold, low_ph_threshold = (
    df[col_name].quantile(0.25) for col_name in ('tp_lag3', 'total_nitrogen', 'ph')
)

# Fixed (not data-derived) cut-off: 9 AM heatwave threshold in KTM
heat_threshold = 27

In [3]:
def rice_blast_risk_pct(row):
    """Score one observation for rice blast and bucket it as "Low"/"Moderate"/"High".

    ``row`` is any mapping-like object (e.g. a DataFrame row) exposing
    't2m_C', 'tp_7d_cum', 'consec_rain_days', 'tp_lag3', 'tp_mm',
    'total_nitrogen' and 'ph'. The cut-offs are the module-level
    ``*_threshold`` variables.

    Each satisfied rule contributes its weight to a score; a score of 3 or
    more is "High", exactly 2 is "Moderate", anything lower is "Low".
    """
    temperature = row['t2m_C']
    weighted_rules = (
        # Temperature favorable
        (24 <= temperature <= 30, 1),
        (temperature > heat_threshold, 1),
        # Rainfall / moisture
        (row['tp_7d_cum'] > tp7_threshold, 1),
        (row['consec_rain_days'] >= 2, 1),
        # Sudden rain after a dry period counts double
        (row['tp_lag3'] < tp_lag3_threshold and row['tp_mm'] > tp_mm_threshold, 2),
        # Soil
        (row['total_nitrogen'] < low_n_threshold, 1),
        (row['ph'] < low_ph_threshold, 1),
    )
    score = sum(weight for satisfied, weight in weighted_rules if satisfied)

    # Risk classification
    if score < 2:
        return "Low"
    return "High" if score >= 3 else "Moderate"

In [4]:
# 2️⃣ Bacterial Leaf Blight - percentile-based
def blb_risk_pct(row):
    """Score one observation for bacterial leaf blight: "Low"/"Moderate"/"High".

    Reads 't2m_C', 'tp_7d_cum', 'consec_rain_days', 'tp_lag3', 'tp_mm',
    'total_nitrogen' and 'ph' from ``row`` and compares them with the
    module-level ``*_threshold`` variables. The favorable temperature band
    here is 24-34 (inclusive).

    Score >= 3 -> "High", score == 2 -> "Moderate", otherwise "Low".
    """
    temperature = row['t2m_C']
    weighted_rules = (
        (24 <= temperature <= 34, 1),
        (temperature > heat_threshold, 1),
        (row['tp_7d_cum'] > tp7_threshold, 1),
        (row['consec_rain_days'] >= 2, 1),
        # Heavy rain following a dry spell is the only rule worth two points
        (row['tp_lag3'] < tp_lag3_threshold and row['tp_mm'] > tp_mm_threshold, 2),
        (row['total_nitrogen'] < low_n_threshold, 1),
        (row['ph'] < low_ph_threshold, 1),
    )
    score = sum(weight for satisfied, weight in weighted_rules if satisfied)

    if score < 2:
        return "Low"
    return "High" if score >= 3 else "Moderate"

In [5]:
# 3️⃣ Sheath Blight - percentile-based
def sheath_blight_risk_pct(row):
    """Score one observation for sheath blight: "Low"/"Moderate"/"High".

    Reads 't2m_C', 'consec_rain_days', 'tp_7d_cum', 'tp_lag3' and
    'total_nitrogen' from ``row`` and compares them with the module-level
    ``*_threshold`` variables. Unlike the other scorers there is no upper
    bound on the favorable temperature and soil pH is not considered.

    Score >= 3 -> "High", score == 2 -> "Moderate", otherwise "Low".
    """
    temperature = row['t2m_C']
    weighted_rules = (
        (temperature >= 24, 1),
        (temperature > heat_threshold, 1),
        (row['consec_rain_days'] >= 2, 1),
        (row['tp_7d_cum'] > tp7_threshold, 1),
        # Standing water proxy: lagged rainfall above the dry-period cut-off, worth two points
        (row['tp_lag3'] > tp_lag3_threshold, 2),
        (row['total_nitrogen'] < low_n_threshold, 1),
    )
    score = sum(weight for satisfied, weight in weighted_rules if satisfied)

    if score < 2:
        return "Low"
    return "High" if score >= 3 else "Moderate"

In [6]:
import pandas as pd
# NOTE(review): this re-reads the same CSV path as the first cell. None of the cells shown
# between the two loads modify `df`, so the reload looks redundant — confirm before removing.
df = pd.read_csv('../dataset/final/merged_dataset_final_all_pp.csv')

In [7]:
# Label every row with each disease-specific rule set (one new column per disease)
for risk_col, scorer in (
    ('blast_risk', rice_blast_risk_pct),
    ('blb_risk', blb_risk_pct),
    ('sheath_risk', sheath_blight_risk_pct),
):
    df[risk_col] = df.apply(scorer, axis=1)

# Composite rice disease risk
def rice_disease_risk_pct(row):
    """Return the worst of the three per-disease labels found in ``row``.

    ``row`` must expose 'blast_risk', 'blb_risk' and 'sheath_risk'. The result
    is "High" if any of them is "High", else "Moderate" if any is "Moderate",
    else "Low".
    """
    for severity in ("High", "Moderate"):
        # Checked from most to least severe, so the first hit is the overall level
        if severity in (row['blast_risk'], row['blb_risk'], row['sheath_risk']):
            return severity
    return "Low"

# rice_disease_risk_pct only reads the three per-disease labels, so apply it to just those
# columns; this avoids materialising a wide mixed-dtype row for every record.
df['rice_disease_risk'] = df[['blast_risk', 'blb_risk', 'sheath_risk']].apply(rice_disease_risk_pct, axis=1)

In [8]:
# List the columns; the four risk labels created above appear at the end.
df.columns

Index(['latitude', 'longitude', 'nearest_lat', 'nearest_lon', 'distance',
       'year', 'month', 'day', 'tp_mm', 't2m_C', 'anomaly_T2m_C',
       'heat_stress_proxy', 'tp_7d_cum', 'tp_14d_cum', 'tp_7d_avg',
       'consec_rain_days', 'tp_lag1', 'tp_lag3', 'tp_lag7', 'heavy_rain',
       'month_sin', 'month_cos', 'heat_proxy', 'heat_next_day',
       'heat_next_2days', 'heat_next_3days', 'tp_anomaly', 'tp_std_anomaly',
       'heatwave_flag', 'next_day_match', 'next_2days_match',
       'next_3days_match', 'lat', 'lon', 'ph', 'organic_matter',
       'total_nitrogen', 'potassium', 'p2o5', 'boron', 'zinc', 'sand', 'clay',
       'slit', 'parentsoil', 'province', 'district', 'palika', 'crop',
       'variety', 'UREA1', 'UREA2', 'UREA3', 'DAP', 'MOP', 'organic',
       'boron_fert', 'palika_num', 'blast_risk', 'blb_risk', 'sheath_risk',
       'rice_disease_risk'],
      dtype='object')

In [9]:
# Preview the first rows, including the newly added risk columns.
df.head()

Unnamed: 0,latitude,longitude,nearest_lat,nearest_lon,distance,year,month,day,tp_mm,t2m_C,...,UREA3,DAP,MOP,organic,boron_fert,palika_num,blast_risk,blb_risk,sheath_risk,rice_disease_risk
0,27.5,85.0,27.652,85.005,0.152082,2021,1,1,0.0,19.51,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
1,27.5,85.0,27.652,85.005,0.152082,2021,1,2,0.0,20.18,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
2,27.5,85.0,27.652,85.005,0.152082,2021,1,3,0.0,20.59,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
3,27.5,85.0,27.652,85.005,0.152082,2021,1,4,0.0,18.81,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
4,27.5,85.0,27.652,85.005,0.152082,2021,1,5,0.0,20.24,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low


In [10]:
# Class balance of every risk label
columns = ['blast_risk', 'blb_risk', 'sheath_risk', 'rice_disease_risk']

for col in columns:
    # Single print call: header line first, then the frequency table on the following lines
    print(f"\nCounts for {col}:", df[col].value_counts(), sep="\n")



Counts for blast_risk:
blast_risk
Low         18609
Moderate     5395
High         2471
Name: count, dtype: int64

Counts for blb_risk:
blb_risk
Low         18280
Moderate     5627
High         2568
Name: count, dtype: int64

Counts for sheath_risk:
sheath_risk
Low         10923
High         8299
Moderate     7253
Name: count, dtype: int64

Counts for rice_disease_risk:
rice_disease_risk
Low         10084
High         8703
Moderate     7688
Name: count, dtype: int64


In [11]:
# Maximum of the 7-day cumulative rainfall column (a quick check of its range).
df['tp_7d_cum'].max()

30.55906266

In [12]:
import folium
from IPython.display import display

# Palika-level mean coordinates
palika_coords = df.groupby('palika')[['nearest_lat','nearest_lon']].mean().reset_index()

# Risk counts per Palika (one column per risk class).
# unstack() only creates columns for classes that actually occur, so reindex to the full
# set of labels; a class that never occurs then shows up as a zero count instead of
# causing a KeyError in the marker loop that reads 'High' / 'Moderate' / 'Low'.
palika_risk = (
    df.groupby('palika')['rice_disease_risk']
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=['Low', 'Moderate', 'High'], fill_value=0)
    .reset_index()
)

# Merge coordinates and risk
palika_map_df = palika_coords.merge(palika_risk, on='palika', how='left')

# Function to pick color based on risk
def risk_color(risk):
    """Map a risk label to a marker colour.

    'High' -> 'red', 'Moderate' -> 'orange'; every other value (including
    'Low') -> 'green'.
    """
    return 'red' if risk == 'High' else ('orange' if risk == 'Moderate' else 'green')

# Create Folium map
m = folium.Map(location=[27.7, 85.3], zoom_start=10)

# Add one marker per Palika, coloured by the worst risk level observed there
for _, row in palika_map_df.iterrows():
    # .get(..., 0) treats a risk class that never occurs in the data (and therefore has
    # no count column) as zero instead of raising KeyError.
    high_n, moderate_n, low_n = (row.get(level, 0) for level in ('High', 'Moderate', 'Low'))

    # A Palika is flagged with the most severe level that occurs at least once
    if high_n > 0:
        risk_level = 'High'
    elif moderate_n > 0:
        risk_level = 'Moderate'
    else:
        risk_level = 'Low'

    marker_color = risk_color(risk_level)
    folium.CircleMarker(
        location=[row['nearest_lat'], row['nearest_lon']],
        radius=10,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        # Popup text is rendered as HTML, where "\n" collapses to a space; use <br> for line breaks
        popup=f"{row['palika']}<br>High: {high_n}, Moderate: {moderate_n}, Low: {low_n}<br>Risk: {risk_level}"
    ).add_to(m)

# Display map directly in notebook
display(m)


In [13]:
# Display the frame (the notebook renders a truncated head/tail preview).
df

Unnamed: 0,latitude,longitude,nearest_lat,nearest_lon,distance,year,month,day,tp_mm,t2m_C,...,UREA3,DAP,MOP,organic,boron_fert,palika_num,blast_risk,blb_risk,sheath_risk,rice_disease_risk
0,27.5,85.0,27.652,85.005,0.152082,2021,1,1,0.0,19.51,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
1,27.5,85.0,27.652,85.005,0.152082,2021,1,2,0.0,20.18,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
2,27.5,85.0,27.652,85.005,0.152082,2021,1,3,0.0,20.59,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
3,27.5,85.0,27.652,85.005,0.152082,2021,1,4,0.0,18.81,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
4,27.5,85.0,27.652,85.005,0.152082,2021,1,5,0.0,20.24,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26470,28.0,86.0,27.892,85.476,0.535014,2025,10,27,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low
26471,28.0,86.0,27.892,85.476,0.535014,2025,10,28,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low
26472,28.0,86.0,27.892,85.476,0.535014,2025,10,29,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low
26473,28.0,86.0,27.892,85.476,0.535014,2025,10,30,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low


In [14]:
# Print the maximum of every column (numeric max, or the last value in sort order for strings)
for col, values in df.items():
    print(f"{col}: {values.max()}")


latitude: 28.0
longitude: 86.0
nearest_lat: 27.894
nearest_lon: 85.957
distance: 0.9064325678173808
year: 2025
month: 12
day: 31
tp_mm: 15.577793
t2m_C: 35.37
anomaly_T2m_C: 16.552525812945714
heat_stress_proxy: 1
tp_7d_cum: 30.55906266
tp_14d_cum: 44.5885656
tp_7d_avg: 4.36558038
consec_rain_days: 7
tp_lag1: 15.577793
tp_lag3: 15.577793
tp_lag7: 15.577793
heavy_rain: 1
month_sin: 1.0
month_cos: 1.0
heat_proxy: 1
heat_next_day: 1
heat_next_2days: 1
heat_next_3days: 1
tp_anomaly: 14.652771112040922
tp_std_anomaly: 19.85568564291748
heatwave_flag: 1
next_day_match: True
next_2days_match: True
next_3days_match: True
lat: 27.894
lon: 85.957
ph: 6.23
organic_matter: 5.24
total_nitrogen: 0.24
potassium: 395.73
p2o5: 246.53
boron: 1.3
zinc: 2.75
sand: 57.16
clay: 29.74
slit: 56.61
parentsoil: 3.0
province: Bagmati
district: Sindhupalchok
palika: Tripurasundari Gaunpalika
crop: 0.0
variety: 1.0
UREA1: 2.21
UREA2: 3.62
UREA3: 3.62
DAP: 3.62
MOP: 3.34
organic: 6.0
boron_fert: 1.0
palika_num: 15.

In [15]:
# Display the frame again; the cell between this one and the previous display only prints.
df

Unnamed: 0,latitude,longitude,nearest_lat,nearest_lon,distance,year,month,day,tp_mm,t2m_C,...,UREA3,DAP,MOP,organic,boron_fert,palika_num,blast_risk,blb_risk,sheath_risk,rice_disease_risk
0,27.5,85.0,27.652,85.005,0.152082,2021,1,1,0.0,19.51,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
1,27.5,85.0,27.652,85.005,0.152082,2021,1,2,0.0,20.18,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
2,27.5,85.0,27.652,85.005,0.152082,2021,1,3,0.0,20.59,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
3,27.5,85.0,27.652,85.005,0.152082,2021,1,4,0.0,18.81,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
4,27.5,85.0,27.652,85.005,0.152082,2021,1,5,0.0,20.24,...,1.93,0.54,1.67,6.0,1.0,1.0,Low,Low,Low,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26470,28.0,86.0,27.892,85.476,0.535014,2025,10,27,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low
26471,28.0,86.0,27.892,85.476,0.535014,2025,10,28,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low
26472,28.0,86.0,27.892,85.476,0.535014,2025,10,29,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low
26473,28.0,86.0,27.892,85.476,0.535014,2025,10,30,0.0,14.62,...,1.81,2.17,1.67,6.0,1.0,,Low,Low,Low,Low


In [16]:
# Drop the Palika identifier and the administrative name columns in a single step.
# errors='ignore' makes the cell safe to re-run: the original four in-place drops raised
# KeyError the second time because the columns were already gone.
# NOTE: the Folium map cell groups by 'palika', so it must run before this cell.
df = df.drop(columns=['palika_num', 'district', 'province', 'palika'], errors='ignore')




In [17]:
# Columns that still contain at least one missing value, with their missing-value counts
null_cols = df.columns[df.isna().any()]
df[null_cols].isna().sum()


Series([], dtype: float64)

In [18]:
# Column dtypes after the identifier columns were dropped.
df.dtypes

latitude             float64
longitude            float64
nearest_lat          float64
nearest_lon          float64
distance             float64
year                   int64
month                  int64
day                    int64
tp_mm                float64
t2m_C                float64
anomaly_T2m_C        float64
heat_stress_proxy      int64
tp_7d_cum            float64
tp_14d_cum           float64
tp_7d_avg            float64
consec_rain_days       int64
tp_lag1              float64
tp_lag3              float64
tp_lag7              float64
heavy_rain             int64
month_sin            float64
month_cos            float64
heat_proxy             int64
heat_next_day          int64
heat_next_2days        int64
heat_next_3days        int64
tp_anomaly           float64
tp_std_anomaly       float64
heatwave_flag          int64
next_day_match          bool
next_2days_match        bool
next_3days_match        bool
lat                  float64
lon                  float64
ph            

In [19]:
# from sklearn.preprocessing import StandardScaler

# # Separate features and scale numeric columns
# X = df.drop(columns=risk_cols)
# y_dict = {col: df[col] for col in risk_cols}  # store y separately

# numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
# scaler = StandardScaler()
# X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [20]:
# from sklearn.model_selection import train_test_split

# # Use same split for all three diseases
# X_train, X_test = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)

# y_train_dict = {}
# y_test_dict = {}
# for col in risk_cols:
#     y_train_dict[col], y_test_dict[col] = train_test_split(
#         y_dict[col], test_size=0.2, random_state=42, stratify=y_dict[col]
#     )


In [21]:
# from sklearn.ensemble import RandomForestClassifier

# rf_models = {}
# for col in risk_cols:
#     print(f"\n=== Training for {col} ===")
#     rf = RandomForestClassifier(
#         n_estimators=500,  # more trees for better training
#         max_depth=None,    # let it grow fully
#         random_state=42,
#         class_weight='balanced'
#     )
#     rf.fit(X_train, y_train_dict[col])
#     rf_models[col] = rf


In [22]:
# from sklearn.metrics import classification_report

# for col in risk_cols:
#     y_pred = rf_models[col].predict(X_test)
#     print(f"\n=== Classification Report for {col} ===")
#     print(classification_report(y_test_dict[col], y_pred))


=== Classification Report for blast_risk ===
              precision    recall  f1-score   support

           0       0.70      0.96      0.81      3722
           1       0.18      0.03      0.05      1079
           2       0.16      0.01      0.03       494

    accuracy                           0.68      5295
   macro avg       0.35      0.33      0.30      5295
weighted avg       0.55      0.68      0.58      5295


=== Classification Report for blb_risk ===
              precision    recall  f1-score   support

           0       0.69      0.96      0.80      3656
           1       0.21      0.03      0.05      1125
           2       0.12      0.01      0.02       514

    accuracy                           0.67      5295
   macro avg       0.34      0.33      0.29      5295
weighted avg       0.53      0.67      0.57      5295


=== Classification Report for sheath_risk ===
              precision    recall  f1-score   support

           0       0.41      0.57      0.48      2185
           1       0.28      0.19      0.23      1450
           2       0.33      0.25      0.29      1660

    accuracy                           0.37      5295
   macro avg       0.34      0.34      0.33      5295
weighted avg       0.35      0.37      0.35      5295


=== Classification Report for rice_disease_risk ===
              precision    recall  f1-score   support

           0       0.36      0.46      0.41      2017
           1       0.28      0.21      0.24      1538
           2       0.33      0.30      0.31      1740

    accuracy                           0.33      5295
   macro avg       0.32      0.32      0.32      5295
weighted avg       0.33      0.33      0.33      5295

In [23]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE


In [24]:
import pandas as pd

# Define risk mapping (ordinal encoding of the string labels)
label_map = {'Low': 0, 'Moderate': 1, 'High': 2}
risk_cols = ['blast_risk', 'blb_risk', 'sheath_risk', 'rice_disease_risk']

# Map string labels to numeric
for col in risk_cols:
    # Skip columns that are already numeric. Running .map(label_map) on already-encoded
    # values would match no key and silently replace every label with NaN, which made
    # the original cell destructive when executed twice.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(label_map)


In [25]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Feature matrix: every column except the four risk labels.
# NOTE(review): X still contains the inputs the rule-based labels were computed from
# (t2m_C, tp_7d_cum, consec_rain_days, tp_lag3, tp_mm, total_nitrogen, ph), so a classifier
# can largely re-learn those rules — keep this in mind when interpreting the scores.
X = df.drop(labels=risk_cols, axis="columns")
# Columns that get standardised: 64-bit float and integer features only
numeric_cols = X.select_dtypes(include=["float64", "int64"]).columns

def train_disease_model_cv(X, y, disease_name, n_splits=5):
    """Cross-validate a SMOTE + random-forest pipeline for one disease label.

    For every stratified fold the float64/int64 feature columns are
    standardised with statistics fitted on the training part only, the
    training part is oversampled with SMOTE, a random forest is fitted, and a
    classification report plus accuracy and weighted F1 are printed for the
    held-out part. Mean accuracy and mean weighted F1 are printed at the end.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Class labels aligned row-for-row with ``X``.
    disease_name : str
        Only used in the printed headers.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    RandomForestClassifier
        The forest fitted in the LAST fold only. It was trained on that
        fold's scaled, SMOTE-resampled training part and the fitted scaler is
        not returned, so it is not a model refitted on all of the data.
    """
    print(f"\n=== Training for {disease_name} ===")

    # Derive the columns to scale from the X that was actually passed in rather than from
    # a module-level `numeric_cols`, so the function also works for other feature frames.
    scale_cols = X.select_dtypes(include=['float64', 'int64']).columns

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    f1_scores = []
    acc_scores = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        # Copies, because the scaled values are written back into the fold frames
        X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Scale numeric features with statistics from the training part only
        # (skipped when there is nothing to scale; the scaler rejects zero columns)
        if len(scale_cols) > 0:
            scaler = StandardScaler()
            X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
            X_test[scale_cols] = scaler.transform(X_test[scale_cols])

        # Oversample minority classes on the training part only, never on held-out rows
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

        # Random forest with limited depth / leaf size to reduce overfitting
        rf = RandomForestClassifier(
            n_estimators=300,
            max_depth=10,
            min_samples_split=10,
            min_samples_leaf=5,
            class_weight='balanced',
            random_state=42
        )
        rf.fit(X_train_res, y_train_res)

        # Predict and evaluate on the untouched (non-resampled) held-out part
        y_pred = rf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')
        acc = accuracy_score(y_test, y_pred)

        f1_scores.append(f1)
        acc_scores.append(acc)

        print(f"\n--- Fold {fold} ---")
        print(classification_report(y_test, y_pred))
        print(f"Accuracy: {acc:.4f} | Weighted F1: {f1:.4f}")

    print(f"\n=== {disease_name} Summary ===")
    print(f"Mean Accuracy: {np.mean(acc_scores):.4f}")
    print(f"Mean Weighted F1: {np.mean(f1_scores):.4f}")

    return rf

# =========================
# Cross-validate and keep one model per disease.
# Only the first three labels (blast_risk, blb_risk, sheath_risk) are trained;
# the composite rice_disease_risk label is skipped.
models = {}
for disease in risk_cols[:3]:
    y = df[disease]
    model = models[disease] = train_disease_model_cv(X, y, disease)



=== Training for blast_risk ===

--- Fold 1 ---
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      3722
           1       0.98      0.99      0.98      1079
           2       0.99      0.99      0.99       494

    accuracy                           0.99      5295
   macro avg       0.99      0.99      0.99      5295
weighted avg       0.99      0.99      0.99      5295

Accuracy: 0.9936 | Weighted F1: 0.9936

--- Fold 2 ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3722
           1       0.98      0.99      0.98      1079
           2       0.99      0.99      0.99       494

    accuracy                           0.99      5295
   macro avg       0.99      0.99      0.99      5295
weighted avg       0.99      0.99      0.99      5295

Accuracy: 0.9934 | Weighted F1: 0.9934

--- Fold 3 ---
              precision    recall  f1-score   support

           0       1.00 

In [27]:
# Columns after the drops: palika_num, district, province and palika no longer appear.
df.columns

Index(['latitude', 'longitude', 'nearest_lat', 'nearest_lon', 'distance',
       'year', 'month', 'day', 'tp_mm', 't2m_C', 'anomaly_T2m_C',
       'heat_stress_proxy', 'tp_7d_cum', 'tp_14d_cum', 'tp_7d_avg',
       'consec_rain_days', 'tp_lag1', 'tp_lag3', 'tp_lag7', 'heavy_rain',
       'month_sin', 'month_cos', 'heat_proxy', 'heat_next_day',
       'heat_next_2days', 'heat_next_3days', 'tp_anomaly', 'tp_std_anomaly',
       'heatwave_flag', 'next_day_match', 'next_2days_match',
       'next_3days_match', 'lat', 'lon', 'ph', 'organic_matter',
       'total_nitrogen', 'potassium', 'p2o5', 'boron', 'zinc', 'sand', 'clay',
       'slit', 'parentsoil', 'crop', 'variety', 'UREA1', 'UREA2', 'UREA3',
       'DAP', 'MOP', 'organic', 'boron_fert', 'blast_risk', 'blb_risk',
       'sheath_risk', 'rice_disease_risk'],
      dtype='object')

The original (non-derived) data columns are: latitude, longitude, nearest_lat, distance, year, month, day, tp_mm, t2m_C, lat, lon, ph, organic_matter, and the other soil-property columns.