In [None]:
# gen4_top8_pipeline.py
# Full pipeline: generate Gen4 synthetic dataset (8 crops), train RF & XGBoost, evaluate, save CSV.
# Requires: numpy, pandas, scikit-learn, xgboost
# Run: python gen4_top8_pipeline.py  (or run in notebook)

import numpy as np
import pandas as pd
import random
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

# ---------- 1) Parameters ----------
N_SAMPLES = 16000   # final number of synthetic rows to generate (will be balanced)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# ---------- 2) Define crop list (top 8) ----------
CROPS = ["Rice", "Maize", "Groundnut", "Cotton", "Sugarcane", "Wheat", "Mustard", "Barley"]
SEASONS = ["Kharif", "Rabi", "Summer"]

# ---------- 3) Utility random function ----------
def rand(a, b):
    """Draw a single float uniformly between `a` and `b` from NumPy's global RNG.

    Reproducibility therefore depends on `np.random.seed(...)` having been
    called beforehand.
    """
    sample = np.random.uniform(low=a, high=b)
    return sample

# ---------- 4) Strong non-overlapping rules (Gen4) ----------
def assign_crop(season, rainfall, temp, clay, sand, nitrogen):
    """Label one synthetic sample with a crop using season-specific rules.

    For the given season the rules are tried top to bottom and the first
    match wins, so where two zones overlap the earlier rule takes priority.
    A sample that matches no rule receives the season default ("Wheat" for
    Rabi, "Maize" for Kharif and Summer). An unrecognised season yields a
    random pick from CROPS.

    Parameters
    ----------
    season : str
        "Rabi", "Kharif" or "Summer".
    rainfall, temp, clay, sand, nitrogen : numeric
        Feature values compared against the rule thresholds below.

    Returns
    -------
    str
        The crop name.
    """
    # Each rule is (crop, zero-argument predicate); predicates stay lazy so
    # later rules are only evaluated when earlier ones did not match.
    if season == "Rabi":
        ordered_rules = (
            # very cool, very low rain
            ("Wheat", lambda: rainfall < 450 and temp < 20),
            # cool, low rain, sandy
            ("Barley", lambda: rainfall < 600 and 10 <= temp <= 20 and sand > 40),
            # moderate rain and sandy-loam
            ("Mustard", lambda: 450 <= rainfall <= 750 and 15 <= temp <= 25 and 30 <= sand <= 60),
        )
        season_default = "Wheat"
    elif season == "Kharif":
        ordered_rules = (
            # extremely high rainfall + clayey
            ("Rice", lambda: rainfall > 1600 and clay > 30 and 25 <= temp <= 33),
            # high N + warm + high rainfall
            ("Sugarcane", lambda: nitrogen > 140 and 1100 < rainfall < 2000 and 24 <= temp <= 32),
            # mid-high rainfall, moderate temp
            ("Maize", lambda: 900 <= rainfall <= 1600 and 20 <= temp <= 32),
            # sandy (>45), moderate rainfall
            ("Groundnut", lambda: sand > 45 and 700 < rainfall < 1400 and 26 <= temp <= 34),
            # hot and relatively dry
            ("Cotton", lambda: temp >= 34 and rainfall < 900),
        )
        season_default = "Maize"
    elif season == "Summer":
        ordered_rules = (
            # summer maize
            ("Maize", lambda: 800 <= rainfall <= 1500 and 28 <= temp <= 38),
            # sandy + warm
            ("Groundnut", lambda: sand > 45 and 500 < rainfall < 1200 and 28 <= temp <= 36),
            # very hot + dry
            ("Cotton", lambda: temp > 35 and rainfall < 1000),
        )
        season_default = "Maize"
    else:
        # Unknown season: no rules apply, pick any crop.
        return random.choice(CROPS)

    for crop_name, matches in ordered_rules:
        if matches():
            return crop_name
    return season_default

# ---------- 5) Generate synthetic rows ----------
MAX_CLAY_PLUS_SAND = 95.0   # keeps silt >= 5 % while Clay + Sand + Silt == 100
TOPUP_BATCH = 50000         # rows drawn per top-up round for scarce crops


def draw_labelled_rows(n_rows):
    """Draw `n_rows` random soil/climate samples and label them with assign_crop.

    All features are drawn uniformly from fixed ranges using NumPy's global
    RNG. Clay and sand are scaled down proportionally whenever their sum
    would exceed MAX_CLAY_PLUS_SAND, so the three texture fractions always
    add up to 100 %.

    Returns a DataFrame with the feature columns, "Season" and "Crop".
    """
    clay = np.random.uniform(10, 60, size=n_rows)      # %
    sand = np.random.uniform(10, 70, size=n_rows)      # %
    coarse = clay + sand                               # always >= 20, safe to divide by
    shrink = np.where(coarse > MAX_CLAY_PLUS_SAND, MAX_CLAY_PLUS_SAND / coarse, 1.0)
    clay = clay * shrink
    sand = sand * shrink

    frame = pd.DataFrame({
        "pH": np.random.uniform(5.0, 8.5, size=n_rows),
        "Nitrogen": np.random.uniform(10, 250, size=n_rows),
        "Clay": clay,
        "Sand": sand,
        "Silt": 100.0 - clay - sand,
        "SOC": np.random.uniform(0.2, 3.5, size=n_rows),        # %
        "BDOD": np.random.uniform(0.5, 1.8, size=n_rows),
        "CEC": np.random.uniform(5, 50, size=n_rows),
        "Rainfall": np.random.uniform(200, 3000, size=n_rows),  # mm
        "Temperature": np.random.uniform(10, 40, size=n_rows),  # degC
        "Humidity": np.random.uniform(30, 95, size=n_rows),
        "Solar": np.random.uniform(80, 400, size=n_rows),
        "Season": np.random.choice(SEASONS, size=n_rows).tolist(),
    })

    # The rules are scalar, so label row by row; .tolist() hands plain Python
    # values to assign_crop, which is faster than iterating NumPy scalars.
    frame["Crop"] = [
        assign_crop(*features)
        for features in zip(
            frame["Season"].tolist(),
            frame["Rainfall"].tolist(),
            frame["Temperature"].tolist(),
            frame["Clay"].tolist(),
            frame["Sand"].tolist(),
            frame["Nitrogen"].tolist(),
        )
    ]
    return frame


# Generate a bit more than needed up front; scarce crops are topped up below.
n_gen = int(N_SAMPLES * 1.3)
df_syn = draw_labelled_rows(n_gen)

# ---------- 6) Balance dataset: equal samples per crop ----------
per_class = N_SAMPLES // len(CROPS)

# Some rule zones are narrow, so the first draw rarely contains per_class
# rows for every crop. Keep drawing fresh batches and retain only rows of
# crops that are still short, so every class is filled with unique samples
# instead of copies of the few rows that happened to match.
max_total_draws = 150 * n_gen   # hard stop in case a zone is (nearly) unreachable
pool_parts = [df_syn]
class_counts = Counter(df_syn["Crop"])
total_draws = n_gen
while total_draws < max_total_draws:
    short_crops = [crop for crop in CROPS if class_counts[crop] < per_class]
    if not short_crops:
        break
    batch = draw_labelled_rows(TOPUP_BATCH)
    total_draws += len(batch)
    useful = batch[batch["Crop"].isin(short_crops)]
    pool_parts.append(useful)
    class_counts.update(useful["Crop"])

df_pool = pd.concat(pool_parts, ignore_index=True)

balanced_rows = []
for crop in CROPS:
    df_c = df_pool[df_pool["Crop"] == crop]
    if df_c.empty and per_class > 0:
        raise RuntimeError(
            "No rows could be generated for crop '%s'; check the rules in assign_crop." % crop
        )
    # Last resort only: upsampling duplicates rows, and duplicates can end up
    # on both sides of a later train/test split.
    still_short = len(df_c) < per_class
    if still_short:
        print("WARNING: only", len(df_c), "unique rows for", crop,
              "- upsampling with replacement to", per_class)
    balanced_rows.append(
        df_c.sample(per_class, replace=still_short, random_state=RANDOM_STATE)
    )

# Concatenate and shuffle
df_balanced = (
    pd.concat(balanced_rows)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# Save dataset
df_balanced.to_csv("synthetic_merged.csv", index=False)
print("Synthetic dataset saved to synthetic_merged.csv")
print("Class counts:", Counter(df_balanced["Crop"]))

Synthetic dataset saved to synthetic_merged.csv
Class counts: Counter({'Sugarcane': 2000, 'Groundnut': 2000, 'Cotton': 2000, 'Rice': 2000, 'Wheat': 2000, 'Barley': 2000, 'Maize': 2000, 'Mustard': 2000})


In [None]:
# -----------------------------------------------------------
# CROP SELECTION MODEL (Soil + Climate → Crop Recommendation)
# -----------------------------------------------------------

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the merged dataset
# Relative path: the generation cell writes "synthetic_merged.csv" into the
# working directory, so this also runs outside Colab's /content.
df = pd.read_csv("synthetic_merged.csv")

# Exact duplicate rows can land on both sides of the train/test split and
# inflate the test score, so surface them instead of training on them silently.
n_duplicates = int(df.duplicated().sum())
if n_duplicates:
    print("Note:", n_duplicates, "of", len(df),
          "rows are exact duplicates; the test accuracy below may be optimistic.")

# Earlier feature list, kept for reference only (not used below):
#   "phh2o", "nitrogen", "cec", "soc", "bdod",
#   "clay", "silt", "sand", "ocd",
#   "T2M", "T2M_MIN", "T2M_MAX", "PRECTOTCORR", "ALLSKY_SFC_SW_DWN",
#   "Season", "District"

# -------------------------------
# 2. Define Input and Target Columns
# -------------------------------

# Target column
y = df["Crop"]

# Input features (everything except the target)
X = df.drop(columns=["Crop"])

# Numeric columns are scaled; every remaining column is treated as
# categorical. Splitting on "numeric vs. the rest" avoids depending on the
# exact dtype pandas picks for text or integer columns.
numerical_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = [col for col in X.columns if col not in numerical_cols]

# --------------------------------
# 3. Preprocessing Blocks
# --------------------------------

preprocess = ColumnTransformer([
    ("num", StandardScaler(), numerical_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

# --------------------------------
# 4. Build Pipeline (Preprocessing + Model)
# --------------------------------

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        random_state=42
    ))
])

# --------------------------------
# 5. Train-Test Split
# --------------------------------

# Stratify so each crop keeps the same share in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------------
# 6. Train the Model
# --------------------------------

model.fit(X_train, y_train)

# --------------------------------
# 7. Evaluate Model
# --------------------------------

y_pred = model.predict(X_test)

print("\n🌾 Crop Selection Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))





🌾 Crop Selection Model Accuracy: 0.9959375

Classification Report:
               precision    recall  f1-score   support

      Barley       1.00      1.00      1.00       404
      Cotton       0.99      1.00      1.00       384
   Groundnut       0.99      1.00      0.99       369
       Maize       1.00      0.97      0.99       424
     Mustard       1.00      1.00      1.00       367
        Rice       1.00      1.00      1.00       440
   Sugarcane       0.99      1.00      1.00       412
       Wheat       1.00      1.00      1.00       400

    accuracy                           1.00      3200
   macro avg       1.00      1.00      1.00      3200
weighted avg       1.00      1.00      1.00      3200



In [None]:
# Distinct crop labels present in the loaded dataset.
pd.unique(df["Crop"])

array(['Sugarcane', 'Groundnut', 'Cotton', 'Rice', 'Wheat', 'Barley',
       'Maize', 'Mustard'], dtype=object)

In [None]:
import pickle

# Persist the fitted pipeline object to disk in binary pickle format.
with open("crop_selection_model.pkl", "wb") as model_file:
    pickle.dump(obj=model, file=model_file)

In [None]:
import pandas as pd

# The example must provide every column the model was fitted on, i.e. all
# columns of synthetic_merged.csv except "Crop":
# 'pH', 'Nitrogen', 'Clay', 'Sand', 'Silt', 'SOC', 'BDOD', 'CEC',
# 'Rainfall', 'Temperature', 'Humidity', 'Solar', 'Season'
#
# Values must be in the same units/ranges as the generated training data
# (pH 5.0-8.5, SOC 0.2-3.5, BDOD 0.5-1.8, CEC 5-50). The previous literals
# (pH 68, SOC 200, BDOD 130, CEC 280) were far outside those ranges; they
# look like integer-scaled raw soil-grid units — TODO confirm the source
# units before feeding real measurements to this model.
example_data = pd.DataFrame({
    "pH": [6.8],
    "Nitrogen": [250.0],
    "Clay": [35.0],
    "Sand": [35.0],
    "Silt": [30.0],
    "SOC": [2.0],
    "BDOD": [1.3],
    "CEC": [28.0],
    "Rainfall": [1500.0],
    "Temperature": [28.0],
    "Humidity": [75.0],
    "Solar": [250.0],
    "Season": ["Summer"]
})

print("\nRecommended Crop:", model.predict(example_data))


Recommended Crop: ['Maize']
