In [1]:
# # Intelligent Water Recommendation System: A Gradient Boosting Approach
# 
# ### Project Goal
# To develop a highly accurate regression model that predicts the optimal volume of water required for a single irrigation event, based on the crop type, soil characteristics, and seasonal water needs.
# 
# ### Novelty for Research
# Instead of a simple lookup, we will use a **Gradient Boosting Regressor**. This advanced ensemble technique builds a strong predictive model by iteratively adding weak learner models (typically decision trees). Each new tree is trained to correct the errors of the combined existing ensemble, making it exceptionally powerful for tabular data.
# 
# This approach allows the model to learn complex, non-linear relationships between the features (like crop and soil type) and the target (water volume), providing a more dynamic and intelligent recommendation than a static schedule. This methodology is well-suited for a research paper on smart irrigation systems.

# ## Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib

print("Libraries imported successfully!")

# ## Step 2: Load and Prepare the Dataset
# The irrigation schedule is read from `dataset/Crop_Groundwater_Irrigation_Schedule.csv`. The column we want to predict is `Volume per Irrigation (L/ha)`.

# Read the CSV into `df`. A missing file is reported on stdout and leaves
# `df` set to None so that later steps can check for it before running.
try:
    df = pd.read_csv('dataset/Crop_Groundwater_Irrigation_Schedule.csv')
except FileNotFoundError:
    df = None
    print("Error: 'Crop_Groundwater_Irrigation_Schedule.csv' not found. Please ensure the file is in the 'dataset/' directory.")
else:
    print("Irrigation schedule dataset loaded successfully!")

if df is not None:
    # Everything below needs the dataset, so it only runs when loading succeeded.

    # Drop columns that are not useful for this specific prediction
    df = df.drop(['Recommended Method', 'Pump Runtime per Irrigation (hrs/ha) at 15 L/s'], axis=1)

    # Display first 5 rows and info
    print("\nFirst 5 rows of the dataset:")
    print(df.head())
    print("\nDataset Information:")
    df.info()

    # ## Step 3: Define Features, Target, and Preprocessing Pipeline
    # We will build a robust preprocessing pipeline to handle both text-based (categorical) and numerical data automatically. This is the same best-practice approach used for the fertilizer model.

    # Separate features (X) and target (y)
    X = df.drop('Volume per Irrigation (L/ha)', axis=1)
    y = df['Volume per Irrigation (L/ha)']

    # Identify numerical and categorical features.
    # Categorical features are defined as "every column that is not numeric"
    # rather than "every object-dtype column": a non-numeric column stored with
    # another dtype (pandas string dtype, category, bool, ...) would otherwise
    # match neither list, fall through to the ColumnTransformer remainder, and
    # reach the regressor without being encoded.
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = [col for col in X.columns if col not in numerical_features]

    print(f"\nNumerical features: {numerical_features}")
    print(f"Categorical features: {categorical_features}")

    # Create preprocessing pipelines for both feature types
    numerical_transformer = StandardScaler()
    # handle_unknown='ignore': a category not seen during fit is encoded as all
    # zeros at prediction time instead of raising an error.
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Create a preprocessor object using ColumnTransformer.
    # The two lists above cover every column of X, so nothing is left for the
    # remainder at fit time; the setting is kept unchanged from the original.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # ## Step 4: Build, Train, and Evaluate the Gradient Boosting Model
    # We will now define our model and wrap it in a full pipeline with the preprocessor.

    # Define the model. The hyperparameters are fixed by hand; no tuning or
    # search is performed in this cell.
    gbr = GradientBoostingRegressor(
        n_estimators=500,       # More trees can improve accuracy
        learning_rate=0.05,     # A smaller learning rate often leads to better results
        max_depth=4,            # Deeper trees can capture more complexity
        random_state=42         # Fixed seed so repeated runs give the same model
    )

    # Create the full model pipeline: preprocessing followed by the regressor,
    # so raw feature rows can be passed straight to fit/predict.
    water_model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', gbr)
    ])

    # Split the data: 80% train / 20% test, with a fixed seed for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\nTraining the Gradient Boosting Regressor model...")
    # Train the pipeline (the preprocessor is fitted on the training split only)
    water_model_pipeline.fit(X_train, y_train)
    print("Training complete!")

    # Make predictions on the held-out test split
    y_pred = water_model_pipeline.predict(X_test)

    # Evaluate the model's performance
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print("\nModel Evaluation:")
    print(f"R-squared (R2) Score: {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f} Litres/ha")
    print("\nInterpretation: An R2 score close to 1.0 indicates that the model explains a large portion of the variance in the data, making it highly reliable.")

    # ## Step 5: Save the Trained Pipeline for Web Deployment
    # We save the entire `water_model_pipeline` to a single file. This file contains the preprocessor and the trained model, ready for your web app.

    # Written to the current working directory.
    joblib.dump(water_model_pipeline, 'water_recommendation_pipeline.pkl')
    print("\nComplete water recommendation pipeline saved as 'water_recommendation_pipeline.pkl'")


Libraries imported successfully!
Irrigation schedule dataset loaded successfully!

First 5 rows of the dataset:
        Crop Typical Soil Type  Seasonal Water Requirement (mm/season)  \
0      apple              Loam                                    1050   
1     banana         Clay Loam                                    1700   
2  blackgram              Loam                                     475   
3   chickpea             Sandy                                     350   
4    coconut             Sandy                                    2000   

   Season Length (days)  Irrigation Frequency (days)  \
0                   365                            9   
1                   365                           12   
2                    90                            9   
3                    90                            4   
4                   365                            4   

   Number of Irrigations (per season)  Volume per Irrigation (L/ha)  
0                                  4