In [None]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import joblib

In [None]:
# Load the crop-yield dataset from a CSV located in the notebook's working directory.
df = pd.read_csv("crop_yield.csv")

# Report the frame's dimensions so a truncated or malformed load is obvious at a glance.
n_rows, n_cols = df.shape
print(f"✅ Dataset loaded with {n_rows:,} rows and {n_cols} columns")

✅ Dataset loaded with 1,000,000 rows and 10 columns


In [None]:
# Data-quality snapshot: total number of missing cells and of fully duplicated rows.
n_missing = df.isnull().sum().sum()
n_duplicates = df.duplicated().sum()
print(f"Missing values: {n_missing}, Duplicates: {n_duplicates}")

# Centre and spread of the regression target.
yield_values = df['Yield_tons_per_hectare']
print(f"Target yield mean: {yield_values.mean():.2f}, std: {yield_values.std():.2f}")

Missing values: 0, Duplicates: 0
Target yield mean: 4.65, std: 1.70


In [None]:
# Derived features, added as new columns on `df`.

# Average rainfall per growing day; the +1 in the denominator presumably guards
# against division by zero when Days_to_Harvest is 0 (TODO confirm the column's range).
df['rainfall_per_day'] = df['Rainfall_mm'] / (df['Days_to_Harvest'] + 1)

# Sum of the two management flags after casting each to int
# (0-2 if both columns are boolean / 0-1 flags; verify against the dataset).
df['fert_irrig_score'] = df['Fertilizer_Used'].astype(int) + df['Irrigation_Used'].astype(int)

# Temperature x rainfall interaction term.
df['temp_rain_product'] = df['Temperature_Celsius'] * df['Rainfall_mm']

print("✅ Created rainfall_per_day, fert_irrig_score, temp_rain_product")

✅ Created rainfall_per_day, fert_irrig_score, temp_rain_product


In [None]:
# Integer-encode every categorical column into a sibling "<column>_encoded" column.
# Each fitted encoder is kept in `label_encoders` (keyed by the original column
# name) so the same category -> integer mapping can be reused later.
categorical_cols = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le
print("✅ Categorical variables encoded")

✅ Categorical variables encoded


In [None]:
# Model inputs, grouped by origin. The concatenation order below defines the
# column order of the feature matrix.
measured_inputs = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']
management_flags = ['Fertilizer_Used', 'Irrigation_Used']
encoded_categoricals = [
    'Region_encoded', 'Soil_Type_encoded', 'Crop_encoded', 'Weather_Condition_encoded',
]
engineered_inputs = ['rainfall_per_day', 'fert_irrig_score', 'temp_rain_product']

features = [
    *measured_inputs,
    *management_flags,
    *encoded_categoricals,
    *engineered_inputs,
]

# Feature matrix and regression target.
X = df.loc[:, features]
y = df.loc[:, 'Yield_tons_per_hectare']



In [None]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"✅ Split data: {X_train.shape[0]} train samples, {X_test.shape[0]} test samples")

✅ Split data: 800000 train samples, 200000 test samples


In [None]:
# Baseline: a decision tree with default hyperparameters (seeded for reproducibility).
# Its train and test R² are printed side by side so the train/test gap is
# visible before any tuning.
baseline_model = DecisionTreeRegressor(random_state=42)
baseline_model.fit(X_train, y_train)
print(f"Baseline Train R²: {r2_score(y_train, baseline_model.predict(X_train)):.4f}")
print(f"Baseline Test R²: {r2_score(y_test, baseline_model.predict(X_test)):.4f}")


Baseline Train R²: 1.0000
Baseline Test R²: 0.8162


In [None]:
# Hyperparameter search for the decision tree.
# `time`, `GridSearchCV` and `DecisionTreeRegressor` come from the notebook's
# import cell, so they are not re-imported here.

# Search space, kept deliberately small to limit the number of fits.
param_grid = {
    'max_depth': [10, 15, 20],          # 25 was dropped to reduce combinations
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'min_impurity_decrease': [0.0],     # single value
    'max_features': ['sqrt'],           # single option to keep the grid simple
}

# Derive the candidate count from the grid itself so it cannot drift out of
# sync when a hyperparameter is added to or removed from `param_grid`.
total_combinations = int(np.prod([len(values) for values in param_grid.values()]))

print(f"Running GridSearch with {total_combinations} parameter combinations...")

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# unlike time.time() it is unaffected by system clock adjustments.
start = time.perf_counter()
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=3,              # 3 folds instead of 5 to cut runtime
    n_jobs=-1,
    scoring='r2',
    verbose=1
)
grid_search.fit(X_train, y_train)
duration = (time.perf_counter() - start) / 60  # elapsed minutes
print(f"GridSearch finished in {duration:.2f} minutes")
print("Best params:", grid_search.best_params_)
print(f"Best CV R²: {grid_search.best_score_:.4f}")


Running GridSearch with 12 parameter combinations...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
GridSearch finished in 1.44 minutes
Best params: {'max_depth': 10, 'max_features': 'sqrt', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best CV R²: 0.9013


In [None]:
# Take the estimator refit by the grid search and report its size.
model = grid_search.best_estimator_
print(f"Optimized tree depth: {model.get_depth()}, leaves: {model.get_n_leaves()}")

# Evaluate the tuned model: predictions for both splits first, metrics after.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Coefficient of determination.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Mean absolute error.
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)

# Root mean squared error (square root of the MSE).
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(y_train, y_train_pred)),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
)

# Mean absolute percentage error, scaled from a fraction to percent.
train_mape, test_mape = (
    mean_absolute_percentage_error(y_train, y_train_pred) * 100,
    mean_absolute_percentage_error(y_test, y_test_pred) * 100,
)

# 5-fold cross-validated R² on the training split, summarised as mean and std.
cv_scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=5, n_jobs=-1)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

Optimized tree depth: 10, leaves: 1021


In [None]:
# Summary report for the tuned model; MAE, RMSE and MAPE are the test-split values.
print("\nModel Evaluation:")
print(f"Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}")
print(f"MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, MAPE: {test_mape:.2f}%")
print(f"Cross-Val R²: {cv_mean:.4f} ± {cv_std:.4f}")
# Train-minus-test R²: a large positive gap indicates overfitting.
print(f"Overfitting Gap: {train_r2 - test_r2:.4f}")


Model Evaluation:
Train R²: 0.8997, Test R²: 0.8978
MAE: 0.4317, RMSE: 0.5427, MAPE: 13.55%
Cross-Val R²: 0.8971 ± 0.0043
Overfitting Gap: 0.0019


In [None]:
# `joblib` and `pandas` come from the notebook's import cell, so they are not
# re-imported here.

# Feature importance: pair each input column with the tree's importance score,
# most important first.
feature_names = X.columns.tolist()
fi = pd.DataFrame(
    {'Feature': feature_names, 'Importance': model.feature_importances_}
).sort_values(by='Importance', ascending=False)
print("\nTop 5 Important Features:")
print(fi.head(5).to_string(index=False))

# Save model and preprocessing objects as individual files.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, 'decision_tree_model.pkl')
joblib.dump(label_encoders, 'dt_label_encoders.pkl')
joblib.dump(feature_names, 'dt_feature_names.pkl')
joblib.dump(grid_search.best_params_, 'dt_best_params.pkl')
fi.to_csv('dt_feature_importance.csv', index=False)

# Bundle all relevant artifacts into a single dictionary and save as one PKL file.
all_artifacts = {
    'model': model,
    'label_encoders': label_encoders,
    'feature_names': feature_names,
    'best_params': grid_search.best_params_,
    'feature_importance_df': fi,  # the feature-importance frame itself, not a path
}
joblib.dump(all_artifacts, 'decision_tree_model_artifacts.pkl')
print("✅ Model and artifacts saved individually and bundled into 'decision_tree_model_artifacts.pkl'.")


Top 5 Important Features:
          Feature  Importance
temp_rain_product    0.447766
  Fertilizer_Used    0.186086
  Irrigation_Used    0.138831
      Rainfall_mm    0.131908
 rainfall_per_day    0.054626
✅ Model and artifacts saved individually and bundled into 'decision_tree_model_artifacts.pkl'.


In [None]:
# Offer the bundled artifacts as a browser download when running in Google Colab.
# `google.colab` is only importable inside the Colab runtime, so the import is
# guarded to keep "Restart & Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    files = None
    print("google.colab is not available; skipping browser download of "
          "'decision_tree_model_artifacts.pkl'.")
else:
    files.download('decision_tree_model_artifacts.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>