# Pipeline Creation

In [1]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
# Calls drive.mount (imported from google.colab above) with '/content/drive' as
# the mount point; the dataset path used later in this notebook lives under
# /content/drive/MyDrive, so this cell has to run before the data is read.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Path of the cleaned AgriYield CSV inside the mounted Drive folder.
# NOTE(review): the data-loading cell assigns this same path again right before
# reading it, so this standalone assignment looks redundant — confirm before removing.
dataset_path = "/content/drive/MyDrive/AgriYield_cleaned.csv"

In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [6]:
# ================== IMPORTS ==================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from catboost import CatBoostRegressor, Pool

# ================== LOAD DATA ==================
# Location of the cleaned dataset on the mounted Google Drive.
dataset_path = "/content/drive/MyDrive/AgriYield_cleaned.csv"

# Read the entire CSV into a DataFrame using pandas' default parsing options.
df_clean = pd.read_csv(filepath_or_buffer=dataset_path)

# Quick confirmation plus the (rows, columns) tuple of the loaded frame.
print("‚úÖ Dataset Loaded Successfully")
print("Shape:", df_clean.shape)

# Bare trailing expression: the notebook renders the first five rows.
df_clean.head(5)


‚úÖ Dataset Loaded Successfully
Shape: (25495, 12)


Unnamed: 0,Date,Crop_Type,Soil_Type,Soil_pH,Temperature,Humidity,Wind_Speed,N,P,K,Crop_Yield,Soil_Quality
0,2014-01-01,Corn,Loamy,6.5,20.052576,79.947424,8.591577,84.0,66.0,50.0,104.87131,66.666667
1,2014-01-01,Barley,Sandy,6.75,19.751848,80.0,2.682683,50.0,40.0,30.0,58.939796,35.0
2,2014-01-01,Soybean,Peaty,5.5,16.110395,80.0,7.69607,49.5,45.0,38.5,32.970413,22.166667
3,2014-01-01,Cotton,Sandy,6.75,14.826739,80.0,10.366657,55.0,44.0,36.0,29.356115,39.375
4,2014-01-01,Tomato,Clay,6.25,18.323272,80.0,8.198084,60.0,45.0,40.0,22.221375,42.291667


In [7]:
# Detect date-like columns by name, skipping columns that are already numeric.
# The numeric check keeps this cell safe to re-run: it creates
# "<col>_year", "<col>_month" and "<col>_day" columns whose names also contain
# "date". Matching on the name alone would pick those integer columns up on a
# second run, parse them as datetimes, and replace them with meaningless
# "<col>_year_year"-style features.
date_cols = [
    col
    for col in df_clean.columns
    if "date" in col.lower()
    and not pd.api.types.is_numeric_dtype(df_clean[col])
]

for col in date_cols:
    # errors='coerce' turns unparseable values into NaT instead of raising;
    # those rows end up with missing year/month/day values.
    parsed_dates = pd.to_datetime(df_clean[col], errors='coerce')

    # Extract calendar components as numeric features.
    df_clean[col + "_year"] = parsed_dates.dt.year
    df_clean[col + "_month"] = parsed_dates.dt.month
    df_clean[col + "_day"] = parsed_dates.dt.day

    # Remove the original raw date column; only the derived parts are kept.
    df_clean = df_clean.drop(columns=[col])


In [8]:
# Separate the prediction target from the feature matrix.
y = df_clean["Crop_Yield"]
X = df_clean.drop(columns=["Crop_Yield"])

# Names of the feature columns stored as text or pandas categoricals; this list
# is passed to the model's fit call as cat_features further down.
cat_features = list(X.select_dtypes(include=["object", "category"]).columns)
print("Categorical Columns:", cat_features)

# Hold out 20% of the rows for evaluation, with a fixed random_state so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)


Categorical Columns: ['Crop_Type', 'Soil_Type']


INITIALIZE CATBOOST MODEL

In [15]:
# Hyperparameters gathered in one mapping so they can be read (and tuned) in a
# single place, then unpacked into the regressor's constructor.
catboost_params = {
    "iterations": 500,
    "learning_rate": 0.05,
    "depth": 8,
    "loss_function": 'RMSE',
    "random_seed": 42,   # fixed seed for repeatable training runs
    "verbose": 100,      # the recorded training log shows a line every 100 iterations
}

model = CatBoostRegressor(**catboost_params)

In [17]:
# ================== TRAIN ==================
# Fit on the training split; cat_features names the categorical columns.
model.fit(X_train, y_train, cat_features=cat_features)


# ================== PREDICT ==================
y_pred = model.predict(X_test)


# ================== METRICS ==================
# All metrics are computed on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Adjusted R^2: penalises R^2 by the number of predictors.
# n = number of test rows, p = number of feature columns.
n, p = X_test.shape
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)


# ================== DISPLAY RESULTS ==================
# Build the report line by line and emit it with a single print call.
divider = "---------------------------------"
report_lines = [
    "\nüéØ Model Performance Metrics",
    divider,
    f"MAE       : {mae:.4f}",
    f"MSE       : {mse:.4f}",
    f"RMSE      : {rmse:.4f}",
    f"R2 Score  : {r2:.4f}",
    f"Adj R2    : {adj_r2:.4f}",
    divider,
]
print("\n".join(report_lines))

0:	learn: 21.6601124	total: 18.7ms	remaining: 9.33s
100:	learn: 5.2170588	total: 2.95s	remaining: 11.7s
200:	learn: 4.6553145	total: 5.01s	remaining: 7.45s
300:	learn: 4.4943526	total: 6.55s	remaining: 4.33s
400:	learn: 4.3918829	total: 8.09s	remaining: 2s
499:	learn: 4.3014109	total: 9.62s	remaining: 0us

üéØ Model Performance Metrics
---------------------------------
MAE       : 3.2239
MSE       : 20.9802
RMSE      : 4.5804
R2 Score  : 0.9585
Adj R2    : 0.9584
---------------------------------
