In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [None]:
# Read the crop-yield dataset from a CSV file in the working directory.
csv_path = "crop_yield.csv"
df = pd.read_csv(csv_path)

In [None]:
# Derived features, all computed from the raw columns only:
#   - rainfall divided by (days to harvest + 1)
#   - rainfall x temperature interaction
#   - sum of the fertilizer / irrigation flags after casting each to int
#   - squared and cubed polynomial terms
df = df.assign(
    rainfall_per_day=lambda d: d['Rainfall_mm'].div(d['Days_to_Harvest'].add(1)),
    temp_rain_interaction=lambda d: d['Rainfall_mm'].mul(d['Temperature_Celsius']),
    fert_irrig_score=lambda d: d['Fertilizer_Used'].astype(int).add(d['Irrigation_Used'].astype(int)),
    temp_squared=lambda d: d['Temperature_Celsius'].pow(2),
    rainfall_squared=lambda d: d['Rainfall_mm'].pow(2),
    days_squared=lambda d: d['Days_to_Harvest'].pow(2),
    temp_cubed=lambda d: d['Temperature_Celsius'].pow(3),
)

In [None]:
# Integer-encode each categorical column into a new '<column>_encoded' column.
# A separate LabelEncoder is fitted per column on the full dataframe.
categorical_columns = ('Region', 'Soil_Type', 'Crop', 'Weather_Condition')
for name in categorical_columns:
    encoder = LabelEncoder()
    df[f'{name}_encoded'] = encoder.fit_transform(df[name])

In [None]:
# Assemble the model inputs from named groups of columns; the concatenation
# order below defines the column order of X.
base_columns = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']
flag_columns = ['Fertilizer_Used', 'Irrigation_Used']
encoded_columns = [
    'Region_encoded', 'Soil_Type_encoded', 'Crop_encoded', 'Weather_Condition_encoded',
]
engineered_columns = [
    'rainfall_per_day', 'temp_rain_interaction', 'fert_irrig_score',
    'temp_squared', 'rainfall_squared', 'days_squared', 'temp_cubed',
]
features = base_columns + flag_columns + encoded_columns + engineered_columns

# Feature matrix and regression target.
X = df.loc[:, features]
y = df.loc[:, 'Yield_tons_per_hectare']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize the features (important for Ridge).
# The scaler learns its statistics from the training split only and the same
# transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Pick the regularization strength by 5-fold cross-validation over a
# logarithmic grid of candidates, using the scaled training split.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
ridge_cv = RidgeCV(alphas=alphas, cv=5).fit(X_train_scaled, y_train)
alpha_opt = ridge_cv.alpha_

In [None]:
# Fit the final Ridge regressor on the scaled training split using the
# alpha selected by cross-validation.
model = Ridge(alpha=alpha_opt).fit(X_train_scaled, y_train)

In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test_scaled)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape_pct = mean_absolute_percentage_error(y_test, y_pred) * 100

# Report the chosen alpha followed by the test-set metrics.
print(f"Optimal alpha: {alpha_opt}")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f} tons/ha")
print(f"RMSE: {rmse:.4f} tons/ha")
print(f"MAPE: {mape_pct:.2f}%")

Optimal alpha: 10.0
R² Score: 0.9130
MAE: 0.3996 tons/ha
RMSE: 0.5008 tons/ha
MAPE: 12.65%


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib

# Raw input columns consumed by the exported pipeline. Anything loading the
# saved file must supply a dataframe with exactly these column names.
numeric_features = [
    'Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest'
]
categorical_features = [
    'Region', 'Soil_Type', 'Crop', 'Weather_Condition'
]

# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' encodes a category that was not
# seen during fit as all zeros instead of raising at prediction time.
preprocess = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    # Only the columns listed above are passed to fit() below, so there is no
    # remainder to pass through; 'drop' states that explicitly.
    remainder='drop'
)

# Pipeline.fit() fits its steps in place (it does not clone them). Use a fresh
# Ridge with the cross-validated alpha rather than the already-fitted `model`,
# so the estimator that was evaluated on the test split is not silently
# refitted on a different feature matrix.
pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', Ridge(alpha=alpha_opt))
])

# Fit preprocessing and regressor together on the raw (unscaled, unencoded)
# columns of the full dataset.
# NOTE: this uses every row (train and test) and a different feature set from
# the model evaluated earlier, so the metrics printed above do not describe
# the exported pipeline. alpha_opt was also selected on the earlier feature
# representation -- TODO confirm it is still appropriate here.
pipeline.fit(df[numeric_features + categorical_features], y)

# Persist the fitted pipeline (preprocessing + regressor) as a single file.
joblib.dump(pipeline, 'ridge_regression_pipeline_model_final.pkl')

print("Full pipeline including preprocessing and trained model saved to 'ridge_regression_pipeline_model_final.pkl'")

Full pipeline including preprocessing and trained model saved to 'ridge_regression_pipeline_model_final.pkl'


In [None]:
# Offer the saved pipeline file as a browser download when running in Google
# Colab. The `google.colab` package is only importable inside Colab, so in any
# other environment skip the download instead of aborting the notebook run.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of 'ridge_regression_pipeline_model_final.pkl'")
else:
    files.download('ridge_regression_pipeline_model_final.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>