In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LassoCV, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import joblib

In [None]:
# Load the raw crop-yield table.
# The path is relative, so crop_yield.csv must sit in the notebook's working directory.
csv_path = "crop_yield.csv"
df = pd.read_csv(csv_path)

In [None]:
# Basic feature engineering.
# Only columns that exist in the DataFrame are used; features that would need
# 'Days_to_Harvest', 'Temperature_Celsius' or 'Irrigation_Used' were dropped.
rainfall = df['Annual_Rainfall']
fertilizer_int = df['Fertilizer'].astype(int)  # cast drops any fractional part

df['annual_rainfall_squared'] = rainfall ** 2
df['fertilizer_annual_rainfall_interaction'] = fertilizer_int * rainfall
# NOTE(review): this is just 'Fertilizer' cast to int, so it is almost collinear
# with the raw column that is also used as a feature.
df['fertilizer_score'] = fertilizer_int

In [None]:
# Encode categorical variables as integer codes.
# The fitted encoders are kept in `label_encoders` and written to disk: without
# them the string -> integer mapping learned here cannot be reproduced at
# prediction time, and the saved model would be unusable on raw inputs.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which a linear model will treat as numeric — consider one-hot encoding.
label_encoders = {
    col: LabelEncoder().fit(df[col])
    for col in ('State', 'Season', 'Crop')
}
joblib.dump(label_encoders, 'label_encoders.pkl')

for col, encoder in label_encoders.items():
    df[col + '_encoded'] = encoder.transform(df[col])

In [None]:
# Assemble the model inputs (X) and the regression target (y).
# The list order matters: it defines the column order the scaler and model see.
# NOTE(review): if 'Yield' is derived from 'Production' and 'Area', keeping
# 'Production' as a predictor leaks the target — confirm against the dataset.
raw_columns = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
engineered_columns = [
    'annual_rainfall_squared',
    'fertilizer_annual_rainfall_interaction',
    'fertilizer_score',
]
encoded_columns = ['State_encoded', 'Season_encoded', 'Crop_encoded']

features = raw_columns + engineered_columns + encoded_columns
X = df[features]
y = df['Yield']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features.
# Scaling statistics are learned from the training rows only, then the same
# transformation is applied to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Pick the L1 penalty strength by 5-fold cross-validation.
# Candidates: 50 log-spaced alphas from 1e-4 to 1e1.
alphas = np.logspace(-4, 1, 50)
lasso_cv = LassoCV(
    alphas=alphas,
    cv=5,
    max_iter=10000,
    n_jobs=-1,  # use every available core for the CV folds
    random_state=42,
).fit(X_train_scaled, y_train)

best_alpha = lasso_cv.alpha_
print(f"Optimal alpha found: {best_alpha:.6f}")

Optimal alpha found: 1.930698


In [None]:
# Fit the final Lasso model on the scaled training data using the alpha chosen above.
lasso_params = dict(alpha=best_alpha, max_iter=10000, random_state=42)
model = Lasso(**lasso_params)
model.fit(X_train_scaled, y_train)

In [None]:
# Predictions & Evaluation
# Score both splits with the fitted model so train and test errors can be compared.
y_train_pred, y_test_pred = (
    model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

def print_metrics(name, y_true, y_pred):
    """Print R², MAE, RMSE and MAPE for one data split.

    Args:
        name: Label printed in the header line (e.g. "Training").
        y_true: Observed target values (array-like).
        y_pred: Predicted target values, aligned with ``y_true``.

    MAPE divides each error by the true value, so samples whose target is zero
    blow the metric up to astronomically large numbers. It is therefore computed
    only over samples with a non-zero target, and the sample count is reported.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    print(f"\n{name} Performance:")
    print(f" R² Score: {r2_score(y_true, y_pred):.4f}")
    print(f" MAE: {mean_absolute_error(y_true, y_pred):.4f} tons/ha")
    print(f" RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f} tons/ha")

    # Same threshold sklearn uses to clamp the denominator; anything at or
    # below it is effectively a zero target and has no meaningful percentage error.
    nonzero = np.abs(y_true) > np.finfo(np.float64).eps
    if nonzero.any():
        mape = mean_absolute_percentage_error(y_true[nonzero], y_pred[nonzero]) * 100
        print(
            f" MAPE: {mape:.2f}% "
            f"(over {int(nonzero.sum())} of {nonzero.size} samples with non-zero target)"
        )
    else:
        print(" MAPE: undefined (all targets are zero)")

# Report the same metric set for the training split first, then the test split.
for split_name, actual, predicted in (
    ("Training", y_train, y_train_pred),
    ("Testing", y_test, y_test_pred),
):
    print_metrics(split_name, actual, predicted)


Training Performance:
 R² Score: 0.3237
 MAE: 140.4031 tons/ha
 RMSE: 718.7992 tons/ha
 MAPE: 284086323608024544.00%

Testing Performance:
 R² Score: 0.4021
 MAE: 138.0035 tons/ha
 RMSE: 692.1689 tons/ha
 MAPE: 283298793894942048.00%


In [None]:

# Feature selection summary.
# Lasso can shrink coefficients to exactly zero; a feature counts as "selected"
# when its absolute coefficient exceeds a small numerical tolerance.
coefficients = np.asarray(model.coef_)
selected_mask = np.abs(coefficients) > 1e-10

non_zero_count = int(selected_mask.sum())
zero_count = len(coefficients) - non_zero_count
print("\nFeature Selection:")
print(f" Total features: {len(coefficients)}")
print(f" Selected features (non-zero coefficients): {non_zero_count}")
print(f" Eliminated features (zero coefficients): {zero_count}")

# Selected features in their original column order.
selected_features = [name for name, keep in zip(features, selected_mask) if keep]

# "Top" features are ranked by coefficient magnitude. The model was fitted on
# standardised inputs, so magnitudes are comparable across features.
ranking = np.argsort(-np.abs(coefficients), kind="stable")
top_features = [features[i] for i in ranking if selected_mask[i]][:10]
print(f"Top selected features: {top_features}")


Feature Selection:
 Total features: 11
 Selected features (non-zero coefficients): 10
 Eliminated features (zero coefficients): 1
Top selected features: ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'annual_rainfall_squared', 'fertilizer_annual_rainfall_interaction', 'State_encoded', 'Season_encoded', 'Crop_encoded']


In [None]:
# Save the fitted model, the scaler, the feature list and the chosen alpha,
# each to its own .pkl file in the working directory.
artifacts = {
    'lasso_regression_model.pkl': model,
    'scaler.pkl': scaler,
    'feature_names.pkl': features,
    'lasso_optimal_alpha.pkl': best_alpha,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("\nSaved model and preprocessing objects as .pkl files.")

# Browser downloads only exist inside Google Colab. Import lazily and skip the
# download step elsewhere so the notebook still runs top to bottom locally.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    for filename in artifacts:
        files.download(filename)


Saved model and preprocessing objects as .pkl files.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Combine all necessary objects into a dictionary.
# NOTE: the LabelEncoders behind the '*_encoded' feature columns are not part of
# this package; the same category -> integer mapping is needed to score new data.
model_package = {
    'model': model,
    'scaler': scaler,
    'features': features,
    'optimal_alpha': best_alpha
}

# Save the combined package to a single .pkl file
package_filename = 'crop_yield_prediction_model.pkl'
joblib.dump(model_package, package_filename)

print(f"All model components saved to '{package_filename}'.")

# Download the combined package file when running in Google Colab; elsewhere
# the module is unavailable, so the file is simply left in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download(package_filename)

All model components saved to 'crop_yield_prediction_model.pkl'.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>