In [None]:
### AI in Agriculture: Crop Yield Prediction using Machine Learning

# **Workshop Notebook**
# This notebook will guide you through predicting crop yield using various Machine Learning models.

# **Step 2: Import required libraries**
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Dataset Information
For this hands-on session, we'll use a synthetic crop yield prediction dataset that we'll generate during the workshop. This approach eliminates the need for downloads and ensures consistent data for all participants.
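The dataset includes these columns:

* Rainfall (annual, cm)
* Average temperature (°C)
* Soil quality (1-10 scale)
* Fertilizer usage (kg/hectare)
* Crop type (Wheat, Rice, Maize, or Barley), stored both as a label and as an integer code
* Yield (tons per hectare) - target variable

Note: pesticide usage is not simulated in this synthetic dataset.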

In [None]:
# **Step 3: Generate Synthetic Crop Yield Dataset**
# A fixed seed makes every participant draw exactly the same data.
np.random.seed(42)
n_samples = 1000

# Environmental / agricultural inputs. The draw order below determines the
# random stream, so it must not be rearranged.
soil_quality = np.random.uniform(low=1, high=10, size=n_samples)        # score on a 1-10 scale
rainfall = np.random.uniform(low=50, high=300, size=n_samples)          # annual rainfall, cm
temperature = np.random.uniform(low=10, high=35, size=n_samples)        # average temperature, Celsius
fertilizer_usage = np.random.uniform(low=50, high=200, size=n_samples)  # kg per hectare
crop_names = ["Wheat", "Rice", "Maize", "Barley"]
crop_type = np.random.choice(crop_names, n_samples)

# Integer code per crop: the label's position in `crop_names`.
crop_mapping = {label: code for code, label in enumerate(crop_names)}
crop_encoded = np.fromiter(
    (crop_mapping[label] for label in crop_type), dtype=int, count=n_samples
)

# Target: crop yield (tons per hectare) = linear signal + Gaussian noise (sd 2).
noise = np.random.normal(0, 2, n_samples)
yield_amount = (
    2.5 * soil_quality
    + 0.05 * rainfall
    - 0.1 * temperature
    + 0.03 * fertilizer_usage
    + 1.5 * crop_encoded
    + noise
)

# Assemble everything into a single frame; column order is significant.
df = pd.DataFrame(
    {
        "Soil_Quality": soil_quality,
        "Rainfall": rainfall,
        "Temperature": temperature,
        "Fertilizer_Usage": fertilizer_usage,
        "Crop_Type": crop_type,
        "Crop_Type_Encoded": crop_encoded,
        "Crop_Yield": yield_amount,
    }
)



In [None]:
# **Step 4: Data Visualization**
# Histogram of the target variable with a kernel-density overlay.
plt.figure(figsize=(8, 6))
sns.histplot(df["Crop_Yield"], bins=30, kde=True)
plt.title("Distribution of Crop Yield")
plt.xlabel("Crop Yield (tons per hectare)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# **Step 5: Prepare Data for Modeling**
# Use the numeric columns as features; the string "Crop_Type" column is left out
# in favour of its integer-encoded copy.
X = df[["Soil_Quality", "Rainfall", "Temperature", "Fertilizer_Usage", "Crop_Type_Encoded"]]
y = df["Crop_Yield"]
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# **Step 6: Train and Evaluate Multiple Models**
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report its test-split metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target used to score the fitted model.
    model_name : str
        Label printed in the performance report.

    Returns
    -------
    float
        R² score of the predictions on the test split.

    Side effects
    ------------
    Fits ``model`` in place and prints MAE, MSE and R² to stdout.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} Performance:")
    print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, R² Score: {r2:.2f}\n")
    return r2

# Train various ML models
# All stochastic estimators share the same seed so results are reproducible.
# NOTE: the features are passed unscaled; RBF-kernel SVR is sensitive to
# feature scale, which can put it at a disadvantage in this comparison.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regression": SVR(kernel='rbf'),
    "XGBoost": XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
}

# Store results for comparison: model name -> R² score on the test split
results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_train, X_test, y_train, y_test, name)

In [None]:
# **Step 7: Compare Model Performance**
# One bar per model, in the insertion order of `results`.
plt.figure(figsize=(8, 5))
# Materialise the dict views as lists so the categorical x-axis receives a plain sequence.
plt.bar(list(results.keys()), list(results.values()), color=['blue', 'green', 'red', 'purple', 'orange'])
plt.ylabel("R² Score")
plt.title("Model Comparison: Crop Yield Prediction")
plt.xticks(rotation=30)
plt.show()

# **Step 8: Feature Importance Analysis (Random Forest)**
# Fit a fresh Random Forest so this step does not depend on the state of the
# models trained in Step 6.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# One importance value per feature column, in the same order as X.columns.
importances = rf_model.feature_importances_
feature_names = X.columns

plt.figure(figsize=(8, 5))
sns.barplot(x=importances, y=feature_names)
plt.title("Feature Importance in Crop Yield Prediction")
plt.xlabel("Importance Score")
plt.show()

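# **Conclusion:**
# We explored various machine learning models for crop yield prediction, compared their performance,
# and analyzed feature importance. On real-world data, Random Forest and XGBoost often provide the
# best results due to their ability to handle nonlinear relationships. Note, however, that the
# synthetic yield generated in this notebook is a linear combination of the features plus Gaussian
# noise, so Linear Regression is well suited to it and may match or outperform the tree-based models
# here. Further improvements can be made using deep learning techniques or more real-world data
# preprocessing.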