In [1]:
import warnings

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [2]:
# Read the crop-yield records from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="crop_yield.csv")

In [3]:
# Keep only the predictor columns and the target ("Yield") used below
selected_columns = [
    "Crop",
    "Crop_Year",
    "Season",
    "State",
    "Annual_Rainfall",
    "Yield",
]
data = data.loc[:, selected_columns]

In [4]:
# One-hot encode the categorical columns; each becomes a set of
# "<column>_<value>" indicator columns
data = pd.get_dummies(
    data,
    columns=["Crop", "Season", "State"],
)

In [5]:
# Separate the target column (y) from the remaining feature columns (X)
y = data.loc[:, "Yield"]
X = data.drop(columns=["Yield"])

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Standardize the Annual_Rainfall feature
scaler = StandardScaler()
# Work on explicit copies: the frames returned by train_test_split are derived
# from X by row selection, and assigning a column onto them can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
# Fit the scaler on the training rows only, then reuse the fitted parameters
# for the test rows. The scaler is fed a plain (n, 1) array and the result is
# flattened so a 1-D vector is assigned to the column.
X_train["Annual_Rainfall"] = scaler.fit_transform(X_train[["Annual_Rainfall"]].to_numpy()).ravel()
X_test["Annual_Rainfall"] = scaler.transform(X_test[["Annual_Rainfall"]].to_numpy()).ravel()
# NOTE: this cell overwrites X_train/X_test, so running it a second time
# without re-running the split would scale already-scaled values.

In [8]:
# Instantiate the two base regressors, both seeded and using 100 estimators
random_forest = RandomForestRegressor(random_state=42, n_estimators=100)
xgboost = XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1)

In [9]:
# Combine the two base models (random forest and XGBoost) in a VotingRegressor
ensemble_model = VotingRegressor(
    estimators=[
        ('RandomForest', random_forest),
        ('XGBoost', xgboost),
    ]
)

In [10]:
# Fit the ensemble on the training features and targets
ensemble_model.fit(X=X_train, y=y_train)

In [11]:
# Predict yields for the held-out test rows
predictions = ensemble_model.predict(X=X_test)

In [12]:
# Evaluate the ensemble on the held-out test set using the R-squared score.
# r2_score is imported in the notebook's top import cell.
r2 = r2_score(y_test, predictions)
print("R-squared:", r2)

R-squared: 0.9747497999227719


User input

In [17]:
# Function to preprocess user input and make predictions
def predict_yield(user_input):
    """Predict yield for user-supplied records with the trained ensemble.

    Relies on three notebook-level objects: ``X`` (the feature frame whose
    columns define the model's expected layout), ``scaler`` (already fitted
    on ``Annual_Rainfall``) and ``ensemble_model`` (already fitted).

    Parameters
    ----------
    user_input : pandas.DataFrame
        Raw records containing the categorical columns ``Crop``, ``Season``
        and ``State`` plus the other feature columns. Any feature column of
        ``X`` that is absent after encoding is filled with 0.

    Returns
    -------
    The value returned by ``ensemble_model.predict`` for the encoded rows.

    Warns
    -----
    UserWarning
        If encoding produces columns that are not in ``X`` (for example a
        category value that does not exactly match one seen in training).
        Such columns are dropped, so the corresponding one-hot group is all
        zeros and the prediction may be unreliable.
    """
    # Encode categorical variables in user input
    user_input_encoded = pd.get_dummies(user_input, columns=["Crop", "Season", "State"])

    # Surface columns the model has never seen instead of dropping them silently
    unseen_columns = [col for col in user_input_encoded.columns if col not in X.columns]
    if unseen_columns:
        warnings.warn(
            "Ignoring input columns not present in the training features: "
            + ", ".join(map(str, unseen_columns)),
            UserWarning,
            stacklevel=2,
        )

    # Align to the training layout in one step: add missing columns as 0,
    # drop unseen ones, and put everything in training column order.
    # reindex returns a new frame, so the assignment below does not touch a slice.
    user_input_encoded = user_input_encoded.reindex(columns=X.columns, fill_value=0)

    # Scale Annual_Rainfall using the same scaler as in training
    rainfall = user_input_encoded[["Annual_Rainfall"]].to_numpy()
    user_input_encoded["Annual_Rainfall"] = scaler.transform(rainfall).ravel()

    # Use the ensemble model to make predictions for user input
    user_predictions = ensemble_model.predict(user_input_encoded)

    return user_predictions

In [18]:
# Example record to score; keys match the raw feature column names
user_input = dict(
    Crop="Potato",
    Crop_Year=2022,
    Season="Whole Year",
    State="Assam",
    Annual_Rainfall=2050.0,
)

In [19]:
# Wrap the single record in a list so it becomes a one-row DataFrame
user_input_df = pd.DataFrame(data=[user_input])

In [20]:
# Run the one-row frame through the preprocessing + prediction helper
user_yield_predictions = predict_yield(user_input=user_input_df)

In [21]:
# Only one record was submitted, so report the first (and only) prediction
first_prediction = user_yield_predictions[0]
print("Predicted Yield:", first_prediction)

Predicted Yield: 8.210484076567294
