In [73]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [79]:
# Read the crop-yield CSV and show summary statistics for its numeric columns
data = pd.read_csv(filepath_or_buffer="crop_yield.csv")
data.describe()

Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0
mean,2009.127584,179926.6,16435940.0,1437.755177,24103310.0,48848.35,79.954009
std,6.498099,732828.7,263056800.0,816.909589,94946000.0,213287.4,878.306193
min,1997.0,0.5,0.0,301.3,54.17,0.09,0.0
25%,2004.0,1390.0,1393.0,940.7,188014.6,356.7,0.6
50%,2010.0,9317.0,13804.0,1247.6,1234957.0,2421.9,1.03
75%,2015.0,75112.0,122718.0,1643.7,10003850.0,20041.7,2.388889
max,2020.0,50808100.0,6326000000.0,6552.7,4835407000.0,15750510.0,21105.0


In [53]:
#correlation_matrix = data.corr()
#plt.figure(figsize=(10, 6))
#sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
#plt.title("Correlation Matrix")
#plt.show()

In [54]:
# Sample a subset of the data
#sampled_data = data.sample(n=1000)
# Create the pair plot for the sampled data
#sns.pairplot(data=sampled_data, vars=["Annual_Rainfall", "Fertilizer", "Pesticide", "Yield"])
#plt.title("Pairplot of Numerical Features (Sampled Data)")
#plt.show()

In [55]:
#plt.figure(figsize=(12, 6))
#sns.countplot(data=data, x="Crop")
#plt.xticks(rotation=90)
#plt.title("Cropyield")
#plt.xlabel("Crop")
#plt.ylabel("Count")
#plt.show()

In [56]:
#plt.figure(figsize=(10, 6))
#sns.boxplot(data=data, x="Season", y="Yield")
#plt.title("Yield by Season")
#plt.xlabel("Season")
#plt.ylabel("Yield")
#plt.show()

In [57]:
#plt.figure(figsize=(10, 6))
#sns.scatterplot(data=data, x="Annual_Rainfall", y="Yield")
#plt.title("Annual Rainfall vs. Yield")
#plt.xlabel("Annual Rainfall")
#plt.ylabel("Yield")
#plt.show()

In [75]:
# One-hot encode the categorical columns.
# The guard keeps this cell safe to re-run: once `data` has been encoded the
# source columns are gone and a second get_dummies call would raise KeyError.
# If only some of the columns are present get_dummies still raises, so a
# genuinely malformed input is not hidden.
categorical_columns = ["Crop", "Season", "State"]
if any(column in data.columns for column in categorical_columns):
    data = pd.get_dummies(data, columns=categorical_columns)

In [76]:
# Separate the target (Yield) from the predictors (every remaining column)
# NOTE(review): X keeps Production and Area; if Yield was derived from them
# (e.g. Production / Area) this leaks the target into the features — confirm.
y = data["Yield"]
X = data.drop(columns=["Yield"])

In [77]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [80]:
# Standardize Annual_Rainfall using statistics from the training rows only.
# The raw values are read from X (never from X_train / X_test), so re-running
# this cell does not refit the scaler on already-scaled data, and .assign()
# returns new frames instead of writing into the frames produced by
# train_test_split (which can trigger pandas' SettingWithCopyWarning).
# NOTE(review): assumes X has unique index labels (e.g. the default RangeIndex
# from read_csv) and has not been rebuilt since the split — confirm.
scaler = StandardScaler()
rainfall_train = X.loc[X_train.index, "Annual_Rainfall"].to_numpy().reshape(-1, 1)
rainfall_test = X.loc[X_test.index, "Annual_Rainfall"].to_numpy().reshape(-1, 1)
X_train = X_train.assign(Annual_Rainfall=scaler.fit_transform(rainfall_train).ravel())
X_test = X_test.assign(Annual_Rainfall=scaler.transform(rainfall_test).ravel())

In [82]:
# Base learners for the ensemble: a random forest and an XGBoost regressor,
# both seeded so that training is reproducible
random_forest = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
xgboost = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

In [83]:
# Combine the two base models; VotingRegressor averages their predictions
ensemble_model = VotingRegressor(
    estimators=[
        ("RandomForest", random_forest),
        ("XGBoost", xgboost),
    ]
)

In [84]:
# Fit the ensemble (and therefore each base regressor) on the training split
ensemble_model.fit(X=X_train, y=y_train)

In [65]:
# Predict yields for the held-out test rows
predictions = ensemble_model.predict(X=X_test)

In [85]:
# Evaluate the ensemble on the held-out test set with the R-squared score
# (r2_score is imported with the other dependencies in the top import cell)
r2 = r2_score(y_true=y_test, y_pred=predictions)
print("R-squared:", r2)

R-squared: 0.9909246012063686


User input

In [67]:
# Function to preprocess user input and make predictions
def predict_yield(user_input):
    """Predict crop yield for user-supplied rows with the trained ensemble.

    The input is one-hot encoded, aligned to the training feature columns
    (``X.columns``), has ``Annual_Rainfall`` scaled with the fitted ``scaler``
    and is then passed to ``ensemble_model.predict``.

    Parameters
    ----------
    user_input : pandas.DataFrame
        One row per prediction. Must contain the ``Crop``, ``Season`` and
        ``State`` columns. Any training feature that is absent is filled
        with 0; a warning is emitted when that feature is not a one-hot
        indicator column.

    Returns
    -------
    The value returned by ``ensemble_model.predict`` for the aligned rows.

    Warns
    -----
    UserWarning
        If encoded input columns are unknown to the training data (for example
        a category value never seen in training) and are therefore dropped, or
        if non-indicator training features are missing and were set to 0.
    """
    # One-hot encode the same categorical columns that were encoded for training
    encoded = pd.get_dummies(user_input, columns=["Crop", "Season", "State"])

    # Columns the model has never seen are discarded by the alignment below,
    # which leaves that category's indicators all zero; make this visible.
    unknown = [name for name in encoded.columns if name not in X.columns]
    if unknown:
        warnings.warn(
            f"Ignoring input columns that were not seen during training: {unknown}",
            stacklevel=2,
        )

    # Zero is the correct fill for one-hot indicators (get_dummies emits bool
    # or uint8 columns, dtype kinds "b" / "u"); for any other feature it is a
    # fabricated value the caller should know about.
    zero_filled = [
        name
        for name in X.columns
        if name not in encoded.columns and X[name].dtype.kind not in "bu"
    ]
    if zero_filled:
        warnings.warn(
            f"Features missing from the input were filled with 0: {zero_filled}",
            stacklevel=2,
        )

    # Add missing columns as 0, drop unknown ones and match the training order
    aligned = encoded.reindex(columns=X.columns, fill_value=0)

    # Scale Annual_Rainfall with the scaler that was fitted on the training data
    rainfall = aligned["Annual_Rainfall"].to_numpy().reshape(-1, 1)
    aligned = aligned.assign(Annual_Rainfall=scaler.transform(rainfall).ravel())

    return ensemble_model.predict(aligned)

In [68]:
# Example query: a single crop / season / state combination.
# Only these five fields are supplied; predict_yield fills every other
# training feature with 0.
user_input = dict(
    Crop="Potato",
    Crop_Year=2022,
    Season="Whole Year",
    State="Assam",
    Annual_Rainfall=2050.0,
)

In [69]:
# Wrap the dictionary in a list so it becomes a one-row DataFrame
user_input_df = pd.DataFrame(data=[user_input])

In [70]:
# Run the one-row frame through the preprocessing + ensemble pipeline
user_yield_predictions = predict_yield(user_input=user_input_df)

In [71]:
# Show the prediction for the first input row
print("Predicted Yield:", user_yield_predictions[0])

Predicted Yield: 5.734729830408922
