# 🌾 Crop Yield Prediction using Regression Models
**Models Used:** Random Forest & Extra Trees Regressors

**Features:** Categorical + Numerical with ColumnTransformer

✅ Drops `Production` column (not needed since `Yield` is the prediction target)
✅ Applies `OneHotEncoder` on categorical columns


In [None]:

# 1. Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib


In [None]:
# Load the crop-yield CSV from the current working directory.
df = pd.read_csv("crop_yield_extended.csv")
print("Dataset shape:", df.shape)
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

In [None]:
# (rows, columns) of the DataFrame.
df.shape

In [None]:
# Print each column's dtype and non-null count, plus memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Preview the last rows.
df.tail()

In [None]:
# Distinct values found in the State column.
df['State'].unique()

In [None]:
# Largest value recorded in the Production column.
df['Production'].max()

In [None]:
# Distinct values found in the Crop column.
df['Crop'].unique()

In [None]:
# Distinct values found in the Season column (exact spelling matters for later filters).
df['Season'].unique()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Yield against annual rainfall, one marker per record.
sns.scatterplot(data=df, x='Annual_Rainfall', y='Yield')
plt.show()

In [None]:
# Leave 2020 out of the yearly trend: its data is incomplete.
df_year = df[df['Crop_Year'] != 2020]

# Yearly totals of the numeric columns. numeric_only=True keeps text columns
# (e.g. Crop, Season) out of the aggregation; on pandas >= 2.0 a bare .sum()
# would concatenate every string in each group, which is slow and meaningless.
year_yield = df_year.groupby('Crop_Year').sum(numeric_only=True)

plt.figure(figsize=(12, 5))
plt.plot(year_yield.index, year_yield['Yield'], color='blue', linestyle='dashed', marker='o',
         markersize=12, markerfacecolor='yellow')
plt.xlabel('Year')
plt.ylabel('Yield')
plt.title('Measure of Yield over the year')
plt.show()

In [None]:
# Total cultivated area per year, read from the yearly totals in `year_yield`.
ax = plt.figure(figsize=(12, 3)).gca()
ax.plot(
    year_yield.index,
    year_yield['Area'],
    color='blue',
    linestyle='dashed',
    marker='o',
    markersize=12,
    markerfacecolor='red',
)
ax.set_xlabel('Year')
ax.set_ylabel('Area')
ax.set_title('Area under cultivation over the year')
plt.show()

In [None]:
# Total fertilizer use per year, read from the yearly totals in `year_yield`.
ax = plt.figure(figsize=(12, 3)).gca()
ax.plot(
    year_yield.index,
    year_yield['Fertilizer'],
    color='blue',
    linestyle='dashed',
    marker='o',
    markersize=12,
    markerfacecolor='green',
)
ax.set_xlabel('Year')
ax.set_ylabel('Fertilizer')
ax.set_title('Use of Fertilizer over the year')
plt.show()

In [None]:
# Total pesticide use per year, read from the yearly totals in `year_yield`.
ax = plt.figure(figsize=(12, 3)).gca()
ax.plot(
    year_yield.index,
    year_yield['Pesticide'],
    color='red',
    linestyle='dashed',
    marker='o',
    markersize=12,
    markerfacecolor='cyan',
)
ax.set_xlabel('Year')
ax.set_ylabel('Pesticide')
ax.set_title('Use of Pesticide over the Year')
plt.show()

Statewise Analysis:

In [None]:
import plotly.express as px

# Per-state totals of the numeric columns, largest total Yield first.
# numeric_only=True stops pandas >= 2.0 from concatenating the text columns.
df_state = (
    df.groupby('State')
    .sum(numeric_only=True)
    .sort_values(by='Yield', ascending=False)
)

# Constant category so that all states share a single x position in the chart.
df_state['Region'] = 'States'

fig = px.bar(df_state, x='Region', y='Yield', color=df_state.index, hover_data=['Yield'])
fig.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annual_Rainfall from the per-state aggregate `df_state`, one coloured bar per state.
plt.figure(figsize=(15, 8))
sns.barplot(
    data=df_state,
    x=df_state.index,
    y=df_state['Annual_Rainfall'],
    hue=df_state.index,
    palette='gnuplot',
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of annual rainfall for each crop year

import matplotlib.pyplot as plt   # For plotting charts like bar plots

# Create a figure with size: 15 inches wide, 6 inches tall
plt.figure(figsize=(15, 6))

# Years 1997 to 2020 (inclusive), used as the x-axis ticks
years = list(range(1997, 2021))

# Collapse the records to one value per year before drawing. Passing the raw
# columns to plt.bar draws one bar per record, all stacked at the same x
# position, so only the tallest bar of each year is visible anyway; the
# per-year maximum gives that same picture with one bar per year.
# NOTE(review): assumes Annual_Rainfall is never negative — confirm.
rain_by_year = df.groupby('Crop_Year')['Annual_Rainfall'].max()

# Bar chart with Crop_Year on the x-axis and Annual_Rainfall on the y-axis
plt.bar(rain_by_year.index, rain_by_year.values, color="skyblue")

# Axis labels
plt.xlabel('Crop_Year')
plt.ylabel('Annual_Rainfall')

# Show a tick for each year from 1997 to 2020
plt.xticks(years)

# Set the title of the chart
plt.title('Crop_Year vs Annual_Rainfall')

# Show the final plot
plt.show()

In [None]:
# Annual_Rainfall per state from `df_state`, with marker colour driven by Yield.
plt.figure(figsize=(12, 5))
sns.scatterplot(
    x=df_state.index,
    y=df_state['Annual_Rainfall'],
    hue=df_state['Yield'],
    palette='rainbow',
)
plt.title('Annual Rainfall across the States')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Fertilizer per state from `df_state`, with marker colour driven by Yield.
plt.figure(figsize=(15, 5))
sns.scatterplot(
    x=df_state.index,
    y=df_state['Fertilizer'],
    hue=df_state['Yield'],
    palette='spring',
)
plt.title('Use of Fertilizer in Different States')
plt.xticks(rotation=90)
plt.show()

Seasonwise Analysis:

In [None]:
# Drop the 'Whole Year ' records so only the individual seasons are compared.
# NOTE(review): the label is matched with its trailing space — presumably the
# raw Season values are space-padded; verify against df['Season'].unique().
df_Seas = df[df['Season'] != 'Whole Year ']

# Per-season totals of the numeric columns. numeric_only=True keeps pandas
# >= 2.0 from concatenating the text columns during the sum.
df_season = df_Seas.groupby('Season').sum(numeric_only=True)

fig = px.bar(df_season, y='Area', color=df_season.index, hover_data=['Area'], text='Area')
fig.show()

Cropwise Analysis:

In [None]:
# Records whose Yield is exactly zero.
df_yz = df[df['Yield'] == 0]
print(df_yz.shape)

# catplot is a figure-level seaborn function: it always builds its own figure
# (sized through height/aspect), so a preceding plt.figure() call would only
# leave an extra blank figure in the output and is deliberately omitted.
sns.catplot(y="State", x="Crop", data=df_yz, hue="Crop", aspect=3, palette='inferno')
plt.xticks(rotation=90)
plt.title('States and the Crops where yield is zero')
plt.show()

In [None]:
# Keep only the records with a positive yield.
df_ynz = df[df['Yield'] > 0]

# Per-crop totals of the numeric columns. numeric_only=True keeps pandas
# >= 2.0 from concatenating the text columns during the sum.
df_crop = df_ynz.groupby('Crop').sum(numeric_only=True)

plt.figure(figsize=(25, 8))
plt.plot(df_crop.index, df_crop['Fertilizer'], color='red', linestyle='dashed', marker='o',
         markersize=12, markerfacecolor='cyan')
plt.xlabel('Crops')
plt.ylabel('Fertilizer')
plt.title(' Use of Fertilizer in different Crops')
plt.xticks(rotation=30)
plt.show()

In [None]:
# Total cultivated area per crop, read from the per-crop totals in `df_crop`.
ax = plt.figure(figsize=(25, 8)).gca()
ax.plot(
    df_crop.index,
    df_crop['Area'],
    color='indigo',
    linestyle='dashed',
    marker='o',
    markersize=12,
    markerfacecolor='fuchsia',
)
ax.set_xlabel('Crops')
ax.set_ylabel('Area under cultivation')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Wheat records only, re-indexed from zero. Chaining reset_index (instead of
# calling it with inplace=True on the filtered slice) leaves df_wheat as an
# independent frame.
df_wheat = df[df['Crop'] == 'Wheat'].reset_index(drop=True)

# 2020 is excluded from the yearly trend.
df_wheat1 = df_wheat[df_wheat['Crop_Year'] != 2020]

# Yearly totals of the numeric columns; numeric_only=True keeps pandas >= 2.0
# from concatenating the text columns during the sum.
df_wheat_year = df_wheat1.groupby('Crop_Year').sum(numeric_only=True)

plt.figure(figsize=(12, 5))
plt.plot(df_wheat_year.index, df_wheat_year['Yield'], color='red', linestyle='dashed', marker='o',
         markersize=12, markerfacecolor='blue')
plt.xlabel('Year')
plt.ylabel('Yield')
plt.title('Yield of Wheat Crop over the Years')
plt.show()

In [None]:
# Working copy of the data without the Crop_Year and Pesticide columns.
df1 = df.copy()
df1 = df1.drop(columns=['Crop_Year', 'Pesticide'])

# Distribution of four numeric columns: (column, number of bins, colour).
hist_specs = [
    ('Area', 20, 'red'),
    ('Production', 10, 'green'),
    ('Annual_Rainfall', 10, 'blue'),
    ('Fertilizer', 10, 'black'),
]

plt.figure(figsize=(15, 20))
for position, (column, n_bins, shade) in enumerate(hist_specs, start=1):
    plt.subplot(4, 2, position)
    sns.histplot(df1[column], bins=n_bins, color=shade)
plt.show()

In [None]:
# One scatter plot of Yield per non-object column (the last column is skipped),
# with points coloured by Season.
for col in df.columns[:-1]:
    if df[col].dtypes == 'object':
        continue  # text column: nothing numeric to put on the x axis
    sns.scatterplot(data=df, x=col, y='Yield', hue='Season')
    plt.title(f'Distribution of {col} across Season')
    plt.show()

In [None]:
# Pie chart of how the records are distributed across seasons.

# Count the records per season once and take the labels from that same Series.
# value_counts() orders seasons by frequency while unique() returns them in
# order of first appearance, so pairing the two can attach the wrong season
# name to a slice.
season_counts = df['Season'].value_counts()

plt.pie(
    season_counts,
    labels=season_counts.index,
    autopct='%1.01f%%',                  # print the percentage on each slice (e.g. 12.3%)
    startangle=140,                      # rotate the start of the pie
    shadow=True,                         # draw a shadow under the pie
    explode=[0.1] * len(season_counts),  # offset every slice, however many seasons exist
)

# Add a title to the chart
plt.title('Crop Distribution by Season')

# Display the chart
plt.show()

## Outlier detection

In [None]:
def _boxplot(data, column, title):
    """Draw a single box plot of ``data[column]`` under the given title."""
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=data[column])
    plt.title(title)
    plt.show()


def handle_outliers_iqr(data, column, *, factor=1.5, plot=True):
    """Return ``data`` without the rows whose ``column`` value is an IQR outlier.

    A value is kept when it lies inside
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive), where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]``. Rows whose value
    is NaN fail both comparisons and are therefore dropped as well.

    Args:
        data: DataFrame to filter. It is not modified.
        column: Name of the numeric column to test.
        factor: Multiplier applied to the IQR when building the bounds.
        plot: When True, show a box plot of the column before and after the
            filtering. Pass False to skip plotting (e.g. in batch runs).

    Returns:
        A new, independent DataFrame holding only the rows within the bounds.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    print(f"\nOutliers in {column}: {(values < lower).sum() + (values > upper).sum()}")
    print(f"Lower bound: {lower}, Upper bound: {upper}")

    if plot:
        _boxplot(data, column, f"Before removing outliers - {column}")

    # .copy() detaches the result from the input frame, so callers can add
    # columns to it later without a SettingWithCopyWarning.
    kept = data[(values >= lower) & (values <= upper)].copy()

    if plot:
        _boxplot(kept, column, f"After removing outliers - {column}")

    return kept

# Filter the numeric columns one after another. Each pass receives the rows
# that survived the previous passes, so its bounds are computed on the
# already-reduced data.
for col in ('Production', 'Fertilizer', 'Annual_Rainfall', 'Area', 'Pesticide'):
    df = handle_outliers_iqr(df, col)

print("\nNew Shape after outlier removal:", df.shape)


Feature engineering:

In [None]:
# Derived features. assign() builds a new frame instead of writing columns
# into `df`, which at this point is the result of boolean filtering; writing
# into such a frame triggers pandas' SettingWithCopyWarning.
#   * the "+ 1" in the denominators avoids a division by zero when Area is 0
#   * log1p(x) = log(1 + x), so a Production of 0 maps to 0
# NOTE(review): the modelling cells read `df1`, which was copied from `df`
# before this point, so these columns do not reach the model as written —
# confirm whether that is intended.
df = df.assign(
    Fertilizer_per_area=df['Fertilizer'] / (df['Area'] + 1),
    Pesticide_per_area=df['Pesticide'] / (df['Area'] + 1),
    Production_log=np.log1p(df['Production']),
)

In [None]:
# Names of the object-dtype (text) columns of df1; these get one-hot encoded next.
category_columns = df1.select_dtypes(include = ['object']).columns
category_columns

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummy columns are not redundant.
df1 = pd.get_dummies(df1, columns = category_columns, drop_first=True)

In [None]:
# Preview the encoded frame.
df1.head()

TRAIN TEST SPLIT

In [None]:
# Target: Yield, kept as a one-column DataFrame.
y = df1[['Yield']]
# Predictors: every remaining column of df1.
x = df1.drop(columns=['Yield'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# (Seeds noted by the author: 0, 1, 42.)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform, applied to every column of the feature matrix.
# NOTE(review): x_train also contains the dummy columns produced by
# get_dummies — confirm they are meant to be power-transformed too.
pt = PowerTransformer(method='yeo-johnson')

# Fit only on training data, so nothing about the test rows influences the transform
x_train_transform1 = pt.fit_transform(x_train)

# Apply the same (already fitted) transformation on test data
x_test_transform1 = pt.transform(x_test)


In [None]:
# Save the fitted PowerTransformer as a joblib file
import os

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs("models", exist_ok=True)
joblib.dump(pt, "models/power_transformer_crop_yield.joblib")

In [None]:
# Wrap the transformed training array in a DataFrame that reuses the feature
# names of x_train, so the columns can be plotted by name.
df_trans = pd.DataFrame(x_train_transform1, columns=x_train.columns)

In [None]:
# After the power transformation no separate standardization step is applied.
# Distributions of four transformed columns: (column, number of bins, colour).
transformed_specs = [
    ('Area', 20, 'red'),
    ('Production', 10, 'green'),
    ('Annual_Rainfall', 10, 'fuchsia'),
    ('Fertilizer', 10, 'indigo'),
]

plt.figure(figsize=(15, 20))
for position, (column, n_bins, shade) in enumerate(transformed_specs, start=1):
    plt.subplot(4, 2, position)
    sns.histplot(df_trans[column], bins=n_bins, color=shade)

plt.show()

Modelling:

In [None]:
# Collect the scores appended by the modelling cells: the values stored are
# R² scores on the training and test sets respectively.
train_accu = []
test_accu = []

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the fitted forest, its scores and the saved model are
# reproducible from one run to the next.
regr = RandomForestRegressor(random_state=42)

# y_train is a one-column frame; flatten it to the 1-D shape the estimator
# expects (avoids scikit-learn's DataConversionWarning).
regr.fit(x_train_transform1, np.ravel(y_train))

y_pred_train_regr = regr.predict(x_train_transform1)
y_pred_test_regr = regr.predict(x_test_transform1)

# Score once and reuse the values. These are R² scores, despite the
# "Accuracy" wording of the printed labels.
train_r2 = r2_score(y_train, y_pred_train_regr)
test_r2 = r2_score(y_test, y_pred_test_regr)

print("Training Accuracy : ", train_r2)
print("Test Accuracy : ", test_r2)

train_accu.append(train_r2)
test_accu.append(test_r2)

In [None]:
# Save the trained RandomForestRegressor model as a joblib file
import os

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs("models", exist_ok=True)
joblib.dump(regr, "models/random_forest_crop_yield.joblib")

In [None]:
# Persist the ordered feature names of x (the training features DataFrame),
# so the same column layout can be rebuilt when the saved model is reused.
import os

import joblib

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs("models", exist_ok=True)
joblib.dump(list(x.columns), "models/crop_yield_feature_names.joblib")