In [3]:
import pandas as pd

# Read the crop-yield CSV into a DataFrame
csv_path = 'crop_yield.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows (rendered as the cell output)
df.head()


Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [4]:
# Convert categorical variables to numerical values (one-hot, first level dropped).
# `df` is rebound to the encoded frame, so when this cell is re-run the original
# categorical columns are already gone; encode only the ones still present so the
# cell is safe to run again instead of raising a KeyError.
categorical_cols = [col for col in ['Crop', 'Season', 'State'] if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define features and target variable
# NOTE(review): 'Production' remains a feature. If Yield is derived from
# Production and Area, this leaks the target into X — confirm before relying
# on the evaluation scores.
X = df.drop(columns=['Yield'])
y = df['Yield']

# Display the first few rows of the processed features
X.head()


Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Crop_Arhar/Tur,Crop_Bajra,Crop_Banana,Crop_Barley,...,State_Odisha,State_Puducherry,State_Punjab,State_Sikkim,State_Tamil Nadu,State_Telangana,State_Tripura,State_Uttar Pradesh,State_Uttarakhand,State_West Bengal
0,1997,73814.0,56708,2051.4,7024878.38,22882.34,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1997,6637.0,4685,2051.4,631643.29,2057.47,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1997,796.0,22,2051.4,75755.32,246.76,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1997,19656.0,126905000,2051.4,1870661.52,6093.36,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1997,1739.0,794,2051.4,165500.63,539.09,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Show the shape of each piece: (X_train, X_test, y_train, y_test)
tuple(part.shape for part in splits)


((15751, 94), (3938, 94), (15751,), (3938,))

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: learn the scaling statistics from the training set only,
# then apply that same fitted scaler to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the scaled training features (a plain array, so columns are numbered)
pd.DataFrame(X_train_scaled).head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93
0,0.294392,-0.2394,-0.063016,2.338277,-0.249961,-0.223506,-0.161629,-0.166525,-0.110204,-0.125699,...,-0.199345,-0.186237,-0.141458,-0.111086,-0.210283,-0.145149,-0.151629,-0.211269,-0.202084,-0.241057
1,-0.320896,-0.239772,-0.063019,-0.928899,-0.250474,-0.223985,-0.161629,-0.166525,-0.110204,7.955488,...,-0.199345,-0.186237,-0.141458,-0.111086,-0.210283,-0.145149,-0.151629,-0.211269,-0.202084,-0.241057
2,-1.551473,-0.239601,-0.063019,-0.676275,-0.250354,-0.22379,-0.161629,-0.166525,-0.110204,-0.125699,...,-0.199345,-0.186237,-0.141458,-0.111086,-0.210283,-0.145149,-0.151629,4.733303,-0.202084,-0.241057
3,0.755858,-0.212104,-0.062924,-0.180597,-0.217836,-0.192572,-0.161629,-0.166525,-0.110204,7.955488,...,-0.199345,-0.186237,-0.141458,-0.111086,-0.210283,-0.145149,-0.151629,-0.211269,4.94844,-0.241057
4,-1.090007,0.488643,-0.061559,-0.777619,0.288132,0.401437,-0.161629,-0.166525,-0.110204,-0.125699,...,-0.199345,-0.186237,-0.141458,-0.111086,-0.210283,-0.145149,-0.151629,-0.211269,-0.202084,-0.241057


In [7]:
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree random forest with a fixed seed and fit it on the scaled training data
# (fit returns the estimator itself, so `model` is the trained regressor)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Confirm that training finished
print("Model trained successfully.")


Model trained successfully.


In [8]:
# Predict yields for the scaled test features
y_pred = model.predict(X_test_scaled)

# Put actual and predicted values side by side and preview the first rows
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison.head()


Unnamed: 0,Actual,Predicted
18238,3.82,3.728489
6918,1.395357,1.374714
4894,6.522727,6.575779
10960,0.812857,0.960149
15615,0.508,0.557231


In [10]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the test-set predictions
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Pick a plain-language reading of R-squared: above 0.8, above 0.5, or anything else
interpretation = (
    "The model explains a significant portion of the variance in crop yields."
    if r2 > 0.8
    else "The model explains a moderate portion of the variance in crop yields."
    if r2 > 0.5
    else "The model explains a small portion of the variance in crop yields."
)
print(interpretation)


Mean Squared Error: 9860.629910453108
R-squared: 0.9876932859405553
The model explains a significant portion of the variance in crop yields.


# The purpose of this workflow is to predict crop yields with a RandomForestRegressor model. We first loaded the dataset and preprocessed it by encoding the categorical variables, then split the data into training and testing sets. Next, we standardized the features so that they share a uniform scale. The model was trained on the scaled training data, and its performance was assessed by making predictions on the scaled test set. Finally, we evaluated those predictions using the Mean Squared Error (MSE) and R-squared (R²) metrics to determine how accurately the model predicts crop yields and how much of the variance in the data it explains.