In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r'/Users/satyamjadhav/Base/Codebases/CropYieldPredtnANN/crop_yield.csv')
df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [3]:
df.isnull().sum()

Crop               0
Crop_Year          0
Season             0
State              0
Area               0
Production         0
Annual_Rainfall    0
Fertilizer         0
Pesticide          0
Yield              0
dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [5]:
df.duplicated().sum()

0

In [6]:
x = df.drop(['Yield'], axis = 1)
y = df[['Yield']]

In [7]:
print(x.shape)
print(y.shape)

(19689, 9)
(19689, 1)


In [8]:
x.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09


In [9]:
y.head()

Unnamed: 0,Yield
0,0.796087
1,0.710435
2,0.238333
3,5238.051739
4,0.420909


In [10]:
# Identify categorical and numerical columns
categorical_cols = ['Crop', 'Season', 'State']
numerical_cols = ['Crop_Year','Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']

In [11]:
# Define the preprocessing steps for the pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('power', PowerTransformer(method='yeo-johnson')),
        ]), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

In [12]:
# Create a pipeline with preprocessor and model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [13]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [14]:
# Fit the model
model_pipeline.fit(x_train, y_train)

In [15]:
# # Evaluate the model
score = model_pipeline.score(x_test, y_test)
print(f'Model R^2 Score: {score:.2f}')

Model R^2 Score: 0.99


In [16]:
# Make predictions on training and testing sets
y_train_pred = model_pipeline.predict(x_train)
y_test_pred = model_pipeline.predict(x_test)

In [17]:
# Calculate and print metrics for training set
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

In [18]:
print("Training Set Metrics:")
print(f"R^2 Score: {train_r2:.2f}")
print(f"Mean Absolute Error (MAE): {train_mae:.2f}")
print(f"Mean Squared Error (MSE): {train_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {train_rmse:.2f}")

Training Set Metrics:
R^2 Score: 1.00
Mean Absolute Error (MAE): 2.85
Mean Squared Error (MSE): 2316.85
Root Mean Squared Error (RMSE): 48.13


In [19]:
# Calculate and print metrics for testing set
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

In [20]:
y_pred_train= model_pipeline.predict(x_train)
y_pred_test = model_pipeline.predict(x_test)

In [21]:
print("Training Accuracy : ",r2_score(y_train, y_pred_train))
print("Test Accuracy : ",r2_score(y_test, y_pred_test))
print("Model saved successfully!")

Training Accuracy :  0.9969671413003895
Test Accuracy :  0.9882693919315849
Model saved successfully!


In [22]:
print("\nTesting Set Metrics:")
print(f"R^2 Score: {test_r2:.2f}")
print(f"Mean Absolute Error (MAE): {test_mae:.2f}")
print(f"Mean Squared Error (MSE): {test_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {test_rmse:.2f}")


Testing Set Metrics:
R^2 Score: 0.99
Mean Absolute Error (MAE): 7.11
Mean Squared Error (MSE): 9399.03
Root Mean Squared Error (RMSE): 96.95


In [23]:
# Save the model to a .pkl file
# joblib.dump(model_pipeline, 'GPT_crop_yield_model2.pkl')
# print("Model saved successfully as 'GPT_crop_yield_model2.pkl'.")
pickle.dump(model_pipeline, open("model.pkl","wb"))

In [24]:
# Example prediction
def predict_yield(crop_year, area, production, rainfall, fertilizer, pesticide, crop, state, season):
    input_data = pd.DataFrame({
        'Crop_Year': [crop_year],
        'Area': [area],
        'Production': [production],
        'Annual_Rainfall': [rainfall],
        'Fertilizer': [fertilizer],
        'Pesticide': [pesticide],
        'Crop': [crop],
        'Season': [season],
        'State': [state]
    })
    predicted_yield = model_pipeline.predict(input_data)
    return predicted_yield[0]

In [25]:
# Example usage of the prediction function
predicted_yield = predict_yield(2018,5000, 3000, 800, 20000, 1500, 'Bajra', 'Kharif', 'Punjab')
print(f"Predicted Crop Yield: {predicted_yield:.2f}")

Predicted Crop Yield: 1.10


In [26]:
#Other  Rabi pulses,2016,Rabi       ,Jammu and Kashmir,548,297,902.8,83981,191.8,0.796666667
predicted_yield = predict_yield(2016,548,297,902.8,83981,191.8,'Other  Rabi pulses','Rabi','Jammu and Kashmir')
print(f"Predicted Crop Yield: {predicted_yield:.2f}")

Predicted Crop Yield: 0.56


In [27]:
# Arecanut,1997,Whole Year ,Assam,73814,56708,2051.4,7024878.38,22882.34,0.796086957
predicted_yield = predict_yield(1997,73814,56708,2051.4,7024878.38,22882.34,'Arecanut','Whole Year','Assam')
print(f"Predicted Crop Yield: {predicted_yield:.2f}")

Predicted Crop Yield: 0.83
