In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Getting dataset

In [4]:
dataset = pd.read_csv("data\\crop_production.csv")

Checking for null values

In [5]:
dataset.isnull().sum()

State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [6]:
dataset.dropna(inplace = True) #Removing null values from the dataset
dataset = dataset[dataset['Production'] != 0] #Removing values where production is zero

In [7]:
dataset = pd.get_dummies(data = dataset) #Converting categorical data to dummy variables

In [8]:
dataset.head() #Display first 5 rows of dataset

Unnamed: 0,Crop_Year,Area,Production,State_Name_Andaman and Nicobar Islands,State_Name_Andhra Pradesh,State_Name_Arunachal Pradesh,State_Name_Assam,State_Name_Bihar,State_Name_Chandigarh,State_Name_Chhattisgarh,...,Crop_Tobacco,Crop_Tomato,Crop_Total foodgrain,Crop_Turmeric,Crop_Turnip,Crop_Urad,Crop_Varagu,Crop_Wheat,Crop_other misc. pulses,Crop_other oilseeds
0,2000,1254.0,2000.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,2.0,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2000,102.0,321.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2000,176.0,641.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,720.0,165.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Getting feature matrix and dependant variable

In [9]:
x = dataset.drop('Production', axis = 1)
y = dataset['Production']

Splitting data into training set and test set

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

Implementing Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

Predicting based on test data

In [10]:
y_pred = regressor.predict(x_test)

In [11]:
print(y_pred)

[  167186.21896808  -418653.1552122    110302.99546201 ...
 -1615564.26792578 -1049825.15177098  -788370.15529162]


Checking the R2 score

In [12]:
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

0.1800063413160291


R2 score with Linear Regression is 0.18

Predicting based on single data point

In [13]:
new_data = pd.DataFrame({'State_Name' : ['Maharashtra'], 'District_Name' : ['Pune'], 'Crop_Year' : [2024], 'Season' : ['Kharif'], 'Crop' : ['Rice'], 'Area' : [1000]})
categorical_columns = ['State_Name', 'District_Name', 'Season', 'Crop']
new_data = pd.get_dummies(new_data, columns = categorical_columns)
print(new_data)

for col in x_train.columns:
    if col not in new_data.columns:
        new_data[col] = 0

new_data = new_data.reindex(columns=x_train.columns, fill_value=0)

regressor.predict(new_data)

   Crop_Year  Area  State_Name_Maharashtra  District_Name_Pune  Season_Kharif  \
0       2024  1000                       1                   1              1   

   Crop_Rice  
0          1  


  new_data[col] = 0


array([-1515530.05656775])

Implementing with Random Forest Regression

In [14]:
from sklearn.ensemble import RandomForestRegressor
regressor_rfr = RandomForestRegressor()
regressor_rfr.fit(x_train, y_train)

In [None]:
y_pred_rfr = regressor_rfr.predict(x_test)

In [None]:
print(y_pred_rfr)

[1095.956  233.56  1076.2   ...    2.     662.4     42.37 ]


R2 score for random forest regressor

In [None]:
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred_rfr))

0.9551542231044067


R2 Score with Random Forest Regressor is 0.95

Saving Random Forest Regression Model

In [None]:
import joblib

joblib.dump(regressor_rfr, 'crop_rfr_model.pkl')

['C:\\Users\\siddh\\OneDrive\\Desktop\\ML\\Project\\crop_production.csv\\crop_rfr_model.pkl']

Loading saved Model

In [4]:
import joblib

regressor_rfr = joblib.load('crop_rfr_model.pkl')

In [None]:
y_pred_n = regressor_rfr.predict(x_test)

In [None]:
print(y_pred_n)

[1095.956  233.56  1076.2   ...    2.     662.4     42.37 ]


Predicting single datapoint

In [None]:
new_data = pd.DataFrame({'State_Name' : ['Maharashtra'], 'District_Name' : ['Ratnagiri'], 'Crop_Year' : [2030], 'Season' : ['Kharif'], 'Crop' : ['Mango'], 'Area' : [1500]})
categorical_columns = ['State_Name', 'District_Name', 'Season', 'Crop']
new_data = pd.get_dummies(new_data, columns = categorical_columns)
print(new_data)

for col in x_train.columns:
    if col not in new_data.columns:
        new_data[col] = 0

new_data = new_data.reindex(columns=x_train.columns, fill_value=0)

regressor_rfr.predict(new_data)

   Crop_Year  Area  State_Name_Maharashtra  District_Name_Ratnagiri  \
0       2030  1500                       1                        1   

   Season_Kharif  Crop_Mango  
0              1           1  


  new_data[col] = 0


array([3276.92])