In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Days_to_Harvest,Yield_tons_per_hectare,Region_North,Region_South,Region_West,Soil_Type_Clay,...,Soil_Type_Peaty,Soil_Type_Sandy,Soil_Type_Silt,Crop_Cotton,Crop_Maize,Crop_Rice,Crop_Soybean,Crop_Wheat,Weather_Condition_Rainy,Weather_Condition_Sunny
0,897.077239,27.676966,0,1,122,6.555816,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
1,992.673282,18.026142,1,1,140,8.527341,0,1,0,1,...,0,0,0,0,0,1,0,0,1,0
2,147.998025,29.794042,0,0,106,1.127443,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,986.866331,16.64419,0,1,146,6.517573,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,730.379174,31.620687,1,1,110,7.248251,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0


In [3]:
df.shape

(1000000, 21)

In [4]:
X = df.drop(columns=['Yield_tons_per_hectare'])
y = df['Yield_tons_per_hectare']

In [5]:
X.head()

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Days_to_Harvest,Region_North,Region_South,Region_West,Soil_Type_Clay,Soil_Type_Loam,Soil_Type_Peaty,Soil_Type_Sandy,Soil_Type_Silt,Crop_Cotton,Crop_Maize,Crop_Rice,Crop_Soybean,Crop_Wheat,Weather_Condition_Rainy,Weather_Condition_Sunny
0,897.077239,27.676966,0,1,122,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
1,992.673282,18.026142,1,1,140,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0
2,147.998025,29.794042,0,0,106,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,986.866331,16.64419,0,1,146,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0
4,730.379174,31.620687,1,1,110,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X, y, train_size=0.8, random_state=30)

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
cols = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']
x_train[cols] = sc.fit_transform(x_train[cols])
x_test[cols] = sc.transform(x_test[cols])

In [8]:
from sklearn.decomposition import PCA
pc = PCA(n_components=0.95)
x_train=pc.fit_transform(x_train)
x_test = pc.transform(x_test)

In [9]:
pc.explained_variance_ratio_ *100

array([17.00848933, 16.96407297, 16.90985052,  5.6610808 ,  4.25012071,
        4.24365763,  4.23671214,  4.23451725,  2.83684256,  2.8326173 ,
        2.82905352,  2.82832331,  2.82342761,  2.82072433,  2.81937403,
        2.81678698])

#### Model 1

In [10]:
#modeling
from sklearn.linear_model import LinearRegression
mod = LinearRegression()
mod.fit(x_train,y_train)
print('intercept :',mod.intercept_)
print('coefficients :',mod.coef_)

#predection
x_train_pred = mod.predict(x_train)
x_test_pred = mod.predict(x_test)

#evolution
print('Train R2 :',mod.score(x_train,y_train))
print('Test R2 :',mod.score(x_test,y_test))

#CV Score
from sklearn.model_selection import cross_val_score
cv = cross_val_score(mod,x_train,y_train,cv=5,scoring='r2')
print('CV Score :',cv.mean())

intercept : 4.649028852498379
coefficients : [ 9.05271853e-01 -2.80332625e-01  8.99181994e-01 -1.93410840e-03
  1.43401697e+00  5.98525605e-01  6.89125437e-01 -8.93683483e-01
  4.99791470e-03  3.85883747e-03 -4.32855609e-03  1.01551565e-02
 -7.38823546e-03 -8.79135660e-03  3.55437531e-04  8.42921425e-03]
Train R2 : 0.9131823420094353
Test R2 : 0.9123829895179535
CV Score : 0.9131775453300459


In [11]:
from sklearn.metrics import mean_squared_error
Train_MSE=mean_squared_error(y_train,x_train_pred)
Test_MSE=mean_squared_error(y_test,x_test_pred)
print('Train RMSE :',Train_MSE**0.5)
print('Test RMSE :',Test_MSE**0.5)
cv = cross_val_score(mod,x_train,y_train,cv=5,scoring='neg_mean_squared_error')
cv = abs(cv.mean())
print ('CV Score :',cv**0.5)

Train RMSE : 0.4999959050043898
Test RMSE : 0.5014029900365977
CV Score : 0.5000067901656589


#### Model 2

In [12]:
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)

In [13]:
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [14]:
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [15]:
import statsmodels.api as sm
X_train_sm = sm.add_constant(x_train)
ols_model = sm.OLS(y_train, X_train_sm).fit()
ols_model.summary()

0,1,2,3
Dep. Variable:,Yield_tons_per_hectare,R-squared:,0.913
Model:,OLS,Adj. R-squared:,0.913
Method:,Least Squares,F-statistic:,525900.0
Date:,"Sun, 07 Dec 2025",Prob (F-statistic):,0.0
Time:,21:30:28,Log-Likelihood:,-580630.0
No. Observations:,800000,AIC:,1161000.0
Df Residuals:,799983,BIC:,1161000.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.6490,0.001,8316.415,0.000,4.648,4.650
0,0.9053,0.001,1621.674,0.000,0.904,0.906
1,-0.2803,0.001,-501.522,0.000,-0.281,-0.279
2,0.8992,0.001,1606.087,0.000,0.898,0.900
3,-0.0019,0.001,-1.999,0.046,-0.004,-3.76e-05
4,1.4340,0.001,1284.123,0.000,1.432,1.436
5,0.5985,0.001,535.556,0.000,0.596,0.601
6,0.6891,0.001,616.119,0.000,0.687,0.691
7,-0.8937,0.001,-798.799,0.000,-0.896,-0.891

0,1,2,3
Omnibus:,4.622,Durbin-Watson:,2.003
Prob(Omnibus):,0.099,Jarque-Bera (JB):,4.626
Skew:,0.005,Prob(JB):,0.099
Kurtosis:,2.995,Cond. No.,2.46


In [16]:
x_train = x_train.drop(columns=[14])
x_test = x_test.drop(columns=[14])

In [17]:
# modeling
from sklearn.linear_model import LinearRegression
mod = LinearRegression()
mod.fit(x_train,y_train)
print('intercept :',mod.intercept_)
print('coefficients :',mod.coef_)

#predection
x_train_pred = mod.predict(x_train)
x_test_pred = mod.predict(x_test)

#evolution
print('Train R2 :',mod.score(x_train,y_train))
print('Test R2 :',mod.score(x_test,y_test))

#CV Score
from sklearn.model_selection import cross_val_score
cv = cross_val_score(mod,x_train,y_train,cv=5,scoring='r2')
print('CV Score :',cv.mean())

intercept : 4.649028852498379
coefficients : [ 0.90527185 -0.28033262  0.89918199 -0.00193411  1.43401697  0.5985256
  0.68912544 -0.89368348  0.00499791  0.00385884 -0.00432856  0.01015516
 -0.00738824 -0.00879136  0.00842921]
Train R2 : 0.9131823347163759
Test R2 : 0.9123827053058121
CV Score : 0.9131777205207643


In [18]:
from sklearn.metrics import mean_squared_error
Train_MSE=mean_squared_error(y_train,x_train_pred)
Test_MSE=mean_squared_error(y_test,x_test_pred)
print('Train RMSE :',Train_MSE**0.5)
print('Test RMSE :',Test_MSE**0.5)
cv = cross_val_score(mod,x_train,y_train,cv=5,scoring='neg_mean_squared_error')
cv = abs(cv.mean())
print ('CV Score :',cv**0.5)

Train RMSE : 0.49999592600530046
Test RMSE : 0.5014038032616848
CV Score : 0.5000062846317986


#### model 3

In [19]:
X.head()

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Days_to_Harvest,Region_North,Region_South,Region_West,Soil_Type_Clay,Soil_Type_Loam,Soil_Type_Peaty,Soil_Type_Sandy,Soil_Type_Silt,Crop_Cotton,Crop_Maize,Crop_Rice,Crop_Soybean,Crop_Wheat,Weather_Condition_Rainy,Weather_Condition_Sunny
0,897.077239,27.676966,0,1,122,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
1,992.673282,18.026142,1,1,140,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0
2,147.998025,29.794042,0,0,106,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,986.866331,16.64419,0,1,146,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0
4,730.379174,31.620687,1,1,110,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [20]:
X = sm.add_constant(X)

In [21]:
import statsmodels.formula.api as smf
mod = sm.OLS(y,X).fit()
mod.summary()

0,1,2,3
Dep. Variable:,Yield_tons_per_hectare,R-squared:,0.913
Model:,OLS,Adj. R-squared:,0.913
Method:,Least Squares,F-statistic:,524900.0
Date:,"Sun, 07 Dec 2025",Prob (F-statistic):,0.0
Time:,21:30:36,Log-Likelihood:,-726340.0
No. Observations:,1000000,AIC:,1453000.0
Df Residuals:,999979,BIC:,1453000.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0020,0.004,0.535,0.592,-0.005,0.009
Rainfall_mm,0.0050,1.93e-06,2595.012,0.000,0.005,0.005
Temperature_Celsius,0.0199,6.93e-05,287.419,0.000,0.020,0.020
Fertilizer_Used,1.5002,0.001,1499.380,0.000,1.498,1.502
Irrigation_Used,1.1995,0.001,1198.784,0.000,1.198,1.201
Days_to_Harvest,2.664e-05,1.93e-05,1.382,0.167,-1.11e-05,6.44e-05
Region_North,0.0011,0.001,0.786,0.432,-0.002,0.004
Region_South,-0.0009,0.001,-0.667,0.505,-0.004,0.002
Region_West,-0.0012,0.001,-0.824,0.410,-0.004,0.002

0,1,2,3
Omnibus:,4.724,Durbin-Watson:,2.0
Prob(Omnibus):,0.094,Jarque-Bera (JB):,4.726
Skew:,0.005,Prob(JB):,0.0941
Kurtosis:,2.995,Cond. No.,5020.0


In [22]:
X=X.drop(columns=['Soil_Type_Loam'])

In [23]:
X=X.drop(columns=['Weather_Condition_Sunny'])

In [24]:
X=X.drop(columns=['Crop_Rice'])

In [25]:
X=X.drop(columns=['Soil_Type_Peaty'])

In [26]:
X=X.drop(columns=['Region_South'])

In [27]:
X=X.drop(columns=['Region_West'])

In [28]:
X=X.drop(columns=['Crop_Soybean'])

In [29]:
X=X.drop(columns=['Crop_Cotton'])

In [30]:
X=X.drop(columns=['Soil_Type_Silt'])

In [31]:
X=X.drop(columns=['Weather_Condition_Rainy'])

In [32]:
X=X.drop(columns=['Soil_Type_Sandy'])

In [33]:
X=X.drop(columns=['Days_to_Harvest'])

In [34]:
X=X.drop(columns=['Region_North'])

In [35]:
X=X.drop(columns=['Crop_Wheat'])

In [36]:
X=X.drop(columns=['Crop_Maize'])

In [37]:
X=X.drop(columns=['const'])

- Dropped these columns one by one due to high pvalue 
- Soil_Type_Loam, Weather_Condition_Sunny, Crop_Rice, Soil_Type_Peaty, Region_South, Region_West,
  Crop_Soybean, Crop_Cotton, Soil_Type_Silt, Weather_Condition_Rainy, Soil_Type_Sandy,    
  Days_to_Harvest, Region_North, Crop_Wheat, Crop_Maize

In [38]:
X.head()

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Soil_Type_Clay
0,897.077239,27.676966,0,1,0
1,992.673282,18.026142,1,1,1
2,147.998025,29.794042,0,0,0
3,986.866331,16.64419,0,1,0
4,730.379174,31.620687,1,1,0


In [39]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X, y, train_size=0.8, random_state=30)

In [40]:
from sklearn.preprocessing import StandardScaler
sc3 = StandardScaler()
cols = ['Rainfall_mm', 'Temperature_Celsius']
x_train[cols] = sc3.fit_transform(x_train[cols])
x_test[cols] = sc3.transform(x_test[cols])

In [41]:
x_train

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Soil_Type_Clay
646066,-1.526039,-1.495336,0,1,0
893735,1.358207,1.348140,1,1,0
807846,-0.555765,0.799324,1,0,0
588218,-0.363370,0.841162,0,0,1
637848,1.697655,-1.484819,1,1,0
...,...,...,...,...,...
570508,-1.647213,1.718299,0,0,0
885236,0.011948,1.226514,1,1,0
572333,1.660983,-0.981827,0,1,1
987557,-0.884488,0.280025,0,1,0


In [41]:
#modeling
from sklearn.linear_model import LinearRegression
mod3 = LinearRegression()
mod3.fit(x_train,y_train)
print('intercept :',mod3.intercept_)
print('coefficients :',mod3.coef_)

#predection
x_train_pred = mod3.predict(x_train)
x_test_pred = mod3.predict(x_test)

#evolution
print('Train R2 :',mod3.score(x_train,y_train))
print('Test R2 :',mod3.score(x_test,y_test))

#CV Score
from sklearn.model_selection import cross_val_score
cv = cross_val_score(mod3,x_train,y_train,cv=5,scoring='r2')
print('CV Score :',cv.mean())

intercept : 3.299621906082698
coefficients : [1.29866694 0.14353098 1.50011958 1.199025   0.00164237]
Train R2 : 0.9131807118429467
Test R2 : 0.912384530119696
CV Score : 0.9131780474325053


In [42]:
from sklearn.metrics import mean_squared_error
Train_MSE=mean_squared_error(y_train,x_train_pred)
Test_MSE=mean_squared_error(y_test,x_test_pred)
print('Train RMSE :',Train_MSE**0.5)
print('Test RMSE :',Test_MSE**0.5)
cv = cross_val_score(mod3,x_train,y_train,cv=5,scoring='neg_mean_squared_error')
cv = abs(cv.mean())
print ('CV Score :',cv**0.5)

Train RMSE : 0.5000005991689267
Test RMSE : 0.5013985818417076
CV Score : 0.5000053465470925


### model 3 is best model comparing to other model 

In [43]:
# Saving the trained model
import joblib
joblib.dump(mod3, 'R_model.joblib')

['R_model.joblib']

In [44]:
joblib.dump(sc3, 'scaler.joblib')

['scaler.joblib']

In [89]:
new_data = pd.concat([X, y], axis=1)

In [90]:
new_data.head()

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Soil_Type_Clay,Yield_tons_per_hectare
0,897.077239,27.676966,0,1,0,6.555816
1,992.673282,18.026142,1,1,1,8.527341
2,147.998025,29.794042,0,0,0,1.127443
3,986.866331,16.64419,0,1,0,6.517573
4,730.379174,31.620687,1,1,0,7.248251


In [46]:
new_data.to_csv("final_dataset.csv", index=False)