In [13]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# diagnostics that pandas / scikit-learn may emit from later cells.
warnings.simplefilter('ignore')

In [14]:
# Read the prepared dataset from the working directory and preview the first rows.
DATA_PATH = 'final_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Soil_Type_Clay,Yield_tons_per_hectare
0,897.077239,27.676966,0,1,0,6.555816
1,992.673282,18.026142,1,1,1,8.527341
2,147.998025,29.794042,0,0,0,1.127443
3,986.866331,16.64419,0,1,0,6.517573
4,730.379174,31.620687,1,1,0,7.248251


In [15]:
# Separate the regression target from the predictors.
target_col = 'Yield_tons_per_hectare'
y = df[target_col]                    # target: the yield column
X = df.drop(columns=[target_col])     # predictors: every remaining column

In [16]:
# 80/20 train/test split; the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=30,
)

In [17]:
# Standardise the two continuous columns; all other columns are left as they are.
from sklearn.preprocessing import StandardScaler

# Work on explicit copies. The frames returned by train_test_split are row
# selections of X, and assigning columns into them is the pattern that makes
# pandas emit SettingWithCopyWarning (hidden here by the global warnings filter).
x_train = x_train.copy()
x_test = x_test.copy()

sc = StandardScaler()
cols = ['Rainfall_mm', 'Temperature_Celsius']
# Fit the scaler on the training split only, then reuse the same statistics for
# the test split so that no test-set information enters the scaling.
x_train[cols] = sc.fit_transform(x_train[cols])
x_test[cols] = sc.transform(x_test[cols])

In [18]:
# Modelling: baseline Lasso with scikit-learn's default regularisation strength.
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

mod = Lasso()
mod.fit(x_train, y_train)
print('intercept :', mod.intercept_)
print('Coefficient :', mod.coef_)

# Prediction: these two names are read again by the RMSE cell that follows.
x_train_pred, x_test_pred = mod.predict(x_train), mod.predict(x_test)

# Evaluation: R2 on the training split and on the held-out split.
for label, features, target in (
    ('Train R2 :', x_train, y_train),
    ('Test R2 :', x_test, y_test),
):
    print(label, mod.score(features, target))

# Cross-validation: mean R2 over 5 folds of the training split.
cv = cross_val_score(mod, x_train, y_train, cv=5, scoring='r2')
print('CV Score :', cv.mean())

intercept : 4.649028852498379
Coefficient : [ 0.29830729  0.          0.          0.         -0.        ]
Train R2 : 0.2380932772038289
Test R2 : 0.23755253339952687
CV Score : 0.23808483055484278


In [19]:
# RMSE of the current model on train, test, and 5-fold cross-validation.
from sklearn.metrics import mean_squared_error

Train_MSE = mean_squared_error(y_true=y_train, y_pred=x_train_pred)
Test_MSE = mean_squared_error(y_true=y_test, y_pred=x_test_pred)
print('Train RMSE :', Train_MSE**0.5)
print('Test RMSE :', Test_MSE**0.5)

# The scorer returns negated MSE per fold: average the folds, drop the sign,
# then take the square root to report an RMSE.
cv = abs(
    cross_val_score(
        mod, x_train, y_train, cv=5, scoring='neg_mean_squared_error'
    ).mean()
)
print ('CV Score :', cv**0.5)

Train RMSE : 1.481198683090463
Test RMSE : 1.4791004422361296
CV Score : 1.4812039295863642


In [22]:
# Find the best alpha for Lasso with a 5-fold cross-validated grid search.
from sklearn.model_selection import GridSearchCV

est = Lasso()
# Each candidate alpha is listed exactly once; a duplicated value would only
# repeat identical fits without changing which alpha is selected.
listt = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
# Scoring is negated MSE, so the highest score corresponds to the lowest CV error.
# Note: after this line `mod` refers to the search object, not a bare Lasso model.
mod = GridSearchCV(est, listt, cv=5, scoring='neg_mean_squared_error')
mod.fit(x_train, y_train)
# NOTE(review): the output saved with this cell shows an alpha that is not in
# this grid, so it appears to come from an earlier run - re-run before relying on it.
mod.best_params_

{'alpha': 1e-09}

In [23]:
# Preview the first five rows of the training features after scaling.
x_train.head(n=5)

Unnamed: 0,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Soil_Type_Clay
646066,-1.526039,-1.495336,0,1,0
893735,1.358207,1.34814,1,1,0
807846,-0.555765,0.799324,1,0,0
588218,-0.36337,0.841162,0,0,1
637848,1.697655,-1.484819,1,1,0


In [24]:
# Drop four features from both splits. Per the columns shown by x_train.head(),
# this leaves Rainfall_mm as the only predictor.
# NOTE(review): presumably these were chosen because the default-alpha Lasso
# zeroed their coefficients - confirm this still holds at the tuned alpha.
dropped_cols = ['Temperature_Celsius', 'Fertilizer_Used', 'Irrigation_Used', 'Soil_Type_Clay']
# errors='ignore' keeps the cell safe to re-run: once the columns are gone, a
# second plain drop would raise KeyError.
x_train = x_train.drop(columns=dropped_cols, errors='ignore')
x_test = x_test.drop(columns=dropped_cols, errors='ignore')

In [27]:
# Refit Lasso with a small, hard-coded alpha on the reduced feature set.
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

mod = Lasso(alpha=0.001)
mod.fit(x_train, y_train)
print('intercept :', mod.intercept_)
print('Coefficient :', mod.coef_)

# Prediction: stored under these names for the RMSE cell that follows.
x_train_pred = mod.predict(x_train)
x_test_pred = mod.predict(x_test)

# Evaluation: R2 on the training split and on the held-out split.
train_r2 = mod.score(x_train, y_train)
print('Train R2 :', train_r2)
test_r2 = mod.score(x_test, y_test)
print('Test R2 :', test_r2)

# Cross-validation: mean R2 over 5 folds of the training split.
cv = cross_val_score(estimator=mod, X=x_train, y=y_train, cv=5, scoring='r2')
print('CV Score :', cv.mean())

intercept : 4.649028852498379
Coefficient : [1.29730729]
Train R2 : 0.5853692502465024
Test R2 : 0.5819143747494858
CV Score : 0.5853585107618291


In [28]:
# RMSE of the refitted model on train, test, and 5-fold cross-validation.
from sklearn.metrics import mean_squared_error

Train_MSE = mean_squared_error(y_true=y_train, y_pred=x_train_pred)
Test_MSE = mean_squared_error(y_true=y_test, y_pred=x_test_pred)
print('Train RMSE :', Train_MSE**0.5)
print('Test RMSE :', Test_MSE**0.5)

# Per-fold scores are negated MSE: average them, drop the sign, then take the
# square root so the figure is comparable with the two RMSE values above.
fold_scores = cross_val_score(
    mod, x_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
cv = abs(fold_scores.mean())
print ('CV Score :', cv**0.5)

Train RMSE : 1.0926804376344086
Test RMSE : 1.0952804148269397
CV Score : 1.0926871193984515
