In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PowerTransformer,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score


In [59]:
df = pd.read_csv('crop_yield.csv')

df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Crop_Year,19689.0,2009.128,6.498099,1997.0,2004.0,2010.0,2015.0,2020.0
Area,19689.0,179926.6,732828.7,0.5,1390.0,9317.0,75112.0,50808100.0
Production,19689.0,16435940.0,263056800.0,0.0,1393.0,13804.0,122718.0,6326000000.0
Annual_Rainfall,19689.0,1437.755,816.9096,301.3,940.7,1247.6,1643.7,6552.7
Fertilizer,19689.0,24103310.0,94946000.0,54.17,188014.62,1234957.44,10003850.0,4835407000.0
Pesticide,19689.0,48848.35,213287.4,0.09,356.7,2421.9,20041.7,15750510.0
Yield,19689.0,79.95401,878.3062,0.0,0.6,1.03,2.388889,21105.0


In [60]:
category_columns = df.select_dtypes(include=['object']).columns
label_encoder_mapping = {}

for col in category_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoder_mapping[col] = dict(zip(le.classes_, le.transform(le.classes_)))

#print(label_encoder_mapping)

for i in label_encoder_mapping:
    print(i)
    print(label_encoder_mapping[i])
    print()

Crop
{'Arecanut': 0, 'Arhar/Tur': 1, 'Bajra': 2, 'Banana': 3, 'Barley': 4, 'Black pepper': 5, 'Cardamom': 6, 'Cashewnut': 7, 'Castor seed': 8, 'Coconut ': 9, 'Coriander': 10, 'Cotton(lint)': 11, 'Cowpea(Lobia)': 12, 'Dry chillies': 13, 'Garlic': 14, 'Ginger': 15, 'Gram': 16, 'Groundnut': 17, 'Guar seed': 18, 'Horse-gram': 19, 'Jowar': 20, 'Jute': 21, 'Khesari': 22, 'Linseed': 23, 'Maize': 24, 'Masoor': 25, 'Mesta': 26, 'Moong(Green Gram)': 27, 'Moth': 28, 'Niger seed': 29, 'Oilseeds total': 30, 'Onion': 31, 'Other  Rabi pulses': 32, 'Other Cereals': 33, 'Other Kharif pulses': 34, 'Other Summer Pulses': 35, 'Peas & beans (Pulses)': 36, 'Potato': 37, 'Ragi': 38, 'Rapeseed &Mustard': 39, 'Rice': 40, 'Safflower': 41, 'Sannhamp': 42, 'Sesamum': 43, 'Small millets': 44, 'Soyabean': 45, 'Sugarcane': 46, 'Sunflower': 47, 'Sweet potato': 48, 'Tapioca': 49, 'Tobacco': 50, 'Turmeric': 51, 'Urad': 52, 'Wheat': 53, 'other oilseeds': 54}

Season
{'Autumn     ': 0, 'Kharif     ': 1, 'Rabi       ': 2,

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  int32  
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  int32  
 3   State            19689 non-null  int32  
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int32(3), int64(2)
memory usage: 1.3 MB


In [62]:
X = df.drop(['Yield','Crop_Year'], axis=1)
y = df['Yield']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
pt = PowerTransformer(method='yeo-johnson')

x_train_transform = pt.fit_transform(x_train)
x_test_transform = pt.transform(x_test)

In [64]:
train_accu = []
test_accu = []

In [65]:
#Linear Regression with Transformation Approach
lr = LinearRegression()
lr.fit(x_train_transform, y_train)
y_pred_train_ = lr.predict(x_train_transform)
y_pred_test_ = lr.predict(x_test_transform)
train_accu.append(r2_score(y_train,y_pred_train_))
test_accu.append(r2_score(y_test,y_pred_test_))

regr = RandomForestRegressor()
regr.fit(x_train_transform, y_train)
y_pred_train_regr= regr.predict(x_train_transform)
y_pred_test_regr = regr.predict(x_test_transform)
train_accu.append(r2_score(y_train,y_pred_train_regr))
test_accu.append(r2_score(y_test,y_pred_test_regr))

svr = SVR()
svr.fit(x_train_transform, y_train)
y_pred_train_svr= svr.predict(x_train_transform)
y_pred_test_svr = svr.predict(x_test_transform)
train_accu.append(r2_score(y_train,y_pred_train_svr))
test_accu.append(r2_score(y_test,y_pred_test_svr))

cat = CatBoostRegressor(learning_rate=0.15)
cat.fit(x_train_transform, y_train)
y_pred_train_cat = cat.predict(x_train_transform)
y_pred_test_cat = cat.predict(x_test_transform)
train_accu.append(r2_score(y_train,y_pred_train_cat))
test_accu.append(r2_score(y_test,y_pred_test_cat))

dtgr = DecisionTreeRegressor()
dtgr.fit(x_train_transform, y_train)
y_pred_train_dtgr= dtgr.predict(x_train_transform)
y_pred_test_dtgr = dtgr.predict(x_test_transform)
train_accu.append(r2_score(y_train,y_pred_train_dtgr))
test_accu.append(r2_score(y_test,y_pred_test_dtgr))

gbgr = GradientBoostingRegressor()
gbgr.fit(x_train_transform, y_train)
y_pred_train_gbgr= gbgr.predict(x_train_transform)
y_pred_test_gbgr = gbgr.predict(x_test_transform)
train_accu.append(r2_score(y_train,y_pred_train_gbgr))
test_accu.append(r2_score(y_test,y_pred_test_gbgr))

knn = KNeighborsRegressor()
knn.fit(x_train_transform, y_train)
y_pred_train_knn= knn.predict(x_train_transform)
y_pred_test_knn = knn.predict(x_test_transform)
train_accu.append(r2_score(y_train,y_pred_train_knn))
test_accu.append(r2_score(y_test,y_pred_test_knn))

0:	learn: 768.9147307	total: 4.4ms	remaining: 4.4s
1:	learn: 677.1595341	total: 7.44ms	remaining: 3.71s
2:	learn: 599.4172097	total: 10.4ms	remaining: 3.45s
3:	learn: 529.5140939	total: 13.7ms	remaining: 3.41s
4:	learn: 471.9140983	total: 26.1ms	remaining: 5.19s
5:	learn: 424.8997267	total: 36.9ms	remaining: 6.12s
6:	learn: 383.3520063	total: 47.4ms	remaining: 6.72s
7:	learn: 347.1716002	total: 57.3ms	remaining: 7.11s
8:	learn: 316.9577535	total: 64.6ms	remaining: 7.12s
9:	learn: 290.4649161	total: 71.3ms	remaining: 7.05s
10:	learn: 270.7035274	total: 77.7ms	remaining: 6.99s
11:	learn: 252.0805862	total: 85.7ms	remaining: 7.06s
12:	learn: 239.4378959	total: 92.4ms	remaining: 7.02s
13:	learn: 224.6530692	total: 95.4ms	remaining: 6.72s
14:	learn: 210.6230762	total: 98.1ms	remaining: 6.44s
15:	learn: 198.7972075	total: 101ms	remaining: 6.21s
16:	learn: 191.9885782	total: 104ms	remaining: 6.02s
17:	learn: 183.7583920	total: 107ms	remaining: 5.84s
18:	learn: 176.1492331	total: 110ms	remaini

176:	learn: 35.4075099	total: 1.05s	remaining: 4.88s
177:	learn: 35.3083145	total: 1.06s	remaining: 4.89s
178:	learn: 34.9959933	total: 1.06s	remaining: 4.89s
179:	learn: 34.8230292	total: 1.07s	remaining: 4.89s
180:	learn: 34.7555022	total: 1.08s	remaining: 4.88s
181:	learn: 34.5069566	total: 1.09s	remaining: 4.89s
182:	learn: 34.4465014	total: 1.09s	remaining: 4.89s
183:	learn: 34.3321544	total: 1.1s	remaining: 4.88s
184:	learn: 34.2609717	total: 1.11s	remaining: 4.88s
185:	learn: 34.1519262	total: 1.11s	remaining: 4.88s
186:	learn: 33.8071714	total: 1.12s	remaining: 4.89s
187:	learn: 33.6504673	total: 1.13s	remaining: 4.89s
188:	learn: 33.4585430	total: 1.14s	remaining: 4.87s
189:	learn: 33.3655889	total: 1.14s	remaining: 4.86s
190:	learn: 33.1828656	total: 1.14s	remaining: 4.84s
191:	learn: 33.1162132	total: 1.15s	remaining: 4.82s
192:	learn: 32.9538282	total: 1.15s	remaining: 4.8s
193:	learn: 32.7615163	total: 1.15s	remaining: 4.79s
194:	learn: 32.7443249	total: 1.15s	remaining: 4

352:	learn: 16.8590113	total: 2.12s	remaining: 3.88s
353:	learn: 16.7851773	total: 2.12s	remaining: 3.87s
354:	learn: 16.7106949	total: 2.12s	remaining: 3.86s
355:	learn: 16.6664389	total: 2.13s	remaining: 3.85s
356:	learn: 16.6258576	total: 2.13s	remaining: 3.83s
357:	learn: 16.6000608	total: 2.13s	remaining: 3.83s
358:	learn: 16.5662157	total: 2.14s	remaining: 3.82s
359:	learn: 16.5278978	total: 2.15s	remaining: 3.82s
360:	learn: 16.5176251	total: 2.15s	remaining: 3.81s
361:	learn: 16.4307769	total: 2.16s	remaining: 3.81s
362:	learn: 16.3744049	total: 2.17s	remaining: 3.81s
363:	learn: 16.3630667	total: 2.18s	remaining: 3.8s
364:	learn: 16.3213516	total: 2.18s	remaining: 3.8s
365:	learn: 16.2683973	total: 2.19s	remaining: 3.79s
366:	learn: 16.2047150	total: 2.2s	remaining: 3.79s
367:	learn: 16.1732871	total: 2.2s	remaining: 3.79s
368:	learn: 16.0980359	total: 2.21s	remaining: 3.78s
369:	learn: 16.0601442	total: 2.22s	remaining: 3.78s
370:	learn: 16.0016756	total: 2.22s	remaining: 3.7

524:	learn: 9.9417141	total: 3.18s	remaining: 2.87s
525:	learn: 9.9220029	total: 3.19s	remaining: 2.87s
526:	learn: 9.8836492	total: 3.19s	remaining: 2.87s
527:	learn: 9.8488942	total: 3.2s	remaining: 2.86s
528:	learn: 9.8436381	total: 3.2s	remaining: 2.85s
529:	learn: 9.8309525	total: 3.21s	remaining: 2.84s
530:	learn: 9.7902509	total: 3.21s	remaining: 2.84s
531:	learn: 9.7793275	total: 3.21s	remaining: 2.83s
532:	learn: 9.7743776	total: 3.22s	remaining: 2.82s
533:	learn: 9.7471867	total: 3.23s	remaining: 2.82s
534:	learn: 9.6897126	total: 3.23s	remaining: 2.81s
535:	learn: 9.6622650	total: 3.24s	remaining: 2.81s
536:	learn: 9.6372274	total: 3.25s	remaining: 2.8s
537:	learn: 9.6233047	total: 3.25s	remaining: 2.79s
538:	learn: 9.5798061	total: 3.26s	remaining: 2.79s
539:	learn: 9.5248161	total: 3.27s	remaining: 2.79s
540:	learn: 9.4888440	total: 3.28s	remaining: 2.78s
541:	learn: 9.4718689	total: 3.28s	remaining: 2.77s
542:	learn: 9.4585620	total: 3.29s	remaining: 2.77s
543:	learn: 9.4

696:	learn: 6.4651432	total: 4.24s	remaining: 1.84s
697:	learn: 6.4331262	total: 4.25s	remaining: 1.84s
698:	learn: 6.4053657	total: 4.25s	remaining: 1.83s
699:	learn: 6.3813690	total: 4.26s	remaining: 1.83s
700:	learn: 6.3722784	total: 4.27s	remaining: 1.82s
701:	learn: 6.3505650	total: 4.27s	remaining: 1.81s
702:	learn: 6.3393546	total: 4.27s	remaining: 1.8s
703:	learn: 6.3327299	total: 4.28s	remaining: 1.8s
704:	learn: 6.3162819	total: 4.29s	remaining: 1.79s
705:	learn: 6.2963385	total: 4.3s	remaining: 1.79s
706:	learn: 6.2943100	total: 4.3s	remaining: 1.78s
707:	learn: 6.2774986	total: 4.31s	remaining: 1.78s
708:	learn: 6.2693757	total: 4.32s	remaining: 1.77s
709:	learn: 6.2517731	total: 4.33s	remaining: 1.77s
710:	learn: 6.2264123	total: 4.33s	remaining: 1.76s
711:	learn: 6.2046347	total: 4.34s	remaining: 1.75s
712:	learn: 6.1925075	total: 4.34s	remaining: 1.75s
713:	learn: 6.1821760	total: 4.35s	remaining: 1.74s
714:	learn: 6.1592925	total: 4.36s	remaining: 1.74s
715:	learn: 6.14

870:	learn: 4.5083672	total: 5.33s	remaining: 789ms
871:	learn: 4.5008534	total: 5.33s	remaining: 783ms
872:	learn: 4.4992567	total: 5.34s	remaining: 777ms
873:	learn: 4.4975991	total: 5.35s	remaining: 771ms
874:	learn: 4.4834916	total: 5.35s	remaining: 765ms
875:	learn: 4.4789260	total: 5.36s	remaining: 759ms
876:	learn: 4.4727964	total: 5.37s	remaining: 753ms
877:	learn: 4.4625203	total: 5.37s	remaining: 747ms
878:	learn: 4.4577069	total: 5.38s	remaining: 740ms
879:	learn: 4.4531214	total: 5.38s	remaining: 734ms
880:	learn: 4.4494728	total: 5.38s	remaining: 727ms
881:	learn: 4.4481441	total: 5.39s	remaining: 721ms
882:	learn: 4.4374779	total: 5.39s	remaining: 714ms
883:	learn: 4.4257319	total: 5.39s	remaining: 708ms
884:	learn: 4.4219303	total: 5.4s	remaining: 701ms
885:	learn: 4.4112572	total: 5.41s	remaining: 696ms
886:	learn: 4.3992380	total: 5.42s	remaining: 690ms
887:	learn: 4.3885731	total: 5.42s	remaining: 684ms
888:	learn: 4.3848279	total: 5.43s	remaining: 678ms
889:	learn: 4

In [66]:
algorithm = ['LinearRegression','RandomForestRegressor','SupprtVectorRegressor','CatBoostRegressor','DecisionTreeRegressor','GradientBoostingRegressor','KNeighborsRegressor']
accu_data = {'Training Accuracy':train_accu,'Test Accuracy':test_accu}
result1 = pd.DataFrame(accu_data, index = algorithm)
result1

# with power transform

Unnamed: 0,Training Accuracy,Test Accuracy
LinearRegression,0.260056,0.250264
RandomForestRegressor,0.992916,0.907883
SupprtVectorRegressor,0.003274,0.002996
CatBoostRegressor,0.999982,0.955178
DecisionTreeRegressor,1.0,0.861919
GradientBoostingRegressor,0.997193,0.905052
KNeighborsRegressor,0.984585,0.974638


In [68]:
models = [LinearRegression(copy_X= True, fit_intercept= True, positive=False), 
          RandomForestRegressor(max_depth=20, max_features='log2', min_samples_split= 2, n_estimators=50), 
          SVR(C=100, gamma='scale', kernel= 'poly'), 
          CatBoostRegressor(depth=3, iterations=150, l2_leaf_reg=1, learning_rate=0.2), 
          DecisionTreeRegressor(max_depth= 7, max_features= 1.0, max_leaf_nodes=None, min_samples_leaf= 1, min_weight_fraction_leaf=0.1, splitter= 'best'), 
          GradientBoostingRegressor(learning_rate=0.1, max_depth=5, min_samples_leaf=1, min_samples_split=4, n_estimators=100), 
          KNeighborsRegressor(algorithm = 'ball_tree', n_neighbors=3, weights= 'distance')]
algorithm = [model.__class__.__name__ for model in models]

accu_data = {'Training Accuracy': [model.fit(x_train_transform, y_train).score(x_train_transform, y_train) for model in models],
             'Test Accuracy': [model.score(x_test_transform, y_test) for model in models]}
result2 = pd.DataFrame(accu_data, index=algorithm)

0:	learn: 752.0309650	total: 1.65ms	remaining: 245ms
1:	learn: 661.1410006	total: 3.05ms	remaining: 226ms
2:	learn: 559.2938659	total: 4.31ms	remaining: 211ms
3:	learn: 484.7406384	total: 5.64ms	remaining: 206ms
4:	learn: 442.8354289	total: 7.12ms	remaining: 206ms
5:	learn: 410.7587074	total: 8.49ms	remaining: 204ms
6:	learn: 382.4065118	total: 9.99ms	remaining: 204ms
7:	learn: 364.3672370	total: 11.3ms	remaining: 201ms
8:	learn: 328.8523525	total: 12.8ms	remaining: 201ms
9:	learn: 316.5145420	total: 14.2ms	remaining: 198ms
10:	learn: 306.4140599	total: 15.5ms	remaining: 196ms
11:	learn: 278.9190416	total: 16.9ms	remaining: 194ms
12:	learn: 266.4913200	total: 18.2ms	remaining: 191ms
13:	learn: 257.9380511	total: 19.4ms	remaining: 189ms
14:	learn: 253.1675689	total: 20.9ms	remaining: 188ms
15:	learn: 249.4775599	total: 22.3ms	remaining: 187ms
16:	learn: 229.1185353	total: 23.7ms	remaining: 186ms
17:	learn: 222.9420460	total: 25.3ms	remaining: 185ms
18:	learn: 219.7682106	total: 26.6ms	r

In [69]:
result2

Unnamed: 0,Training Accuracy,Test Accuracy
LinearRegression,0.260056,0.250264
RandomForestRegressor,0.995687,0.962662
SVR,0.686029,0.668723
CatBoostRegressor,0.994895,0.969462
DecisionTreeRegressor,0.069817,0.065395
GradientBoostingRegressor,0.999948,0.907088
KNeighborsRegressor,1.0,0.968674


In [70]:
df = pd.read_csv('crop_yield.csv')
category_columns = df.select_dtypes(include=['object']).columns

for col in category_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
X = df.drop(['Yield','Crop_Year'], axis=1)
y = df['Yield']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pt = PowerTransformer(method='yeo-johnson')
x_train_transform = pt.fit_transform(x_train)
x_test_transform = pt.transform(x_test)

regr = RandomForestRegressor(max_depth=20, max_features='log2', min_samples_split= 2, n_estimators=50)
regr.fit(x_train_transform, y_train)
y_pred_train_regr= regr.predict(x_train_transform)
y_pred_test_regr = regr.predict(x_test_transform)

import joblib
joblib.dump(pt, 'power_transformer.joblib')
joblib.dump(regr,'rfmodel.joblib')

['rfmodel.joblib']

In [71]:
loaded_model = joblib.load('rfmodel.joblib')
loaded_pt = joblib.load('power_transformer.joblib')

new_data = pd.DataFrame({
    'Crop': [2,4,1],
    'Season': [4,1,6],
    'State': [7,2,6],
    'Area': [10000, 5000, 8000],
    'Production': [5000, 3000, 4000],
    'Annual_Rainfall': [1500, 1200, 1400],
    'Fertilizer': [2000000, 1000000, 1500000],
    'Pesticide': [1000, 500, 800]
})
    
x_test_transform_loaded = loaded_pt.transform(new_data)
result = loaded_model.predict(x_test_transform_loaded)

print(result)


[5.86757802 0.7304697  6.9419838 ]
