In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline
plt.rcParams['font.size']=14
plt.rcParams['axes.titlepad']= 8
plt.rcParams['axes.titlesize']= 'medium'
plt.rcParams['axes.grid']=True
plt.rcParams['figure.figsize'] = (5,5)
plt.rcParams['axes.facecolor'] = 'white'

In [3]:
data_total = pd.read_csv('Puri_CCE_Extract_Two_Stage_250m.csv')
data = data_total.copy()

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 30 columns):
S_No_                        161 non-null int64
Long                         161 non-null float64
Lat                          161 non-null float64
USIN                         161 non-null int64
State                        161 non-null object
District                     161 non-null object
Block_Mand                   161 non-null object
Village                      161 non-null object
CCE_Date                     161 non-null object
Geo_Coordi                   161 non-null object
Target_Cro                   161 non-null object
CCE_Plot_S                   161 non-null object
QC_Status                    161 non-null object
Wet_Weight                   161 non-null float64
Dry_Weight                   161 non-null float64
Yield_kg_h                   161 non-null int64
CCCI_Puri_250                161 non-null float64
CWSI_Puri_250                161 non-null float64
EVI_241_OR

In [5]:
corr = data.corr()

In [6]:
corr.to_excel("Puri_Corr.xlsx")

In [7]:
corr

Unnamed: 0,S_No_,Long,Lat,USIN,Wet_Weight,Dry_Weight,Yield_kg_h,CCCI_Puri_250,CWSI_Puri_250,EVI_241_OR_Puri,...,NDVI_241_OR_Puri,NDVI_273_OR_Puri,puri_07_12_sept_sm_250m,Puri_7_11_october_sm_250m,Puri_max_GS_250,Puri_max_VS_250,Puri_min_GS_250,Puri_min_VS_250,Puri_rf_GS_250,Puri_rf_VS_250
S_No_,1.0,-0.644129,0.029206,-0.999381,-0.005032,-0.005032,-0.004976,0.352603,0.445606,-0.15266,...,-0.047113,0.320103,0.074044,0.140117,-0.474683,0.600554,-0.577051,-0.575528,-0.072877,-0.578286
Long,-0.644129,1.0,-0.138268,0.640959,0.191816,0.191816,0.191791,-0.487011,-0.53267,0.376859,...,0.264316,-0.525052,0.038512,-0.082764,0.738492,-0.849142,0.873696,0.85932,0.16606,0.802077
Lat,0.029206,-0.138268,1.0,-0.024677,0.368293,0.368293,0.368365,-0.030706,0.39225,-0.034456,...,-0.029749,0.028658,0.249915,0.23509,-0.7323,0.523131,-0.559437,-0.569989,-0.92888,-0.4975
USIN,-0.999381,0.640959,-0.024677,1.0,0.004258,0.004258,0.004199,-0.348658,-0.43852,0.154511,...,0.0497,-0.321512,-0.07441,-0.141736,0.470236,-0.594736,0.572537,0.570894,0.066007,0.571134
Wet_Weight,-0.005032,0.191816,0.368293,0.004258,1.0,1.0,1.0,-0.098155,0.15796,0.146592,...,0.072473,-0.053312,0.065694,0.082606,-0.140157,0.039457,-0.036173,-0.048823,-0.271449,0.013234
Dry_Weight,-0.005032,0.191816,0.368293,0.004258,1.0,1.0,1.0,-0.098155,0.15796,0.146592,...,0.072473,-0.053312,0.065694,0.082606,-0.140157,0.039457,-0.036173,-0.048823,-0.271449,0.013234
Yield_kg_h,-0.004976,0.191791,0.368365,0.004199,1.0,1.0,1.0,-0.098128,0.157967,0.146554,...,0.072426,-0.053241,0.065709,0.082657,-0.140227,0.039498,-0.03623,-0.048879,-0.271511,0.013207
CCCI_Puri_250,0.352603,-0.487011,-0.030706,-0.348658,-0.098155,-0.098155,-0.098128,1.0,0.17209,-0.327092,...,-0.271338,0.247325,-0.069707,0.044105,-0.251671,0.381057,-0.350944,-0.341223,-0.009401,-0.388692
CWSI_Puri_250,0.445606,-0.53267,0.39225,-0.43852,0.15796,0.15796,0.157967,0.17209,1.0,0.173188,...,0.219015,0.185778,0.193338,0.308266,-0.655399,0.660194,-0.663541,-0.669487,-0.404549,-0.617477
EVI_241_OR_Puri,-0.15266,0.376859,-0.034456,0.154511,0.146592,0.146592,0.146554,-0.327092,0.173188,1.0,...,0.920029,-0.246754,0.250438,0.208461,0.18676,-0.221138,0.244944,0.227083,-0.014434,0.194982


In [8]:
data.drop(columns=['Long','State','District','Block_Mand','Village','CCE_Date','Target_Cro',
                   'QC_Status','CCE_Plot_S','Geo_Coordi','Lat','S_No_','USIN','Wet_Weight','Dry_Weight'],axis=1,inplace=True)

In [9]:
data['Avg_max'] = (data.Puri_max_GS_250+data.Puri_max_VS_250)/2
data['Avg_min'] = (data.Puri_min_GS_250+data.Puri_min_VS_250)/2
data['Sum_rf'] = (data.Puri_rf_GS_250+data.Puri_rf_VS_250)

In [10]:
### Thresholding is applied for outlier treatment
low_thresh = np.mean(data.Yield_kg_h)-2*np.std(data.Yield_kg_h)
upp_thresh = np.mean(data.Yield_kg_h)+2*np.std(data.Yield_kg_h)

In [11]:
low_thresh

3011.4236756732207

In [12]:
upp_thresh

5310.501790165288

In [13]:
selected_data = data[np.logical_and(data.Yield_kg_h > low_thresh, data.Yield_kg_h < upp_thresh)]

In [14]:
X = selected_data.drop(['Yield_kg_h'],axis=1)
Y = selected_data.Yield_kg_h

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152 entries, 0 to 160
Data columns (total 17 columns):
CCCI_Puri_250                152 non-null float64
CWSI_Puri_250                152 non-null float64
EVI_241_OR_Puri              152 non-null float64
EVI_273_OR_Puri              152 non-null float64
NDVI_241_OR_Puri             152 non-null float64
NDVI_273_OR_Puri             152 non-null float64
puri_07_12_sept_sm_250m      152 non-null float64
Puri_7_11_october_sm_250m    152 non-null float64
Puri_max_GS_250              152 non-null float64
Puri_max_VS_250              152 non-null float64
Puri_min_GS_250              152 non-null float64
Puri_min_VS_250              152 non-null float64
Puri_rf_GS_250               152 non-null float64
Puri_rf_VS_250               152 non-null float64
Avg_max                      152 non-null float64
Avg_min                      152 non-null float64
Sum_rf                       152 non-null float64
dtypes: float64(17)
memory usage: 21.4 KB


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 98, test_size=0.1)
rf = RandomForestRegressor(n_estimators=60, max_depth=8, min_samples_split=3, random_state=23)
rf.fit(X_train, Y_train)
x_pred = rf.predict(X_train)
y_pred = rf.predict(X_test)
print('Train score:',r2_score(x_pred, Y_train))
print('Test score:',r2_score(Y_test, y_pred))
devs = ((y_pred-Y_test)/Y_test)*100
print(np.mean(np.abs(devs)))

Train score: 0.7131172498547554
Test score: 0.7066564943268641
5.619659828591161


In [21]:
import xgboost as xgb

In [22]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.65, learning_rate = .64,
                max_depth = 6, alpha = 7, n_estimators = 30)
xg_reg.fit(X_train,Y_train)

x_pred_xgb = xg_reg.predict(X_train)
y_pred_xgb = xg_reg.predict(X_test)
print("Train score: ", r2_score(x_pred_xgb,Y_train))
print("Test score: ",r2_score(Y_test, y_pred_xgb))
devs = ((y_pred_xgb-Y_test)/Y_test)*100
print(np.mean(np.abs(devs)))

Train score:  0.9946492281646088
Test score:  0.7126908468365947
5.04022988432568


## -----Final Model is RF-----

In [23]:
output = data_total.iloc[X_test.index].copy()
output['Avg_max'] = X_test.Avg_max
output['Avg_min'] = X_test.Avg_min
output['Sum_rf'] = X_test.Sum_rf
output['Prediction'] = y_pred
output.reset_index(drop=True, inplace=True)
output.to_excel("Puri_250M_Predictions.xlsx")

In [106]:
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 19 columns):
Yield_kg_h                   16 non-null int64
CCCI_Puri_250                16 non-null float64
CWSI_Puri_250                16 non-null float64
EVI_241_OR_Puri              16 non-null float64
EVI_273_OR_Puri              16 non-null float64
NDVI_241_OR_Puri             16 non-null float64
NDVI_273_OR_Puri             16 non-null float64
puri_07_12_sept_sm_250m      16 non-null float64
Puri_7_11_october_sm_250m    16 non-null float64
Puri_max_GS_250              16 non-null float64
Puri_max_VS_250              16 non-null float64
Puri_min_GS_250              16 non-null float64
Puri_min_VS_250              16 non-null float64
Puri_rf_GS_250               16 non-null float64
Puri_rf_VS_250               16 non-null float64
Avg_max                      16 non-null float64
Avg_min                      16 non-null float64
Sum_rf                       16 non-null float64
Prediction       

###  Final Predictions

In [107]:
data2 = pd.read_csv('250M_INPUT_FOR_YIELD_MAP/Puri_CWSI_250.csv')
data1 = data2.copy()

In [108]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35453 entries, 0 to 35452
Data columns (total 19 columns):
FID          35453 non-null int64
pointid      35453 non-null int64
grid_code    35453 non-null float64
POINT_X      35453 non-null float64
POINT_Y      35453 non-null float64
CCCI         35453 non-null float64
EVI_241      35453 non-null float64
EVI_273      35453 non-null float64
NDVI_241     35453 non-null float64
NDVI_273     35453 non-null float64
SM_SEP       35453 non-null float64
SM_OCT       35453 non-null float64
MAX_GS       35453 non-null float64
MAX_VS       35453 non-null float64
MIN_GS       35453 non-null float64
MIN_VS       35453 non-null float64
RF_GS        35453 non-null float64
RF_VS        35453 non-null float64
CWSI         35453 non-null float64
dtypes: float64(17), int64(2)
memory usage: 5.1 MB


In [109]:
selected_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152 entries, 0 to 160
Data columns (total 18 columns):
Yield_kg_h                   152 non-null int64
CCCI_Puri_250                152 non-null float64
CWSI_Puri_250                152 non-null float64
EVI_241_OR_Puri              152 non-null float64
EVI_273_OR_Puri              152 non-null float64
NDVI_241_OR_Puri             152 non-null float64
NDVI_273_OR_Puri             152 non-null float64
puri_07_12_sept_sm_250m      152 non-null float64
Puri_7_11_october_sm_250m    152 non-null float64
Puri_max_GS_250              152 non-null float64
Puri_max_VS_250              152 non-null float64
Puri_min_GS_250              152 non-null float64
Puri_min_VS_250              152 non-null float64
Puri_rf_GS_250               152 non-null float64
Puri_rf_VS_250               152 non-null float64
Avg_max                      152 non-null float64
Avg_min                      152 non-null float64
Sum_rf                       152 non-null float

In [110]:
data1['Avg_max'] = (data1.MAX_GS+data1.MAX_VS)/2
data1['Avg_min'] = (data1.MIN_GS+data1.MIN_VS)/2
data1['Sum_rf'] = (data1.RF_GS+data1.RF_VS)

In [111]:
data1.drop(['FID', 'pointid', 'grid_code', 'POINT_X', 'POINT_Y'],axis=1, inplace=True)

In [112]:
data1.rename(columns={'SM_OCT':'Puri_7_11_october_sm_250m', 'SM_SEP':'puri_07_12_sept_sm_250m',
               'CCCI':'CCCI_Puri_250', 'CWSI':'CWSI_Puri_250', 
                      'MAX_GS':'Puri_max_GS_250','MAX_VS':'Puri_max_VS_250',
                      'MIN_GS':'Puri_min_GS_250','MIN_VS':'Puri_min_VS_250',
                      'RF_GS':'Puri_rf_GS_250','RF_VS':'Puri_rf_VS_250',
               'EVI_241':'EVI_241_OR_Puri', 'EVI_273':'EVI_273_OR_Puri',
               'NDVI_241':'NDVI_241_OR_Puri', 'NDVI_273':'NDVI_273_OR_Puri'},inplace=True)

In [113]:
X_train.columns

Index(['CCCI_Puri_250', 'CWSI_Puri_250', 'EVI_241_OR_Puri', 'EVI_273_OR_Puri',
       'NDVI_241_OR_Puri', 'NDVI_273_OR_Puri', 'puri_07_12_sept_sm_250m',
       'Puri_7_11_october_sm_250m', 'Puri_max_GS_250', 'Puri_max_VS_250',
       'Puri_min_GS_250', 'Puri_min_VS_250', 'Puri_rf_GS_250',
       'Puri_rf_VS_250', 'Avg_max', 'Avg_min', 'Sum_rf'],
      dtype='object')

In [114]:
data1 = data1[['CCCI_Puri_250', 'CWSI_Puri_250', 'EVI_241_OR_Puri', 'EVI_273_OR_Puri',
       'NDVI_241_OR_Puri', 'NDVI_273_OR_Puri', 'puri_07_12_sept_sm_250m',
       'Puri_7_11_october_sm_250m', 'Puri_max_GS_250', 'Puri_max_VS_250',
       'Puri_min_GS_250', 'Puri_min_VS_250', 'Puri_rf_GS_250',
       'Puri_rf_VS_250', 'Avg_max', 'Avg_min', 'Sum_rf']]

In [115]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35453 entries, 0 to 35452
Data columns (total 17 columns):
CCCI_Puri_250                35453 non-null float64
CWSI_Puri_250                35453 non-null float64
EVI_241_OR_Puri              35453 non-null float64
EVI_273_OR_Puri              35453 non-null float64
NDVI_241_OR_Puri             35453 non-null float64
NDVI_273_OR_Puri             35453 non-null float64
puri_07_12_sept_sm_250m      35453 non-null float64
Puri_7_11_october_sm_250m    35453 non-null float64
Puri_max_GS_250              35453 non-null float64
Puri_max_VS_250              35453 non-null float64
Puri_min_GS_250              35453 non-null float64
Puri_min_VS_250              35453 non-null float64
Puri_rf_GS_250               35453 non-null float64
Puri_rf_VS_250               35453 non-null float64
Avg_max                      35453 non-null float64
Avg_min                      35453 non-null float64
Sum_rf                       35453 non-null float64
dtypes:

In [116]:
data1[data1 == -9999] = np.NaN

In [117]:
data1.dropna(inplace=True)

In [118]:
final_preds = rf.predict(data1)

In [119]:
data1['POINT_X'] = data2.iloc[data1.index][['POINT_X']]
data1['POINT_Y'] = data2.iloc[data1.index][['POINT_Y']]

In [120]:
data1['Prediction'] = final_preds

In [121]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32863 entries, 0 to 35452
Data columns (total 20 columns):
CCCI_Puri_250                32863 non-null float64
CWSI_Puri_250                32863 non-null float64
EVI_241_OR_Puri              32863 non-null float64
EVI_273_OR_Puri              32863 non-null float64
NDVI_241_OR_Puri             32863 non-null float64
NDVI_273_OR_Puri             32863 non-null float64
puri_07_12_sept_sm_250m      32863 non-null float64
Puri_7_11_october_sm_250m    32863 non-null float64
Puri_max_GS_250              32863 non-null float64
Puri_max_VS_250              32863 non-null float64
Puri_min_GS_250              32863 non-null float64
Puri_min_VS_250              32863 non-null float64
Puri_rf_GS_250               32863 non-null float64
Puri_rf_VS_250               32863 non-null float64
Avg_max                      32863 non-null float64
Avg_min                      32863 non-null float64
Sum_rf                       32863 non-null float64
POINT_X

In [122]:
data1.to_csv('250M_INPUT_FOR_YIELD_MAP/Puri_250M_Predictions.csv')

In [4]:
np.mean(data.Yield_kg_h)

4160.962732919254

In [124]:
np.mean(final_preds)

4040.6398389679584