In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline
plt.rcParams['font.size']=14
plt.rcParams['axes.titlepad']= 8
plt.rcParams['axes.titlesize']= 'medium'
plt.rcParams['axes.grid']=True
plt.rcParams['figure.figsize'] = (5,5)
plt.rcParams['axes.facecolor'] = 'white'

In [3]:
data_total = pd.read_csv('Barabanki_CCE_Extract_Two_Stage_250m.csv')
data = data_total.copy()

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 28 columns):
Long                        160 non-null float64
Lat                         160 non-null float64
S_No_                       160 non-null int64
USIN                        160 non-null int64
State                       160 non-null object
District                    160 non-null object
Block_Mand                  160 non-null object
Village                     160 non-null object
CCE_Date                    160 non-null object
Target_Cro                  160 non-null object
QC_Status                   160 non-null object
Wet_Weight                  160 non-null float64
Dry_Weight                  160 non-null float64
Yield_kg_h                  160 non-null float64
Bara_Banki_max_GS           160 non-null float64
Bara_Banki_max_VS           160 non-null float64
Bara_Banki_min_GS           160 non-null float64
Bara_Banki_min_VS           160 non-null float64
Bara_Banki_rf_GS        

In [5]:
corr = data.corr()

In [6]:
corr.to_excel('Barabanki_Corr.xlsx')

In [7]:
data.drop(data[data.Yield_kg_h < 500].index, inplace=True, axis=0)

In [8]:
X = data[['Bara_Banki_max_GS','Bara_Banki_min_GS','Bara_Banki_min_VS','Barabanki_14_aug_sm_250m','CWSI_Barabanki','EVI_273_UP_Bara','NDVI_273_UP_Bara']]
Y = data.Yield_kg_h

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159 entries, 0 to 159
Data columns (total 7 columns):
Bara_Banki_max_GS           159 non-null float64
Bara_Banki_min_GS           159 non-null float64
Bara_Banki_min_VS           159 non-null float64
Barabanki_14_aug_sm_250m    159 non-null float64
CWSI_Barabanki              159 non-null float64
EVI_273_UP_Bara             159 non-null float64
NDVI_273_UP_Bara            159 non-null float64
dtypes: float64(7)
memory usage: 9.9 KB


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1, test_size=0.1)
rf = RandomForestRegressor(n_estimators=90, max_depth=12, min_samples_split=3, random_state=37)
rf.fit(X_train, Y_train)
x_pred = rf.predict(X_train)
y_pred = rf.predict(X_test)
print('Train score:', r2_score(x_pred, Y_train))
print('Test score:', r2_score(Y_test, y_pred))
devs = ((y_pred-Y_test)/Y_test)*100
print(devs)

Train score: 0.6353000467614911
Test score: 0.602672022028329
29     38.678496
42    -16.013111
14     28.919938
91    -13.231019
81     49.477877
19     -3.642082
44     -0.329451
11    -13.363883
40      0.166630
97     18.834900
89     -5.680184
94    -29.564735
73     -5.895196
105    22.862678
59     -4.300727
90    -11.074580
Name: Yield_kg_h, dtype: float64


In [13]:
import xgboost as xgb

In [14]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.8, learning_rate = 0.87,
            max_depth = 5, alpha = 6, n_estimators = 100)
xg_reg.fit(X_train,Y_train)

x_pred_xgb = xg_reg.predict(X_train)
y_pred_xgb = xg_reg.predict(X_test)
print("Train score: ", r2_score(x_pred_xgb,Y_train))
print("Test score: ", r2_score(Y_test, y_pred_xgb))

Train score:  0.9999943156765183
Test score:  0.5200819936882792


### Analysis

In [15]:
low_thresh = np.mean(data.Yield_kg_h)-2*np.std(data.Yield_kg_h)
upp_thresh = np.mean(data.Yield_kg_h)+2*np.std(data.Yield_kg_h)

In [16]:
selected_Data = data[np.logical_and(data.Yield_kg_h > low_thresh, data.Yield_kg_h < upp_thresh)].copy()

In [17]:
selected_Data['Avg_max'] = (selected_Data.Bara_Banki_max_GS+selected_Data.Bara_Banki_max_VS)/2
selected_Data['Avg_min'] = (selected_Data.Bara_Banki_min_GS+selected_Data.Bara_Banki_min_VS)/2
selected_Data['Sum_rf'] = (selected_Data.Bara_Banki_rf_GS+selected_Data.Bara_Banki_rf_VS)

In [18]:
selected_Data.drop(columns=['Bara_Banki_max_GS','Bara_Banki_max_VS','Bara_Banki_min_GS','Bara_Banki_min_VS',
                           'Bara_Banki_rf_GS','Bara_Banki_rf_VS','Long','Lat','S_No_','USIN','State','District',
                           'Block_Mand','Village','CCE_Date','Target_Cro','QC_Status','Wet_Weight','Dry_Weight'], inplace=True)

In [19]:
X = selected_Data.drop(['Yield_kg_h'],axis=1)
Y = selected_Data.Yield_kg_h

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 46, test_size=0.1)
rf = RandomForestRegressor(n_estimators=25, max_depth=10, min_samples_split=3, random_state=10)
rf.fit(X_train, Y_train)
x_pred = rf.predict(X_train)
y_pred = rf.predict(X_test)
print('Train score:', r2_score(x_pred, Y_train))
print('Test score:', r2_score(Y_test, y_pred))
devs = ((y_pred-Y_test)/Y_test)*100
print(devs)

Train score: 0.6774061825799413
Test score: 0.6776274378459982
42    -13.790216
9      43.528023
25     57.082835
49    -21.024613
67     -0.042593
50     -9.354118
77     13.713099
5       9.075972
86     -6.901270
76     11.006376
108    34.806007
44     -1.723412
64      2.560178
79     23.945514
85     26.525685
128   -17.220249
Name: Yield_kg_h, dtype: float64


In [21]:
data.drop(['Bara_Banki_max_GS','Bara_Banki_max_VS','Bara_Banki_min_GS','Bara_Banki_min_VS',
                           'Bara_Banki_rf_GS','Bara_Banki_rf_VS','Long','Lat','S_No_','USIN','State','District',
                           'Block_Mand','Village','CCE_Date','Target_Cro','QC_Status','Wet_Weight','Dry_Weight'],
         axis=1,inplace=True)

In [22]:
output = data_total.iloc[X_test.index].copy()
output['Avg_max'] = X_test.Avg_max
output['Avg_min'] = X_test.Avg_min
output['Sum_rf'] = X_test.Sum_rf
output['Prediction'] = y_pred
output.reset_index(drop=True, inplace=True)
output.to_excel("Barabanki_250M_Predictions.xlsx")

### Final model is RF

### Final Predictions

In [28]:
data2 = pd.read_csv('250M_INPUT_FOR_YIELD_MAP/BARABANKI_250_sm_aug.txt')
data1 = data2.copy()

In [29]:
data1[data1 == -9999] = np.NaN
data1.dropna(inplace=True)

In [30]:
data1['Avg_max'] = (data1.MAX_GS_1+data1.MAX_VS)/2
data1['Avg_min'] = (data1.MIN_GS+data1.MIN_VS)/2
data1['Sum_rf'] = (data1.RF_GS+data1.RF_VS)

In [31]:
data1.drop(['FID', 'pointid', 'grid_code', 'POINT_X', 'POINT_Y', 'MAX_VS', 'MIN_GS',
       'MIN_VS', 'RF_GS', 'RF_VS','MAX_GS_1'],axis=1,inplace=True)

In [32]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64520 entries, 1089 to 102653
Data columns (total 11 columns):
SM_OCT      64520 non-null float64
SM_AUG      64520 non-null float64
CCCI        64520 non-null float64
CWSI        64520 non-null float64
EVI_241     64520 non-null float64
EVI_273     64520 non-null float64
NDVI_241    64520 non-null float64
NDVI_273    64520 non-null float64
Avg_max     64520 non-null float64
Avg_min     64520 non-null float64
Sum_rf      64520 non-null float64
dtypes: float64(11)
memory usage: 5.9 MB


In [33]:
data1.rename(columns={'SM_OCT':'barabanki_10_oct_250m_sm', 'SM_AUG':'Barabanki_14_aug_sm_250m',
                      'CCCI':'CCCI_Barabanki', 'CWSI':'CWSI_Barabanki',
                      'EVI_241':'EVI_241_UP_Bara', 'EVI_273':'EVI_273_UP_Bara',
                      'NDVI_241':'NDVI_241_UP_Bara', 'NDVI_273':'NDVI_273_UP_Bara'},inplace=True)

In [34]:
data1 = data1[['barabanki_10_oct_250m_sm', 'Barabanki_14_aug_sm_250m',
       'CCCI_Barabanki', 'CWSI_Barabanki', 'EVI_241_UP_Bara',
       'EVI_273_UP_Bara', 'NDVI_241_UP_Bara', 'NDVI_273_UP_Bara', 'Avg_max',
       'Avg_min', 'Sum_rf']]

In [35]:
final_pred = rf.predict(data1)

In [36]:
data1['POINT_X'] = data2.iloc[data1.index][['POINT_X']]
data1['POINT_Y'] = data2.iloc[data1.index][['POINT_Y']]

In [37]:
data1['Prediction'] = final_pred

In [38]:
data1.to_csv('250M_INPUT_FOR_YIELD_MAP/BARABANKI_250M_Predictions.csv')

In [4]:
np.mean(data.Yield_kg_h)

1915.30125

In [40]:
np.mean(final_pred)

1944.3775354390218