In [1]:
import pandas as pd
import numpy as np
import datetime
import scipy.integrate as integrate
import statsmodels.api as sm

In [16]:
df = pd.read_csv('kenya_maize_s1.csv')
# Subset to when crop is actively growing i.e. crop_cal != 0
df = df[df['crop_cal'] > 0] # TODO: How is the crop_cal calculated? WHat does its value mean?

# Rescale the NDVI to be between 0 and 1
df['ndvi'] = (df['ndvi'] - 50)/200

# Subset to the following columns: adm1_name, datetime, yield, ndvi
df = df[['adm1_name', 'datetime', 'yield', 'ndvi','Season']]

# Feature generation: perform a groupby to obtain a dataframe with maximum NDVI value for each admin 1 in each season
df_ml = df.groupby(['adm1_name','Season']).agg({'ndvi':'max','yield':'mean'}).reset_index()

# Drop rows that have missing NDVI or Yield data
df_ml = df_ml.dropna(subset=['yield', 'ndvi'])

# show dataframe
df_ml

Unnamed: 0,adm1_name,Season,ndvi,yield
2,central,2002.0,0.730903,1.212
3,central,2003.0,0.743151,1.061
4,central,2004.0,0.718109,0.623
5,central,2005.0,0.737948,1.037
6,central,2006.0,0.734041,1.104
...,...,...,...,...
180,western,2012.0,0.740084,2.359
181,western,2013.0,0.701376,2.236
182,western,2014.0,0.748567,2.459
183,western,2015.0,0.769604,2.470


In [18]:
# show the not nan values of ndvi
# len(df[df['ndvi'].notna()]['ndvi'])
len(df_ml)

103

In [19]:
# Split the dataframe such that we use 80% of the data for training and predict the remaining 20%
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_ml, test_size=0.2)
test

Unnamed: 0,adm1_name,Season,ndvi,yield
126,nyanza,2006.0,0.736374,1.709
159,rift_valley,2015.0,0.738744,2.647
60,eastern,2012.0,0.606084,0.925
123,nyanza,2003.0,0.743809,2.017
28,coast,2004.0,0.608895,0.903
104,north_eastern,2008.0,0.493608,0.556
157,rift_valley,2013.0,0.70392,3.017
13,central,2013.0,0.725478,1.306
152,rift_valley,2008.0,0.669679,2.679
8,central,2008.0,0.685026,0.629


Model 1: YIeld as function of Year

In [20]:
feature_names = ['Season']
X = train[feature_names].values       # training feature matrix
y = train['yield'].ravel()            # training target array
X_test = test[feature_names].values   # test feature matrix
y_test = test['yield'].ravel()        # test target array

In [24]:
# Instantiate a Linear Regression Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Fit the model
model.fit(X, y)
# Predict based on the model
y_pred = model.predict(X_test)
y_pred

array([1.35982665, 1.39108593, 1.38066617, 1.3494069 , 1.35288015,
       1.36677316, 1.38413942, 1.38413942, 1.36677316, 1.36677316,
       1.3563534 , 1.35288015, 1.36329991, 1.39108593, 1.39108593,
       1.3563534 , 1.37371966, 1.35982665, 1.36329991, 1.39455918,
       1.36329991])

In [26]:
# Show model intercept and coefficient
print(model.intercept_, model.coef_)
# Estimate model performance using different metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# print(f'Coefficient: {model.coef_}')
print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
print(f'Coefficient of Determination: {model.score(X, y)}')

-5.607518478495779 [0.00347325]
Coefficient: [0.00347325]
Mean Absolute Error: 0.8331166197638353
Mean Squared Error: 0.9290223020641494
Coefficient of Determination: 0.00037859808420970253


In [27]:
# Show OLS Regression Results using Statsmodel
import statsmodels.api as sm
X = sm.add_constant(X)
statsmodel=sm.OLS(y, X)
results = statsmodel.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.012
Method:                 Least Squares   F-statistic:                   0.03030
Date:                Wed, 22 Feb 2023   Prob (F-statistic):              0.862
Time:                        10:16:12   Log-Likelihood:                -94.844
No. Observations:                  82   AIC:                             193.7
Df Residuals:                      80   BIC:                             198.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.6075     40.084     -0.140      0.8

Model 2: Yield as a function of maximum NDVI

In [29]:
feature_names = ['ndvi']
X = train[feature_names].values       # training feature matrix
y = train['yield'].ravel()            # training target array
X_test = test[feature_names].values   # test feature matrix
y_test = test['yield'].ravel()        # test target array
# Fit the model
model.fit(X, y)
# Predict based on the model
y_pred = model.predict(X_test)
# y_pred
# Show OLS Regression Results using Statsmodel
import statsmodels.api as sm
X = sm.add_constant(X)
statsmodel=sm.OLS(y, X)
results = statsmodel.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.349
Model:                            OLS   Adj. R-squared:                  0.341
Method:                 Least Squares   F-statistic:                     42.84
Date:                Wed, 22 Feb 2023   Prob (F-statistic):           5.23e-09
Time:                        10:19:57   Log-Likelihood:                -77.276
No. Observations:                  82   AIC:                             158.6
Df Residuals:                      80   BIC:                             163.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.8094      0.491     -3.688      0.0

MOdel 3: Yield as a function of Area under the NDVI curve (AUC)

In [31]:
df = pd.read_csv('kenya_maize_s1.csv')
# Subset to when crop is actively growing i.e. crop_cal != 0
df = df[df['crop_cal'] > 1]

# Rescale the NDVI to be between 0 and 1
df['ndvi'] = (df['ndvi'] - 50)/200

# Subset to the following columns: adm1_name, datetime, yield, ndvi
df = df[['adm1_name', 'datetime', 'yield', 'ndvi', 'Season', 'Month']]
# Change datetime column from data type str to a datetime
df['datetime'] = pd.to_datetime(df['datetime'])

In [33]:
# Calculating Peak NDVI by month
df_ac = df.groupby(['adm1_name','Season','Month']).agg({'ndvi':'max','yield':'mean'}).reset_index()

In [35]:
# Inserting empty column to fill with AUC NDVI values later
df_ac.insert(4,'auc_NDVI','')

In [37]:
# Calculating accumulated NDVI
import scipy.integrate as integrate
df_temp = df_ac.groupby(['adm1_name','Season']).apply(lambda x: integrate.trapz(x['ndvi'].values, x = x['Month'].values)).reset_index()

In [39]:
df_ac = df_ac.groupby(['adm1_name','Season']).agg({'auc_NDVI':'max','yield':'mean'}).reset_index() # Grouping rows of NDVI by Region and Season to find NDVI.
df_ac['auc_NDVI'] = df_temp[0] # setting the column of AUC NDVI values in df_ac equal to the calculated accumulated NDVI values
round(df_ac, 2) # Rounding the whole dataset to two decimal places.

Unnamed: 0,adm1_name,Season,auc_NDVI,yield
0,central,2000.0,0.54,
1,central,2001.0,0.66,
2,central,2002.0,0.69,1.21
3,central,2003.0,0.73,1.06
4,central,2004.0,0.66,0.62
...,...,...,...,...
187,western,2019.0,3.58,
188,western,2020.0,3.45,
189,western,2021.0,3.44,
190,western,2022.0,,


In [40]:
df_ac = df_ac.dropna() # dropping missing values

In [42]:
# Instantiate a Linear Regression Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Split the dataframe such that we use 80% of the data for training and predict the remaining 20%
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_ac,test_size=0.2)
# test
feature_names = ['auc_NDVI']
X = train[feature_names].values       # training feature matrix
y = train['yield'].ravel()            # training target array
X_test = test[feature_names].values   # test feature matrix
y_test = test['yield'].ravel()        # test target array
# Fit the model
model.fit(X, y)
# Predict based on the model
y_pred = model.predict(X_test)
# y_pred

In [43]:
# Show OLS Regression Results using Statsmodel
import statsmodels.api as sm
X = sm.add_constant(X)
statsmodel=sm.OLS(y, X)
results = statsmodel.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.761
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     254.7
Date:                Wed, 22 Feb 2023   Prob (F-statistic):           1.40e-26
Time:                        10:31:13   Log-Likelihood:                -41.046
No. Observations:                  82   AIC:                             86.09
Df Residuals:                      80   BIC:                             90.91
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5459      0.072      7.538      0.0

Temporal Validation of Model 3

In [44]:
#import relevant functions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

def temporal_validation(df, year):
  # Set aside 1 year as test data, leave all other years for training
  train = df[df['Season'] != year]
  test = df[df['Season'] == year]

  feature_names = ['auc_NDVI']
  X = train[feature_names].values       # training feature matrix
  y = train['yield'].ravel()            # training target array
  X_test = test[feature_names].values   # test feature matrix
  y_test = test['yield'].ravel()

  #Instantiate model
  model = LinearRegression() 

  # Fit the model
  model.fit(X, y)
  
  # Predict based on the model
  y_pred = model.predict(X_test)
  
  #Calculate all metrics in a dataframe friendly format for simple viewing
  stat_array = pd.DataFrame(index=[year],columns=['Intercept','Coefficient', 'Mean Absolute Error', 'Mean Squared Error', 'Coefficient of Determination'])
  stat_array.loc[year,'Intercept'] = model.intercept_
  stat_array.loc[year, 'Coefficient'] = model.coef_
  stat_array.loc[year, 'Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
  stat_array.loc[year, 'Mean Squared Error'] = mean_squared_error(y_test, y_pred)
  stat_array.loc[year, 'Coefficient of Determination'] =  model.score(X,y)

  return stat_array

In [45]:
#Calculate and view temporal validation results
years = np.arange(2002, 2017, 1) #array of all yields where there is data
results = pd.DataFrame() #empty dataframe that results of temp validation will be appended to
for year in years: 
  stat_array = temporal_validation(df_ac, year) #build model for each year and calculate statistics
  results = results.append(stat_array) 

#view results
results

  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)
  results = results.append(stat_array)


Unnamed: 0,Intercept,Coefficient,Mean Absolute Error,Mean Squared Error,Coefficient of Determination
2002,0.515862,[0.5285277915832717],0.193008,0.06263,0.725005
2003,0.528206,[0.5191791581398054],0.192467,0.041309,0.715934
2004,0.546998,[0.5156255475952891],0.292449,0.128679,0.719673
2005,0.554264,[0.5092996453334946],0.43465,0.228387,0.722916
2006,0.544305,[0.5118450859525021],0.315763,0.133698,0.71631
2007,0.535216,[0.5061193639236284],0.331157,0.245915,0.720435
2008,0.520129,[0.5355764093266043],0.591036,0.564599,0.767655
2009,0.526806,[0.5263923597268888],0.350956,0.177036,0.731899
2010,0.534186,[0.512837783494876],0.40061,0.303291,0.731522
2011,0.531048,[0.5220443914065789],0.23912,0.080157,0.718119
