# Example Model for Wheat Yield in AR, Santa Fe using Greenhub SDK
## Features: VI, Soil, Climate

In [13]:
import greenhub as gh
import pandas as pd
import pickle
import os

from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 1. Fetch Data

In [14]:
# Initialize greenhub sdk
gh.initialize("RejhCxnCdTKwDX1zK2lqIB24e1bBAAZk")

## 1.1 Load Historical Yield Data (Training Targets)

Note that the yield data is loaded from local storage. The Greenhub SDK will soon provide a method `get_yield_data()` to fetch the yield data on our Firebase Storage.

In [15]:
# Wheat Yield Data
dfs = []
for root, dirs, files in os.walk('./historical-yield-data/AR/targets/states'):
    for file in tqdm(files):
        if file.endswith('yield_states_normalized.csv'):
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path)
            year = df['Year']
            dfs.append(df)

wheat_df = pd.concat(dfs, ignore_index=True)
wheat_df = wheat_df[['Year', 'State', 'Value']]
wheat_df['Year'] = pd.to_numeric(wheat_df['Year'])
wheat_df

ValueError: No objects to concatenate

## 1.2 Load Feature Data

### VI Data

In [16]:
# Fetch VI data
vi_df = gh.get_vi_data(country='AR', start_year=2010, end_year=2023, spatial_resolution='state')
vi_df

Loading VI data:   0%|          | 0/168 [00:00<?, ?items/s]

Unnamed: 0,FPAR,EVI,NDVI,ENGTYPE_1,CountryCode,NL_NAME_1,HASC_1,ISO_1,CC_1,GID_1,VARNAME_1,TYPE_1,State,Year,Month
0,0.460023,0.350348,0.522568,Province,AR,,AR.CT,AR-K,,ARG.2_1,,Provincia,CATAMARCA,2010,1
1,0.561885,0.400193,0.602391,Province,AR,,AR.CH,AR-U,,ARG.4_1,,Provincia,CHUBUT,2010,1
2,0.606848,0.426762,0.689785,Province,AR,,AR.CN,AR-W,,ARG.7_1,,Provincia,CORRIENTES,2010,1
3,0.516521,0.404217,0.618911,Province,AR,,AR.FM,AR-P,,ARG.9_1,,Provincia,FORMOSA,2010,1
4,0.544842,0.447910,0.604574,Province,AR,,AR.JY,AR-Y,,ARG.10_1,,Provincia,JUJUY,2010,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4027,0.399891,0.295253,0.473935,Province,AR,,AR.BA,AR-B,,ARG.1_1,Baires|Buenos Ayres,Provincia,BUENOS AIRES,2023,12
4028,0.366476,0.258074,0.415640,Province,AR,,AR.LP,AR-L,,ARG.11_1,El Pampa|Eva Perón,Provincia,LA PAMPA,2023,12
4029,0.348616,0.321998,0.482641,Province,AR,,AR.CC,AR-H,,ARG.3_1,El Chaco|Presidente Juan Peron,Provincia,CHACO,2023,12
4030,,,,Province,AR,,AR.TF,AR-V,,ARG.23_1,Feuerland|Terra del Fuoco|Terre,Provincia,TIERRA DEL FUEGO,2023,12


In [17]:
# Row selection, renaming etc.
vi_df = vi_df[['EVI', 'NDVI', 'State', 'Year', 'Month']]
vi_df['Year'] = pd.to_numeric(vi_df['Year'])
vi_df[vi_df['State'] == 'SANTA FE']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vi_df['Year'] = pd.to_numeric(vi_df['Year'])


Unnamed: 0,EVI,NDVI,State,Year,Month
17,0.591439,0.754725,SANTA FE,2010,1
41,0.612505,0.792523,SANTA FE,2010,2
65,0.459085,0.673373,SANTA FE,2010,3
89,0.295464,0.466863,SANTA FE,2010,4
113,0.249692,0.434951,SANTA FE,2010,5
...,...,...,...,...,...
3929,0.291043,0.462785,SANTA FE,2023,8
3953,0.292757,0.443480,SANTA FE,2023,9
3977,0.254393,0.380690,SANTA FE,2023,10
4001,0.258458,0.409775,SANTA FE,2023,11


### Soil Data

In [18]:
req = {
    'VI': ['EVI', 'NDVI'],
    'Soil': ['D1_mean'],
}

gh.get_feature_vec(req, spatial_resolution='state', country='AR', start_year=2010, end_year=2023)

AttributeError: module 'greenhub' has no attribute 'get_feature_vec'

In [19]:
# Fetch soil data
soil_df = gh.get_soil_data(country='AR', spatial_resolution='state', layer='D1')
soil_df

Loading Soil data:   0%|          | 0/1 [00:00<?, ?items/s]

ValueError: too many values to unpack (expected 2)

In [20]:
# Row selection, renaming etc.
selected_cols = [col for col in soil_df.columns if (col.endswith('_avg') and not col.startswith('TP-')) or col in ['Layer', 'NAME_1']]
soil_df = soil_df[selected_cols]
soil_df.columns = soil_df.columns.str.replace('_avg', '')
soil_df.rename({'NAME_1': 'State'}, axis=1, inplace=True)
soil_df['State'] = soil_df['State'].apply(lambda x: x.upper())
soil_df

NameError: name 'soil_df' is not defined

soil_data['State'] = soil_data['State'].apply(lambda x: x.upper())### Climate Data (Temp, Prec, Solar)

In [21]:
# Fetch climate data
climate_df = gh.get_climate_data(country='AR', start_year=2010, end_year=2023, spatial_resolution='state')
climate_df

Loading Climate data:   0%|          | 0/168 [00:00<?, ?items/s]

ValueError: too many values to unpack (expected 2)

In [22]:
# Row selection, renaming etc.
climate_df = climate_df.drop(columns=['Unnamed: 0', 'Country'])
climate_df['Year'] = pd.to_numeric(climate_df['Year'])
climate_df

NameError: name 'climate_df' is not defined

### Merge all features

In [23]:
# Merge
merged_df = pd.merge(wheat_df, vi_df, on=['Year', 'State'], how='left')
merged_df = pd.merge(merged_df, soil_df, on='State', how='left')
merged_df = pd.merge(merged_df, climate_df, on=['Year', 'Month', 'State'], how='left')
merged_df = merged_df.dropna()
merged_df

NameError: name 'wheat_df' is not defined

In [24]:
# Create a pivot table
pivot_df = merged_df.pivot_table(index=['Year', 'State'], columns='Month', aggfunc='first')

# Flatten the MultiIndex columns
pivot_df.columns = ['{}_Month{}'.format(col[0], int(col[1])) for col in pivot_df.columns]

# Reset index to turn MultiIndex into columns
pivot_df.reset_index(inplace=True)

# Fix 12x columns for static values
squash_cols = ['Drain', 'CFRAG', 'SDTO', 'STPC', 'CLPC', 'BULK', 'TAWC','CECS', 'BSAT', 'ESP',
               'CECc', 'PHAQ', 'TCEQ', 'GYPS', 'ELCO', 'ORGC', 'TOTN', 'CNrt', 'ECEC', 'ALSA', 'Value']
pivot_df = pivot_df.rename({f'{squash}_Month1': f'{squash}' for squash in squash_cols}, axis=1)
pivot_df = pivot_df.drop(columns=[col for col in pivot_df.columns if any(col.startswith(f'{squash}_Month') for squash in squash_cols)])

pivot_df

NameError: name 'merged_df' is not defined

## Limit the feature matrix to Santa Fe

In [25]:
single_state_df = pivot_df[pivot_df['State'] == 'SANTA FE']
single_state_df.head()

NameError: name 'pivot_df' is not defined

In [26]:
print(single_state_df.shape)
print("Dropping na...")
single_state_df = single_state_df.dropna()
print(single_state_df.shape)

NameError: name 'single_state_df' is not defined

# 2 Training

## 2.1 Fit an LR model

In [27]:
# Prepare the data
df = single_state_df

# Splitting the data into features and target
X = df.drop(columns=['Year', 'State', 'Value'])
y = df['Value']
years = df['Year']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test, years_train, years_test = train_test_split(X, y, years, test_size=0.2, random_state=42)

NameError: name 'single_state_df' is not defined

In [28]:
# Fit LR model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [29]:
# Save the model as pickle file
with open('linear_regression.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

In [30]:
# Load the pickle file (for testing purposes)
with open('linear_regression.pkl', 'rb') as f:
    lr_model = pickle.load(f)

## 2.4 Inspect the LR model

In [31]:
# Predicting with Linear Regression
lr_predictions = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_predictions)
print(f'Linear Regression Test MSE: {lr_mse}')

# Compare predictions
print(f'Linear Regression Predictions: {lr_predictions[:5]}')

# Calculate metrics for Linear Regression
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)

# Print the metrics
print(f'Linear Regression - MSE: {lr_mse}, R²: {lr_r2}, MAE: {lr_mae}')

# Plotting the predictions
plt.figure(figsize=(14, 6))

# Plot for Linear Regression
plt.subplot(1, 2, 1)
plt.scatter(y_test, lr_predictions, alpha=0.7, label='Predictions')
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Linear Regression Predictions')
plt.legend()

plt.tight_layout()
plt.show()

NameError: name 'X_test' is not defined