# Example Model for Wheat Yield in AR, Santa Fe using Greenhub SDK
## Features: VI, Soil, Climate

In [1]:
import greenhub as gh
import pandas as pd
import pickle
import os

from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 1. Fetch Data

In [2]:
# Initialize the greenhub SDK.
# The API key is read from the GREENHUB_API_KEY environment variable so the
# secret is never stored in (or committed with) the notebook.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# leaked and rotate it.
api_key = os.environ.get("GREENHUB_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GREENHUB_API_KEY environment variable before running this notebook."
    )
gh.initialize(api_key)

Using api key auth


## 1.1 Load Historical Yield Data (Training Targets)

In [6]:
# Wheat Yield Data
# NOTE(review): the statement below is cut off after `gh.` and is not valid
# Python as written. The merge cell further down reads a frame named
# `wheat_df`, which is presumably what this cell was meant to produce —
# TODO restore the missing SDK call and confirm the variable name.
yield_data = gh.

## 1.2 Load Feature Data

### VI Data

In [58]:
# Download the vegetation-index (VI) table: country 'AR', years 2010-2023,
# aggregated at state resolution.
vi_df = gh.get_vi_data(
    country='AR',
    spatial_resolution='state',
    start_year=2010,
    end_year=2023,
)
vi_df  # last expression of the cell: rendered as its output

In [59]:
# Column selection and dtype clean-up.
# `.copy()` makes the subset an independent frame, so the 'Year' assignment
# below modifies it directly rather than a possible view of the raw download
# (the situation pandas reports as SettingWithCopyWarning).
vi_df = vi_df[['EVI', 'NDVI', 'State', 'Year', 'Month']].copy()
# 'Year' is used as a merge key later on, so make sure it is numeric.
vi_df['Year'] = pd.to_numeric(vi_df['Year'])
# Preview only the Santa Fe rows; vi_df itself still holds every state.
vi_df[vi_df['State'] == 'SANTA FE']

### Soil Data

In [None]:
# Feature request: each key names a feature group, each value lists the
# fields wanted from that group.
req = dict(
    VI=['EVI', 'NDVI'],
    Soil=['D1_mean'],
)

# Ask the SDK for the combined feature vector (country 'AR', state
# resolution, 2010-2023); the result is shown as the cell output.
gh.get_feature_vec(
    req,
    country='AR',
    spatial_resolution='state',
    start_year=2010,
    end_year=2023,
)

In [60]:
# Download the soil table for country 'AR' at state resolution, restricted
# to layer 'D1'.
soil_df = gh.get_soil_data(
    country='AR',
    layer='D1',
    spatial_resolution='state',
)
soil_df  # last expression of the cell: rendered as its output

In [61]:
# Column selection, renaming etc.
# Keep the '*_avg' columns (except those starting with 'TP-') together with
# the two identifier columns 'Layer' and 'NAME_1'.
selected_cols = [
    col for col in soil_df.columns
    if (col.endswith('_avg') and not col.startswith('TP-')) or col in ('Layer', 'NAME_1')
]
# `.copy()` detaches the subset from the raw download, so the edits below act
# on an independent frame (avoids pandas' SettingWithCopyWarning).
soil_df = soil_df[selected_cols].copy()
# regex=False: '_avg' is a literal substring, not a pattern.
soil_df.columns = soil_df.columns.str.replace('_avg', '', regex=False)
soil_df = soil_df.rename(columns={'NAME_1': 'State'})
# Upper-case the state names so they can be matched against the upper-case
# names used elsewhere in this notebook (e.g. 'SANTA FE'). The vectorised
# accessor passes missing values through instead of raising on them.
soil_df['State'] = soil_df['State'].str.upper()
soil_df

### Climate Data (Temp, Prec, Solar)

In [62]:
# Download the climate table: country 'AR', years 2010-2023, aggregated at
# state resolution.
climate_df = gh.get_climate_data(
    country='AR',
    spatial_resolution='state',
    start_year=2010,
    end_year=2023,
)
climate_df  # last expression of the cell: rendered as its output

In [63]:
# Column clean-up and dtype normalisation.
# errors='ignore' keeps the cell re-runnable: on a second execution the two
# columns are already gone and a plain drop() would raise KeyError.
climate_df = climate_df.drop(columns=['Unnamed: 0', 'Country'], errors='ignore')
# 'Year' is used as a merge key later on, so make sure it is numeric.
climate_df['Year'] = pd.to_numeric(climate_df['Year'])
climate_df

### Merge all features

In [64]:
# Left-join every feature table onto the yield table, then discard rows that
# ended up with any missing value.
#   - VI:      matched on Year + State (brings in the 'Month' column)
#   - soil:    matched on State only
#   - climate: matched on Year + Month + State
# NOTE(review): `wheat_df` is not defined in any visible cell of this
# notebook — confirm where it is created.
merged_df = (
    pd.merge(wheat_df, vi_df, on=['Year', 'State'], how='left')
    .merge(soil_df, on='State', how='left')
    .merge(climate_df, on=['Year', 'Month', 'State'], how='left')
    .dropna()
)
merged_df

In [65]:
# Reshape from one row per (Year, State, Month) to one row per (Year, State):
# every remaining column is spread into one column per month, keeping the
# first value found for each cell.
pivot_df = merged_df.pivot_table(index=['Year', 'State'], columns='Month', aggfunc='first')

# Collapse the two-level (name, month) column labels into '<name>_Month<m>'.
pivot_df.columns = [f'{name}_Month{int(month)}' for name, month in pivot_df.columns]

# Bring 'Year' and 'State' back as ordinary columns.
pivot_df = pivot_df.reset_index()

# The pivot also spread these columns across the months. Keep only the
# Month1 copy of each, under its plain name, and drop the other monthly copies.
squash_cols = ['Drain', 'CFRAG', 'SDTO', 'STPC', 'CLPC', 'BULK', 'TAWC', 'CECS', 'BSAT', 'ESP',
               'CECc', 'PHAQ', 'TCEQ', 'GYPS', 'ELCO', 'ORGC', 'TOTN', 'CNrt', 'ECEC', 'ALSA', 'Value']
month1_to_plain = {f'{name}_Month1': name for name in squash_cols}
pivot_df = pivot_df.rename(columns=month1_to_plain)

# str.startswith accepts a tuple, so one call tests every prefix at once.
monthly_prefixes = tuple(f'{name}_Month' for name in squash_cols)
redundant_cols = [col for col in pivot_df.columns if col.startswith(monthly_prefixes)]
pivot_df = pivot_df.drop(columns=redundant_cols)

pivot_df

## Limit the feature matrix to Santa Fe

In [66]:
# Keep only the rows whose State is 'SANTA FE' and preview the first few.
is_santa_fe = pivot_df['State'] == 'SANTA FE'
single_state_df = pivot_df[is_santa_fe]
single_state_df.head()

In [67]:
# Report the frame's shape before and after discarding every row that
# contains at least one missing value.
print(single_state_df.shape)
print("Dropping na...")
single_state_df = single_state_df.dropna(axis=0, how='any')
print(single_state_df.shape)

# 2. Training

## 2.1 Fit an LR model

In [68]:
# Model input: the single-state feature matrix built above.
df = single_state_df

# Target and year labels are taken out first; everything except the two
# identifier columns and the target is used as a feature.
y = df['Value']
years = df['Year']
X = df.drop(columns=['Year', 'State', 'Value'])

# Random 80/20 train/test split; the fixed seed makes it reproducible.
# `years` is split alongside X and y so each sample keeps its year label.
split = train_test_split(X, y, years, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test, years_train, years_test = split

In [69]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model  # show the fitted estimator as the cell output

In [70]:
# Save the model as pickle file
# Writes the fitted estimator to 'linear_regression.pkl' in the current
# working directory; the file is opened in binary mode as pickle requires.
with open('linear_regression.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

In [71]:
# Load the pickle file (for testing purposes)
# Rebinds `lr_model` to the object read back from 'linear_regression.pkl'.
# NOTE: unpickling executes code embedded in the file — only load pickle
# files that you created yourself or otherwise trust.
with open('linear_regression.pkl', 'rb') as f:
    lr_model = pickle.load(f)

## 2.2 Inspect the LR model

In [72]:
# Predict on the held-out split.
lr_predictions = lr_model.predict(X_test)

# Error metrics, all computed once from the same truth/prediction pair.
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)

# Report: test MSE, a sample of (at most five) predictions, then all metrics.
print(f'Linear Regression Test MSE: {lr_mse}')
print(f'Linear Regression Predictions: {lr_predictions[:5]}')
print(f'Linear Regression - MSE: {lr_mse}, R²: {lr_r2}, MAE: {lr_mae}')

# Scatter of predicted vs. true values, drawn in the left cell of a
# 1x2 subplot grid.
fig = plt.figure(figsize=(14, 6))
ax = fig.add_subplot(1, 2, 1)
ax.scatter(y_test, lr_predictions, alpha=0.7, label='Predictions')
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Linear Regression Predictions')
ax.legend()

fig.tight_layout()
plt.show()