In [21]:
import os
import pickle

import greenhub as gh
import pandas as pd

# 1. Fetch Data and Create Feature Vector

## Setup

In [22]:
# Prediction target: the state SANTA FE in the country with code AR.
COUNTRY, STATE = 'AR', 'SANTA FE'

# Granularity and first year used when requesting data from greenhub.
SPATIAL_RESOLUTION = 'state'
START_YEAR = 2010

In [23]:
# Initialize greenhub sdk.
# The key is taken from the GREENHUB_API_KEY environment variable so that a real
# credential never has to be stored in the notebook; when the variable is not set,
# the original "-" placeholder is passed unchanged.
gh.initialize(os.environ.get("GREENHUB_API_KEY", "-"))

## VI Data

In [24]:
# Fetch VI data and keep only the columns used downstream.
# The .loc / .assign chain builds a new frame, so converting 'Year' no longer
# assigns into a column subset of the fetched frame (the pattern that can raise
# pandas' SettingWithCopyWarning).
vi_df = (
    gh.get_vi_data(country=COUNTRY, start_year=START_YEAR, spatial_resolution=SPATIAL_RESOLUTION)
    .loc[:, ['EVI', 'NDVI', 'State', 'Year', 'Month']]
    .assign(Year=lambda df: pd.to_numeric(df['Year']))
)

# Preview the rows of the target state only; vi_df itself stays unfiltered.
vi_df[vi_df['State'] == STATE]

## Soil Data

In [25]:
# Fetch soil data for layer 'D1'.
soil_raw = gh.get_soil_data(country=COUNTRY, spatial_resolution=SPATIAL_RESOLUTION, layer='D1')

# Column selection: every '*_avg' column that does not start with 'TP-',
# plus the 'Layer' and 'NAME_1' columns.
selected_cols = [
    col for col in soil_raw.columns
    if (col.endswith('_avg') and not col.startswith('TP-')) or col in ['Layer', 'NAME_1']
]

# Build the cleaned frame in one chain (no in-place edits on a column subset):
#   * strip '_avg' only where it is the trailing suffix of a column name,
#   * expose 'NAME_1' as 'State',
#   * upper-case the state names; .str.upper() passes missing values through
#     instead of failing on them.
soil_df = (
    soil_raw[selected_cols]
    .rename(columns=lambda col: col[:-len('_avg')] if col.endswith('_avg') else col)
    .rename(columns={'NAME_1': 'State'})
    .assign(State=lambda df: df['State'].str.upper())
)
soil_df

## Climate Data

In [26]:
# Fetch climate data using the same setup constants as the VI and soil requests,
# so changing COUNTRY / START_YEAR / SPATIAL_RESOLUTION affects all data sources alike.
# Then drop the 'CountryCode' column and make 'Year' numeric.
climate_df = (
    gh.get_climate_data(country=COUNTRY, start_year=START_YEAR, end_year=2023, spatial_resolution=SPATIAL_RESOLUTION)
    .drop(columns=['CountryCode'])
    .assign(Year=lambda df: pd.to_numeric(df['Year']))
)
climate_df

## Merge all features

In [27]:
# Join soil values onto the VI rows by state, then climate values by
# (Year, Month, State); rows with any missing value are discarded.
merged_df = (
    vi_df
    .merge(soil_df, on='State', how='left')
    .merge(climate_df, on=['Year', 'Month', 'State'], how='left')
    .dropna()
)

# Go from one row per (Year, State, Month) to one row per (Year, State):
# every value column is spread into one column per month.
pivot_df = merged_df.pivot_table(index=['Year', 'State'], columns='Month', aggfunc='first')

# Collapse the two-level (value, month) column labels into '<value>_Month<m>'.
pivot_df.columns = [f'{name}_Month{int(month)}' for name, month in pivot_df.columns]

# Bring 'Year' and 'State' back as regular columns.
pivot_df = pivot_df.reset_index()

# The columns listed below were replicated once per month by the pivot.
# Keep the Month1 copy under the plain name and remove the remaining monthly copies.
squash_cols = ['Drain', 'CFRAG', 'SDTO', 'STPC', 'CLPC', 'BULK', 'TAWC','CECS', 'BSAT', 'ESP',
               'CECc', 'PHAQ', 'TCEQ', 'GYPS', 'ELCO', 'ORGC', 'TOTN', 'CNrt', 'ECEC', 'ALSA', 'Value']
month1_to_plain = {f'{name}_Month1': name for name in squash_cols}
monthly_prefixes = tuple(f'{name}_Month' for name in squash_cols)

pivot_df = pivot_df.rename(columns=month1_to_plain)
leftover_monthly = [col for col in pivot_df.columns if col.startswith(monthly_prefixes)]
pivot_df = pivot_df.drop(columns=leftover_monthly)

pivot_df

## Limit the feature matrix to Santa Fe

In [28]:
# Select the rows of the target state via the STATE setup constant rather than a
# repeated string literal, so the state only has to be changed in one place.
single_state_df = pivot_df[pivot_df['State'] == STATE]
single_state_df.head()

## Final Feature Vector

In [29]:
# Model input: remove the 'Year' and 'State' identifier columns, then discard
# any row that still contains a missing value.
feature_vector = single_state_df.drop(columns=['Year', 'State']).dropna()
feature_vector

# 2. Load LR Model

In [30]:
# Deserialize the LR model from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# 'linear_regression.pkl' that comes from a trusted source.
with open('linear_regression.pkl', 'rb') as model_file:
    lr_model = pickle.load(model_file)

# 3. Run LR Model

In [31]:
# Pass the feature vector to the loaded model and print whatever predict() returns.
# feature_vector holds the rows of the selected state that are left after dropna();
# presumably predict() yields one value per row, in the same order — TODO confirm.
prediction = lr_model.predict(feature_vector)
print(prediction)