# Example Model Creation

The following Jupyter Notebook is a very simple example of how to fetch data using the Greenhub SDK, train a yield prediction model, and then save it. This saved yield model can later be used in a run method to upload the model to Greenhub.ai.

The model created is an "end-of-season" model, which is executed only once a year, in June,
using only climate data from May to predict the yield.

[greenhub.ai](https://greenhub.ai)
[docs](https://docs.greenhub.ai)

In [99]:
import json
import os

import greenhub as gh
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 1. Fetch Data

In [100]:
# initialize greenhub sdk
# Prefer the key from the GREENHUB_API_KEY environment variable so the real
# secret is never saved into the notebook; the literal placeholder below is
# only used when that variable is not set.
gh.initialize(os.environ.get("GREENHUB_API_KEY", "[GREENHUB_API_KEY]"))  # todo - set GREENHUB_API_KEY (or add greenhub api key)

Using api key auth


## 1.1 Load Historical Yield Data (Training Targets)

In [101]:
# get historical state-level winter wheat yields for the US, one request per
# year from 2001 through 2020 (range() excludes its upper bound).
# A comprehension keeps the loop variable out of the notebook namespace, so it
# cannot be confused with the `y` target frame defined further down.
yield_dfs = [
    gh.get_historical_yield_data(
        crop="Wheat-winter",
        year=year,
        country='US',
        spatial_resolution='state'
    )
    for year in range(2001, 2021)
]
# stack the per-year results into one frame, then keep only the MISSOURI rows
yield_data = pd.concat(yield_dfs, axis=0, ignore_index=True)
yield_data = yield_data[yield_data['State'] == 'MISSOURI']
yield_data

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Loading Historical Yield data:   0%|          | 0/1 [00:00<?, ?items/s]

Unnamed: 0,ValueType,Year,Crop,SpatialResolution,Country,State,Municipality,Value
21,Yield,2001,Wheat-winter,State,US,MISSOURI,,3.631473
66,Yield,2002,Wheat-winter,State,US,MISSOURI,,2.958978
95,Yield,2003,Wheat-winter,State,US,MISSOURI,,4.10222
143,Yield,2004,Wheat-winter,State,US,MISSOURI,,3.496974
197,Yield,2005,Wheat-winter,State,US,MISSOURI,,3.564224
214,Yield,2006,Wheat-winter,State,US,MISSOURI,,3.631473
268,Yield,2007,Wheat-winter,State,US,MISSOURI,,2.891729
300,Yield,2008,Wheat-winter,State,US,MISSOURI,,3.227976
354,Yield,2009,Wheat-winter,State,US,MISSOURI,,3.160727
388,Yield,2010,Wheat-winter,State,US,MISSOURI,,3.093477


## 1.2 Load Feature Data

In [102]:
# fetch monthly, state-level climate data for the US from 2001 through 2020
climate_us = gh.get_climate_data(
    time_resolution='monthly',
    spatial_resolution='state',
    country='US',
    start_year=2001,
    end_year=2020
)
# keep only the rows that are both for MISSOURI and for May (Month == 5)
in_missouri = climate_us['State'] == 'MISSOURI'
in_may = climate_us['Month'] == 5
climate_data = climate_us[in_missouri & in_may]
climate_data

Loading Climate data:   0%|          | 0/240 [00:00<?, ?items/s]

Unnamed: 0,Year,Month,State,CountryCode,Mean Min Temperature [K],Mean Max Temperature [K],Mean Temperature [K],Total Precipitation [mm],Temperature*Precipitation,Temperature^2,Precipitation^2,Min Precipitation [mm],Max Precipitation [mm],Mean Precipitation [mm]
229,2001,5,MISSOURI,US,284.662582,300.878114,292.263144,151.964422,1431.524499,85429.525869,71.203982,0.006218,32.215533,4.902078
841,2002,5,MISSOURI,US,282.070304,299.356973,290.07861,214.405235,2013.966248,84157.815308,118.618068,4.5e-05,28.872253,6.916298
1453,2003,5,MISSOURI,US,282.263316,300.207909,291.02221,160.309631,1505.018808,84699.263643,63.014228,2.4e-05,25.6468,5.171278
2065,2004,5,MISSOURI,US,284.961222,301.150538,293.118775,139.540869,1316.629529,85939.249018,46.459441,0.0,16.844388,4.501318
2677,2005,5,MISSOURI,US,282.234414,301.1601,291.522554,50.90514,481.993457,85004.133967,11.284445,-2.2e-05,13.351218,1.642101
3289,2006,5,MISSOURI,US,284.260853,301.36435,292.041527,86.42521,814.606984,85311.259123,18.178048,0.029726,13.962424,2.78791
3901,2007,5,MISSOURI,US,285.849971,302.204484,293.88791,105.564446,1000.572259,86375.741807,24.304264,0.0,12.086929,3.405305
4513,2008,5,MISSOURI,US,282.42422,300.018799,290.820751,148.331123,1391.68529,84587.039631,57.157087,-2.2e-05,18.396339,4.784875
5125,2009,5,MISSOURI,US,283.549226,300.06698,291.46137,134.825296,1268.749725,84958.836121,45.513927,0.0,18.143862,4.349203
5737,2010,5,MISSOURI,US,284.149137,300.806467,291.87493,162.12363,1517.256067,85209.372264,69.16461,0.001666,26.293345,5.229795


# 2. Create training dataset

In [103]:
# attach each year's yield ('Value') to the climate row of the same year;
# the inner join drops years that are missing from either frame
yearly_yield = yield_data[['Year', 'Value']]
training_data = climate_data.merge(yearly_yield, on="Year", how="inner")
training_data

Unnamed: 0,Year,Month,State,CountryCode,Mean Min Temperature [K],Mean Max Temperature [K],Mean Temperature [K],Total Precipitation [mm],Temperature*Precipitation,Temperature^2,Precipitation^2,Min Precipitation [mm],Max Precipitation [mm],Mean Precipitation [mm],Value
0,2001,5,MISSOURI,US,284.662582,300.878114,292.263144,151.964422,1431.524499,85429.525869,71.203982,0.006218,32.215533,4.902078,3.631473
1,2002,5,MISSOURI,US,282.070304,299.356973,290.07861,214.405235,2013.966248,84157.815308,118.618068,4.5e-05,28.872253,6.916298,2.958978
2,2003,5,MISSOURI,US,282.263316,300.207909,291.02221,160.309631,1505.018808,84699.263643,63.014228,2.4e-05,25.6468,5.171278,4.10222
3,2004,5,MISSOURI,US,284.961222,301.150538,293.118775,139.540869,1316.629529,85939.249018,46.459441,0.0,16.844388,4.501318,3.496974
4,2005,5,MISSOURI,US,282.234414,301.1601,291.522554,50.90514,481.993457,85004.133967,11.284445,-2.2e-05,13.351218,1.642101,3.564224
5,2006,5,MISSOURI,US,284.260853,301.36435,292.041527,86.42521,814.606984,85311.259123,18.178048,0.029726,13.962424,2.78791,3.631473
6,2007,5,MISSOURI,US,285.849971,302.204484,293.88791,105.564446,1000.572259,86375.741807,24.304264,0.0,12.086929,3.405305,2.891729
7,2008,5,MISSOURI,US,282.42422,300.018799,290.820751,148.331123,1391.68529,84587.039631,57.157087,-2.2e-05,18.396339,4.784875,3.227976
8,2009,5,MISSOURI,US,283.549226,300.06698,291.46137,134.825296,1268.749725,84958.836121,45.513927,0.0,18.143862,4.349203,3.160727
9,2010,5,MISSOURI,US,284.149137,300.806467,291.87493,162.12363,1517.256067,85209.372264,69.16461,0.001666,26.293345,5.229795,3.093477


In [104]:
# columns used as model inputs (every climate measurement in the frame)
feature_columns = [
    'Mean Min Temperature [K]',
    'Mean Max Temperature [K]',
    'Mean Temperature [K]',
    'Total Precipitation [mm]',
    'Temperature*Precipitation',
    'Temperature^2',
    'Precipitation^2',
    'Min Precipitation [mm]',
    'Max Precipitation [mm]',
    'Mean Precipitation [mm]',
]

# X holds the features, y the target; y is selected with a list so it stays
# a one-column DataFrame rather than a Series
X = training_data[feature_columns]
y = training_data[["Value"]]

In [105]:
# hold out 10% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs
split_options = {"test_size": 0.1, "random_state": 8}
split_parts = train_test_split(X, y, **split_options)
X_train, X_test, y_train, y_test = split_parts
X_train

Unnamed: 0,Mean Min Temperature [K],Mean Max Temperature [K],Mean Temperature [K],Total Precipitation [mm],Temperature*Precipitation,Temperature^2,Precipitation^2,Min Precipitation [mm],Max Precipitation [mm],Mean Precipitation [mm]
6,285.849971,302.204484,293.88791,105.564446,1000.572259,86375.741807,24.304264,0.0,12.086929,3.405305
15,283.436395,299.653635,290.900491,129.304133,1215.718091,84637.367156,41.2474,0.0,18.016967,4.171101
18,283.452453,300.883186,291.640119,246.025258,2316.646286,85072.015759,115.040072,0.03142,27.917236,7.936299
4,282.234414,301.1601,291.522554,50.90514,481.993457,85004.133967,11.284445,-2.2e-05,13.351218,1.642101
12,283.566986,300.233046,291.059554,159.826668,1494.565638,84743.726532,65.435132,0.0,27.595642,5.155699
2,282.263316,300.207909,291.02221,160.309631,1505.018808,84699.263643,63.014228,2.4e-05,25.6468,5.171278
13,284.116032,301.791731,292.362245,88.141943,831.89829,85502.164259,21.729171,0.000703,13.647958,2.843288
1,282.070304,299.356973,290.07861,214.405235,2013.966248,84157.815308,118.618068,4.5e-05,28.872253,6.916298
16,283.306069,300.877579,291.76678,149.325799,1399.384516,85142.538363,79.205925,0.000132,32.308936,4.816961
0,284.662582,300.878114,292.263144,151.964422,1431.524499,85429.525869,71.203982,0.006218,32.215533,4.902078


# 3. Train model

In [106]:
# fit a basic linear regression on the training split
# (fit() returns the estimator itself, so construction and training chain)
model = LinearRegression().fit(X_train, y_train)

# run the fitted model on the held-out test features
y_pred = model.predict(X_test)

# compare predictions with the true values; multioutput="raw_values" yields
# one error per target column instead of a single averaged number
mse = mean_squared_error(y_test, y_pred, multioutput="raw_values")
print("Mean Squared Error:", mse)

Mean Squared Error: [0.96778055]


# 4. Save model

In [108]:
# convert the fitted coefficients and intercept to plain Python values so
# that they can be serialised as JSON
model_params = dict(
    coef=model.coef_.tolist(),
    intercept=model.intercept_.tolist(),
)

# write the parameters to model.json in the current working directory
with open("./model.json", "w") as out_file:
    json.dump(model_params, out_file)