```yaml
titan: v1
service:
  image: scipy
  machine:
    cpu: 2
    memory: 2048MB
```

In [1]:
#importing dependencies
from sklearn import datasets
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
import pandas as pd
import json

In [2]:
#load data
url = "https://storage.googleapis.com/tutorial-datasets/weather_data_GER_2016.csv"
weather = pd.read_csv(url)

In [3]:
weather.head()

Unnamed: 0,timestamp,cumulated hours,lat,lon,v1,v2,v_50m,h1,h2,z0,SWTDN,SWGDN,T,rho,p
0,2016-01-01T00:00:00Z,0,47.5,5.625,0.81,1.88,3.36,2,10,0.052526,0.0,0.0,277.350159,1.236413,99282.710938
1,2016-01-01T01:00:00Z,1,47.5,5.625,0.77,1.61,2.63,2,10,0.05251,0.0,0.0,277.025665,1.23939,99300.164062
2,2016-01-01T02:00:00Z,2,47.5,5.625,0.66,1.22,1.89,2,10,0.052495,0.0,0.0,277.223755,1.243861,99310.992188
3,2016-01-01T03:00:00Z,3,47.5,5.625,0.96,1.35,1.62,2,10,0.05248,0.0,0.0,277.13324,1.24739,99314.773438
4,2016-01-01T04:00:00Z,4,47.5,5.625,1.14,1.56,1.83,2,10,0.05248,0.0,0.0,276.867767,1.248869,99324.796875


The data in the file contains the following:
wind
    v1: velocity [m/s] @ height h1 (2 meters above displacement height)
    v2: velocity [m/s] @ height h2 (10 meters above displacement height)
    v_50m: velocity [m/s] @ 50 meters above ground
    h1: height above ground [m] (h1 = displacement height +2m)
    h2: height above ground [m] (h2 = displacement height +10m)
    z0: roughness length [m]
solar parameters:
    SWTDN: total top-of-the-atmosphere horizontal radiation [W/m²]
    SWGDN: total ground horizontal radiation [W/m²]
temperature data
    T: Temperature [K] @ 2 meters above displacement height (see h1)
air data
    Rho: air density [kg/m³] @ surface *p: air pressure [Pa] @ surface

In [4]:
# Reading the dataset from a Gitlab repo
url = "https://storage.googleapis.com/tutorial-datasets/time_series_60min_singleindex_filtered.csv"
production = pd.read_csv(url)

In [5]:
production.head()

Unnamed: 0,utc_timestamp,cet_cest_timestamp,DE_wind_generation_actual
0,2015-12-31T23:00:00Z,2016-01-01T00:00:00+0100,8638
1,2016-01-01T00:00:00Z,2016-01-01T01:00:00+0100,8579
2,2016-01-01T01:00:00Z,2016-01-01T02:00:00+0100,8542
3,2016-01-01T02:00:00Z,2016-01-01T03:00:00+0100,8443
4,2016-01-01T03:00:00Z,2016-01-01T04:00:00+0100,8295


In [6]:
# Merge datasets
weather_by_day = weather.groupby(weather.index).mean()
combined = pd.merge(production,
                    weather_by_day,
                    how='left',
                    left_index=True,
                    right_index=True)

In [7]:
combined.head()

Unnamed: 0,utc_timestamp,cet_cest_timestamp,DE_wind_generation_actual,cumulated hours,lat,lon,v1,v2,v_50m,h1,h2,z0,SWTDN,SWGDN,T,rho,p
0,2015-12-31T23:00:00Z,2016-01-01T00:00:00+0100,8638,0,47.5,5.625,0.81,1.88,3.36,2,10,0.052526,0.0,0.0,277.350159,1.236413,99282.710938
1,2016-01-01T00:00:00Z,2016-01-01T01:00:00+0100,8579,1,47.5,5.625,0.77,1.61,2.63,2,10,0.05251,0.0,0.0,277.025665,1.23939,99300.164062
2,2016-01-01T01:00:00Z,2016-01-01T02:00:00+0100,8542,2,47.5,5.625,0.66,1.22,1.89,2,10,0.052495,0.0,0.0,277.223755,1.243861,99310.992188
3,2016-01-01T02:00:00Z,2016-01-01T03:00:00+0100,8443,3,47.5,5.625,0.96,1.35,1.62,2,10,0.05248,0.0,0.0,277.13324,1.24739,99314.773438
4,2016-01-01T03:00:00Z,2016-01-01T04:00:00+0100,8295,4,47.5,5.625,1.14,1.56,1.83,2,10,0.05248,0.0,0.0,276.867767,1.248869,99324.796875


In [10]:
# specifying the features
X = combined[['v1', 'v2', 'v_50m', 'z0']]
y = combined['DE_wind_generation_actual']

In [13]:
#create a train-test split, 80-20% split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#shape of the feature sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7027, 4), (1757, 4), (7027,), (1757,))

In [15]:
model = SGDRegressor()
model.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [27]:
pred = model.predict(X_test)

mse = (pred - y_test)**2

mse

1441    1.875134e+07
7874    3.904032e+06
5284    3.660172e+07
380     4.071026e+06
2654    1.019077e+07
            ...     
823     1.047693e+08
31      4.536554e+07
3975    3.063684e+07
4887    3.157172e+07
5218    8.792059e+06
Name: DE_wind_generation_actual, Length: 1757, dtype: float64

In [29]:
# Now we can see the coefficients of our model
print(f'alpha = {model.intercept_}')
print(f'betas = {model.coef_}')

alpha = [7653.8939528]
betas = [-2968.89370723  2254.09830399    49.02040947   693.81647108]


In [30]:
# GET /alphas
print(f'alpha = {model.intercept_}')

alpha = [7653.8939528]


In [31]:
# GET /betas
print(f'betas = {model.coef_}')

betas = [-2968.89370723  2254.09830399    49.02040947   693.81647108]


In [32]:
# Mock request object for local API testing
headers = {'content-type': 'application/json'}
body = json.dumps({"data": [[1.44, 1.77, 2, 0.054]]})
REQUEST = json.dumps({'headers': headers, 'body': body})


In [33]:
# POST /prediction
body = json.loads(REQUEST)['body']
# predict the cluster for new samples. Function to be exposed through Titan
input_params = json.loads(body)['data']
#input_params = [[0.44, 1.77, 2, 0.054]]
print(model.predict(input_params))

[7503.94792084]
