In [0]:
!wget https://storage.googleapis.com/summer_school/feats_labels.zip

In [0]:
!unzip feats_labels.zip

In [0]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.dates as md
import matplotlib.pyplot as plt
import sklearn

%matplotlib inline

In [0]:
# Load the prediction targets and index the table by its 'utc557' column.
airglow_raw = pd.read_csv('feats_labels/labels_airglow_prediction.csv')
airglow = airglow_raw.set_index('utc557')
airglow

In [0]:
# Load the input features and index the table by its 'date' column.
feats_raw = pd.read_csv('feats_labels/features_airglow_prediction.csv')
feats = feats_raw.set_index('date')
feats

In [0]:
# Inner join on the index: only rows whose index value occurs in both tables are kept.
# Feature columns come first, target columns last.
df = pd.merge(feats, airglow, how='inner', left_index=True, right_index=True)
df

### Task 1: Which feature has the greatest sigma / mean ratio?

In [0]:
# iterate over df.columns (except the last three)
# for each column compute the mean and the std
# store the results e.g. in a dictionary of key-value pairs, column_name: (std / mean)
# for which column is (std / mean) the largest?

In [0]:
# Correlation matrix (just to see the potential): keep only the rows of the last three columns
# against every column except those last three.
corr_matrix = df.corr()
corr_matrix.iloc[-3:, :-3]

In [0]:
# Separate features and labels: the last three columns of df are the targets, everything before them is X.
n_targets = 3
X = df.iloc[:, :-n_targets].values
y1, y2, y3 = (df.iloc[:, col].values for col in range(-n_targets, 0))
print(X.shape)
print(y1.shape, y2.shape, y3.shape)

### Look for outliers

In [0]:
# Doing this properly requires domain knowledge; there are probably no outliers here.
# Histogram of the second target (y2) with 50 bins.
fig, ax = plt.subplots()
ax.hist(y2, bins=50)
plt.show()

### Useful functions

In [0]:
# train/test, feature scaling and linear regression - look at the code and make sure you understand it!
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

def split_and_scale(X, y, poly_feats=False, poly_degree=2, random_state=42, shuffle=True):
    """Split the data 80/20 into train and test parts and standardize the features.

    Parameters
    ----------
    X : feature matrix passed to the transformers and to ``train_test_split``.
    y : targets, split alongside ``X``.
    poly_feats : when true, ``X`` is first expanded with ``PolynomialFeatures``
        (no bias column) of degree ``poly_degree``.
    poly_degree : degree of the polynomial expansion.
    random_state, shuffle : forwarded to ``train_test_split``.

    Returns
    -------
    (X_train_scaled, X_test_scaled, y_train, y_test). The scaler is fitted on
    the training part only and then applied to both parts. The targets are
    returned unscaled.

    Side effect: prints the number of feature columns used.
    """
    features = X
    if poly_feats:
        expander = PolynomialFeatures(degree=poly_degree, include_bias=False)
        features = expander.fit_transform(features)
    print('Number of features: ', features.shape[1])

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=random_state, shuffle=shuffle)

    # Fit on the training part only so that no test-set statistics enter the scaling.
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

def fit_eval_regression_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and print train/test quality figures.

    For each split it prints ``model.score(...)`` followed by the mean absolute
    error of the predictions. It then prints a baseline MAE: the error on the
    test targets of always predicting the mean of the training targets.

    The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    train_pred = model.predict(X_train)

    report = (
        ('Train', X_train, y_train, train_pred),
        ('Test', X_test, y_test, test_pred),
    )
    for split_name, inputs, targets, preds in report:
        print(split_name + ' score:', model.score(inputs, targets), ' / ', mean_absolute_error(targets, preds))

    # Reference point: constant prediction equal to the training-set mean.
    train_mean = np.mean(y_train)
    print('Baseline MAE:', np.mean(np.abs(y_test - train_mean)))

### Linear regression

In [0]:
from sklearn.linear_model import LinearRegression

# Plain linear regression on the raw (scaled) features, once per target.
for y in (y1, y2, y3):
    data_splits = split_and_scale(X, y, poly_feats=False, random_state=42, shuffle=True)
    X_train, X_test, y_train, y_test = data_splits
    fit_eval_regression_model(*data_splits, model=LinearRegression())
    print('------------------------')

### Linear regression with polynomial features (degree 2)

In [0]:
# Same evaluation, but on degree-2 polynomial features, once per target.
for y in (y1, y2, y3):
    data_splits = split_and_scale(X, y, poly_feats=True, poly_degree=2, shuffle=True)
    X_train, X_test, y_train, y_test = data_splits
    fit_eval_regression_model(*data_splits, model=LinearRegression())
    print('------------------------')

### Linear regression with polynomial features (degree 3)

In [0]:
# Same evaluation, but on degree-3 polynomial features, once per target.
for y in (y1, y2, y3):
    data_splits = split_and_scale(X, y, poly_feats=True, poly_degree=3, shuffle=True)
    X_train, X_test, y_train, y_test = data_splits
    fit_eval_regression_model(*data_splits, model=LinearRegression())
    print('------------------------')

### Add regularization

### Task 2: Add regularization (model called Ridge), do it for polynomial orders 1-5 and only for i630 line

In [0]:
# for loop over polynomial order
# instead of LinearRegression, use Ridge
# tune parameter alpha (regularization strength)

### Neural networks

In [0]:
# Time for gradient descent

In [0]:
from keras.models import Sequential
# BatchNormalization is imported from keras.layers (not keras.layers.normalization): the public
# keras.layers path works on both older and newer Keras releases, while the submodule path does not
# import on newer ones.
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler

# Smallest network: one hidden layer of 32 ReLU units on 13 inputs, single linear output for regression.
model1 = Sequential([
    Dense(32, input_shape=(13,)),
    Activation('relu'),
    Dense(1)
])

model1.summary()

In [0]:
X_train, X_test, y_train, y_test = split_and_scale(X, y2, poly_feats=False)

# Step decay: the learning rate is multiplied by 0.9 whenever the epoch index is divisible by 100
# (which includes epoch index 0) and left unchanged otherwise.
optimizer = Adam(lr=1e-2)
lr_scheduler = LearningRateScheduler(
    schedule=lambda epoch, lr: lr if epoch % 100 else lr*0.9,
    verbose=1,
)

model1.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
# The test split is passed as validation data, so the per-epoch val_* entries are test-set figures.
fit_options = dict(
    epochs=10000,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[lr_scheduler],
)
history = model1.fit(X_train, y_train, **fit_options)

In [0]:
def plot_nn_results(history, mae_ylim=(50, 80), loss_ylim=(5000, 20000)):
    """Plot train/test learning curves (MAE and loss) from a Keras ``History``.

    Parameters
    ----------
    history : object returned by ``model.fit``; its ``.history`` dict must
        contain 'loss', 'val_loss' and the MAE metric with its 'val_' twin.
    mae_ylim, loss_ylim : (low, high) y-axis limits for the two figures.
        The defaults reproduce the previously hard-coded ranges; pass ``None``
        to let matplotlib autoscale.

    Raises
    ------
    KeyError if the history contains neither 'mean_absolute_error' nor 'mae'.
    """
    logs = history.history
    # Depending on the Keras version, metrics=['mae'] is logged either under the long name
    # 'mean_absolute_error' or under the short name 'mae'; accept both instead of crashing.
    mae_key = 'mean_absolute_error' if 'mean_absolute_error' in logs else 'mae'

    curves = (
        (mae_key, 'mean_absolute_error', mae_ylim),
        ('loss', 'Loss', loss_ylim),
    )
    for key, ylabel, ylim in curves:
        plt.figure()
        plt.plot(logs[key])
        plt.plot(logs['val_' + key])
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        if ylim is not None:
            plt.ylim(*ylim)
        plt.legend(['train', 'test'], loc='upper right')
        plt.show()

plot_nn_results(history)

In [0]:
# Two hidden blocks of Dense(32) -> BatchNormalization -> ReLU -> Dropout (0.2, then 0.1),
# followed by a single linear output unit.
model2 = Sequential()
model2.add(Dense(32, input_shape=(13,)))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(0.2))
model2.add(Dense(32))
model2.add(BatchNormalization())
model2.add(Activation('relu'))
model2.add(Dropout(0.1))
model2.add(Dense(1))

model2.summary()

In [0]:
X_train, X_test, y_train, y_test = split_and_scale(X, y2, poly_feats=False)

# Step decay: the learning rate is multiplied by 0.9 whenever the epoch index is divisible by 100
# (which includes epoch index 0) and left unchanged otherwise.
optimizer = Adam(lr=1e-2)
lr_scheduler = LearningRateScheduler(
    schedule=lambda epoch, lr: lr if epoch % 100 else lr*0.9,
    verbose=1,
)

model2.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
# The test split is passed as validation data, so the per-epoch val_* entries are test-set figures.
fit_options = dict(
    epochs=10000,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[lr_scheduler],
)
history = model2.fit(X_train, y_train, **fit_options)

In [0]:
# Learning curves of the most recent training run stored in `history`.
plot_nn_results(history)

In [0]:
# Look at predictions on the test set: true targets as dots, model2's predictions as a red line.
fig, ax = plt.subplots()
ax.scatter(np.arange(len(y_test)), y_test, s=2)
ax.plot(model2.predict(X_test), color='r')
plt.show()

#### Let's add some more neurons

In [0]:
# Let the students tune the dropout rates here (have them read up on dropout first).
# Same layout as the previous network, but with 64 units in each hidden layer.
model3 = Sequential()
model3.add(Dense(64, input_shape=(13,)))
model3.add(BatchNormalization())
model3.add(Activation('relu'))
model3.add(Dropout(0.2))
model3.add(Dense(64))
model3.add(BatchNormalization())
model3.add(Activation('relu'))
model3.add(Dropout(0.1))
model3.add(Dense(1))

model3.summary()

In [0]:
X_train, X_test, y_train, y_test = split_and_scale(X, y2, poly_feats=False)

# Step decay: the learning rate is multiplied by 0.9 whenever the epoch index is divisible by 100
# (which includes epoch index 0) and left unchanged otherwise.
optimizer = Adam(lr=1e-2)
lr_scheduler = LearningRateScheduler(
    schedule=lambda epoch, lr: lr if epoch % 100 else lr*0.9,
    verbose=1,
)

model3.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
# The test split is passed as validation data, so the per-epoch val_* entries are test-set figures.
fit_options = dict(
    epochs=1000,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[lr_scheduler],
)
history = model3.fit(X_train, y_train, **fit_options)

In [0]:
# Learning curves of the most recent training run stored in `history`.
plot_nn_results(history)

### Random Forest

In [0]:
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = split_and_scale(X, y2, poly_feats=False)

# 100 unrestricted-depth trees, fixed seed, trained on all available cores.
rf = RandomForestRegressor(n_estimators=100, max_depth=None, n_jobs=-1, random_state=883)
rf.fit(X_train, y_train)
rf_train_pred = rf.predict(X_train)
rf_test_pred = rf.predict(X_test)

# For each split: rf.score(...) followed by the mean absolute error.
rf_train_score = rf.score(X_train, y_train)
print('Train score:', rf_train_score, ' / ', mean_absolute_error(y_train, rf_train_pred))
rf_test_score = rf.score(X_test, y_test)
print('Test score:', rf_test_score, ' / ', mean_absolute_error(y_test, rf_test_pred))

### XGBoost regressor

In [0]:
# Let the students try tuning the regularization - via max_depth and learning_rate.
import xgboost as xgb

X_train, X_test, y_train, y_test = split_and_scale(X, y2, poly_feats=False)

xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=1,
    learning_rate=0.3,
    gamma=10,
    n_estimators=100,
    max_depth=8,
)
xg_reg = xgb.XGBRegressor(**xgb_params)

xg_reg.fit(X_train, y_train)
y_test_pred = xg_reg.predict(X_test)
y_train_pred = xg_reg.predict(X_train)

# For each split: xg_reg.score(...) followed by the mean absolute error.
xgb_train_score = xg_reg.score(X_train, y_train)
print('Train score:', xgb_train_score, ' / ', mean_absolute_error(y_train, y_train_pred))
xgb_test_score = xg_reg.score(X_test, y_test)
print('Test score:', xgb_test_score, ' / ', mean_absolute_error(y_test, y_test_pred))

## Split train/test on date - detecting data leakage

### Linear regression

In [0]:
# Run the same linear regression twice per target: with a shuffled split and with an ordered
# (shuffle=False) split, so the two sets of scores can be compared.
for shuffle_par in (True, False):
    print('SHUFFLE:', shuffle_par)
    for y in (y1, y2, y3):
        data_splits = split_and_scale(X, y, poly_feats=False, random_state=42, shuffle=shuffle_par)
        X_train, X_test, y_train, y_test = data_splits
        fit_eval_regression_model(*data_splits, model=LinearRegression())
        print('------------------------')

### Task 3a: Perform linear regression with polynomial features (degree 2) with shuffle=False for all 3 lines

### Task 3b: Perform linear regression with polynomial features (degree 3) with shuffle=False for all 3 lines

### Task 3c: Add regularization - do it for polynomial orders 1-5 and only for i630 line

### Task 3d: Random Forest Regressor - the same as before, but shuffle=False

### Task 3e: XGBoost regressor - the same as before, but shuffle=False. Also tune learning_rate and max_depth

### Feature importance

In [0]:
# Pair every feature column with its importance in the fitted XGBoost model,
# listed from the most to the least important.
feature_names = df.columns[:-3]
importance_pairs = zip(feature_names, xg_reg.feature_importances_)
sorted(importance_pairs, key=lambda pair: -pair[1])
# Interesting: "O", which shows no correlation with y2, still gets a fairly high feature importance!

In [0]:
# Scatter of the 'O' feature against the 'i630' target.
fig, ax = plt.subplots()
ax.scatter(df['O'], df['i630'], s=1)
plt.show()

In [0]:
"""
skuste si este zvysne 2 ciary nafitovat, ked zvysi cas...
"""

## Time series forecasting

In [0]:
# Time series under study: the first 2160 values of the 'Kp' feature column.
y = feats['Kp'].iloc[:2160]
fig, ax = plt.subplots(figsize=(10, 4))
y.plot(ax=ax)
plt.show()

In [0]:
# Check seasonality.
# The periodogram is computed directly with NumPy instead of importing
# statsmodels.tsa.stattools.periodogram, which is deprecated and no longer available in later
# statsmodels releases. The formula is the same: |FFT(y)|^2 / N, with the zero-frequency term
# set to 0 so the series mean does not dominate the plot.
kp_values = np.asarray(y)
kp_periodogram = 1. / len(kp_values) * np.abs(np.fft.fft(kp_values)) ** 2
kp_periodogram[0] = 0.

plt.figure()
plt.bar(range(100), kp_periodogram[:100])  # only the first 100 frequency bins
plt.title('Periodogram')
plt.show()

In [0]:
# Check trends and seasonality with an additive decomposition using a seasonal length of 24 samples.
# NOTE(review): newer statsmodels releases call this argument `period` instead of `freq` -
# confirm against the installed version.
import statsmodels.api as sm

decompose_options = dict(model='additive', freq=24)
decomposition = sm.tsa.seasonal_decompose(y, **decompose_options)
fig = decomposition.plot()
plt.show()

In [0]:
# Determine candidate lags from the ACF and PACF (first 100 lags each).
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

correlograms = (
    (plot_acf, 'Kp - Autocorrelation'),
    (plot_pacf, 'Kp - Partial autocorrelation'),
)
for draw_correlogram, plot_title in correlograms:
    draw_correlogram(y, lags=100)
    plt.title(plot_title)
    plt.show()

In [0]:
# Train/test split by position: the boundary sits at 90 % of the series length.
split_pos = int(0.9 * len(y))
tr_start = y.index[0]
tr_end = y.index[split_pos]
te_start = y.index[split_pos]
te_end = y.index[-1]
# NOTE(review): tr_end and te_start are the same index label. If these slices are label-based
# (inclusive on both ends), that boundary sample ends up in both tra and tes - confirm whether
# this one-sample overlap is intended.
tra = y[tr_start:tr_end]
tes = y[te_start:te_end]

In [0]:
# Distance between the split-boundary label and the last label of the series.
te_end - tr_end

In [0]:
# Augmented Dickey-Fuller test for stationarity (regression='ct'); element 1 of the result is the p-value.
res = sm.tsa.adfuller(y, regression='ct')
p_value = res[1]
print('p-value:{}'.format(p_value))

In [0]:
# Fit a non-seasonal model of order (4, 1, 4) on the training part and report its AIC.
sarimax_spec = sm.tsa.statespace.SARIMAX(
    tra,
    order=[4, 1, 4],
    seasonal_order=[0, 0, 0, 0],
    enforce_stationarity=False,
    enforce_invertibility=False,
)
arima = sarimax_spec.fit()
arima.aic

### Task 4: Do the grid search over the parameters p, d, q. Find the set with best AIC.

In [0]:
# Refit with the order chosen as best: (15, 1, 17), still without a seasonal component.
sarimax_spec = sm.tsa.statespace.SARIMAX(
    tra,
    order=[15, 1, 17],
    seasonal_order=[0, 0, 0, 0],
    enforce_stationarity=False,
    enforce_invertibility=False,
)
arima = sarimax_spec.fit()


In [0]:
# Look at the residuals - is there any correlation still left?
# ACF on the upper axis, PACF on the lower one, 50 lags each.
res = arima.resid
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 8))
acf_axis, pacf_axis = ax
fig = sm.graphics.tsa.plot_acf(res, lags=50, ax=acf_axis)
fig = sm.graphics.tsa.plot_pacf(res, lags=50, ax=pacf_axis)
plt.show()

In [0]:
# Calculate the prediction error over the span from tr_end to te_end.
from sklearn.metrics import mean_squared_error

pred = arima.predict(start=tr_end, end=te_end)
arima_mse = mean_squared_error(tes, pred)
print('ARIMA model MSE:{}'.format(arima_mse))

In [0]:
# Compare with the baseline (always predicting the training mean) - disappointment!
train_mean = np.mean(tra)
baseline_mse = ((tes - train_mean)**2).mean()
print('BASELINE:', baseline_mse)

In [0]:
# Wait a second, let's visualize it: test values and predictions in one plot.
comparison = pd.DataFrame({'test': tes, 'pred': pred})
comparison.plot()
plt.show()

In [0]:
# The prediction is actually pretty good when not looking that far into the future:
# compare errors on the first 8 test samples only.
horizon = 8
tes_short = tes[:horizon]
print('ARIMA model MSE:{}'.format(mean_squared_error(tes_short, pred[:horizon])))
# Baseline1: constant prediction equal to the mean of the whole training part.
print('Baseline1:', ((tes_short - np.mean(tra))**2).mean())
# Baseline2: constant prediction equal to the mean of the last 8 training samples.
print('Baseline2:', ((tes_short - tra[-horizon:].mean())**2).mean())

In [0]:
"""
What we can add:
- seasonality
- check the convergence (warnings...)
- cross-validation
- exogenous parameters (other weather indices...)

Skuste este pre ine casove rady urobit tuto analyzu
"""