### Import libaries

In [45]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf
import random
import sklearn

%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

### Clean data

In [46]:
#Read dataset
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')

#add location to each sample
train_a["location"] = "A"
train_b["location"] = "B"
train_c["location"] = "C"

X_train_estimated_a["location"] = "A"
X_train_estimated_b["location"] = "B"
X_train_estimated_c["location"] = "C"

X_train_observed_a["location"] = "A"
X_train_observed_b["location"] = "B"
X_train_observed_c["location"] = "C"

#remove extra minute 00 sample
X_train_observed_a = X_train_observed_a.iloc[:-1,:]
X_train_observed_b = X_train_observed_b.iloc[:-1,:]
X_train_observed_c = X_train_observed_c.iloc[:-1,:]

#add date_calc column same as date_forecast column to observed data
X_train_observed_a.insert(0, "date_calc", X_train_observed_a["date_forecast"])
X_train_observed_b.insert(0, "date_calc", X_train_observed_b["date_forecast"])
X_train_observed_c.insert(0, "date_calc", X_train_observed_c["date_forecast"])

#concat all the samples
X_train_raw = pd.concat([X_train_observed_a,
                     X_train_observed_b,
                     X_train_observed_c,
                     X_train_estimated_a,
                     X_train_estimated_b,
                     X_train_estimated_c])

#feature indicating time between date_calc and date_forecast
X_train_raw["calc_time"] =(X_train_raw["date_forecast"] - X_train_raw["date_calc"]).astype('timedelta64[s]')

#fill nans
X_train_raw["snow_density:kgm3"] = X_train_raw["snow_density:kgm3"].apply(
    lambda a : np.isnan(a)
    ).map({True: 0, False: 1})
X_train_raw["ceiling_height_agl:m"] = X_train_raw["ceiling_height_agl:m"].apply(
    lambda a : -1000 if np.isnan(a) else a
)
X_train_raw["cloud_base_agl:m"] = X_train_raw["ceiling_height_agl:m"].apply(
    lambda a : -1000 if np.isnan(a) else a
)

#create seperate dataframes for measurments at minute 00, 15, 30 and 45
X_train00 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 0)].reset_index().iloc[:,1:]
X_train15 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 15)].reset_index().iloc[:,1:]
X_train30 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 30)].reset_index().iloc[:,1:]
X_train45 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 45)].reset_index().iloc[:,1:]

#remove redundant data
X_train15 = X_train15.iloc[:,2:-2]
X_train30 = X_train30.iloc[:,2:-2]
X_train45 = X_train45.iloc[:,2:-2]

#join observations into single sample
X_train = X_train00.join(X_train15, lsuffix="_00", rsuffix="_15").join(X_train30.join(X_train45, lsuffix="_30", rsuffix="_45"))

#rename column for merging with targets
X_train = X_train.rename(columns={"date_forecast" : "time"})

#concat target values and drop NaN values
targets = pd.concat([train_a,
                     train_b,
                     train_c]).dropna()

#merge weatherfeatures with corresponding target pv measurement
dataset = pd.merge(X_train, targets, how="right", on=["time", "location"])

#shuffle dataset
dataset = dataset.sample(frac=1, random_state=43).reset_index().iloc[:,1:]

#split into features and targets
datasetX = dataset.iloc[:, :-1]
datasetY = dataset.iloc[:, -1:]

#add day and hour feature columns
datasetX["day"] = datasetX["time"].dt.day_of_year
datasetX["hour"] = datasetX["time"].dt.hour

#get indexes of samples in the months of the test dataset
indexMayJuneJuly = datasetX["time"].apply(lambda time : time.month in [5, 6, 7])

#OHE encoding for catagorical feature "location"

# Do not include the data because it could overfit the model
"""
datasetX["location_A"] = datasetX["location"].apply(lambda a : a == "A").map({True: 1, False: 0})
datasetX["location_B"] = datasetX["location"].apply(lambda a : a == "B").map({True: 1, False: 0})
datasetX["location_C"] = datasetX["location"].apply(lambda a : a == "C").map({True: 1, False: 0})
"""

# Therefore also drop location column
datasetX = datasetX.drop("location", axis=1)

#drop time and date_calc columns
datasetX = datasetX.iloc[:,2:]

#calculate mean and std for normalizing data, values should also be used for normalizing test data
dataMean = datasetX.mean()
dataStd = datasetX.std()

#normalize data
datasetX.iloc[:,:-4] = ((datasetX.iloc[:,:-4]-dataMean[:-4])/dataStd[:-4]).fillna(value=0)

#partition into training and evalset
trainsetX = datasetX.iloc[:85000,:]
trainsetY = datasetY.iloc[:85000,:]
trainsetIndexMayJuneJuly = indexMayJuneJuly[:85000]
evalsetX = datasetX.iloc[85000:,:]
evalsetY = datasetY.iloc[85000:,:]
evalsetIndexMayJuneJuly = indexMayJuneJuly[85000:]

#display(datasetX)

  datasetX.iloc[:,:-4] = ((datasetX.iloc[:,:-4]-dataMean[:-4])/dataStd[:-4]).fillna(value=0)


### Clean up highly correlated features

In [47]:
# Create correlation matrix
corr_matrix = trainsetX.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.75
to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]

trainsetX = trainsetX.drop(to_drop, axis=1)
evalsetX = evalsetX.drop(to_drop, axis=1)

## Tuning hyperparameters

In [48]:
# Import tools
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ShuffleSplit

### Define function for doing the randomized search

In [51]:
def searchBestFeatures(regressor, parameters):
    # define splits for CV
    sss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=43)

    # define search
    clf = RandomizedSearchCV(regressor, param_distributions=parameters ,random_state=43, verbose=3, n_iter=10, cv=sss, scoring="neg_mean_absolute_error", n_jobs=-1)

    # perform the search
    search = clf.fit(trainsetX, trainsetY.values.ravel())

    # report the best results
    return search.best_params_

### Find the best hidden_layer_size

In [56]:
regressor = MLPRegressor(random_state=43, max_iter=1500, activation="logistic", solver="adam")

# parameters = {'hidden_layer_sizes':((8,), (16, 8,), (16,), (32, 16,), (32,), (64, 32), (64,))}

parameters = {'hidden_layer_sizes':((256, 128), (256, 128, 64))}

results = searchBestFeatures(regressor, parameters)

print(results)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV 4/5] END ...hidden_layer_sizes=(256, 128);, score=-96.396 total time= 4.9min
[CV 1/5] END ...hidden_layer_sizes=(256, 128);, score=-94.304 total time= 5.3min
[CV 2/5] END ...hidden_layer_sizes=(256, 128);, score=-95.398 total time= 5.3min
[CV 1/5] END hidden_layer_sizes=(256, 128, 64);, score=-107.453 total time= 6.0min
[CV 5/5] END ...hidden_layer_sizes=(256, 128);, score=-93.104 total time= 6.1min
[CV 2/5] END hidden_layer_sizes=(256, 128, 64);, score=-100.681 total time= 6.9min
[CV 3/5] END ...hidden_layer_sizes=(256, 128);, score=-97.665 total time= 7.2min
[CV 3/5] END hidden_layer_sizes=(256, 128, 64);, score=-92.981 total time= 7.7min


KeyboardInterrupt: 

#### Results

The best layer sizes were (64, 32)

### Find the best activation and solver

In [None]:
# updating the regressor with the best parameters
regressor = MLPRegressor(random_state=43, max_iter=1500, hidden_layer_sizes=(64, 32))

parameters = {'activation':('identity', 'logistic', 'tanh', 'relu'), 'solver':('adam',)}

results = searchBestFeatures(regressor, parameters)

print(results)

#### Results

The best combination was activation=logistic and solver=adam

# Making final model

In [74]:
# updating the regressor with the best parameters
model = MLPRegressor(random_state=43, max_iter=1500, hidden_layer_sizes=(258, 128), activation="logistic", solver="adam")

fittedModel = model.fit(trainsetX, trainsetY.values.ravel())

### Model-evaluation

In [75]:
from sklearn.metrics import mean_absolute_error

# Print results for entire dataset
print("entire:")
results = mean_absolute_error(fittedModel.predict(evalsetX), evalsetY)
print(results)

# Print results for May, June and July
print("may june july:")
resultsMayJuneJuly = mean_absolute_error(fittedModel.predict(evalsetX[evalsetIndexMayJuneJuly]), evalsetY[evalsetIndexMayJuneJuly])
print(resultsMayJuneJuly)

entire:
94.35641677739947
may june july:
189.53467374689265


### Results

258,128 - logistic: 94.36, 189.5

150,150 - logistic: 98.7, 197.1

258,128 - relu: 114, 218.7

258,128 - tanh: 102.1, 198.4

### Clean test-data

In [78]:
#Read test dataset
X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')

#add location to each sample
X_test_estimated_a["location"] = "A"
X_test_estimated_b["location"] = "B"
X_test_estimated_c["location"] = "C"

#concat all the samples and remove date_calc column
X_test_raw = pd.concat([
                     X_test_estimated_a,
                     X_test_estimated_b,
                     X_test_estimated_c])

#feature indicating time between date_calc and date_forecast
X_test_raw["calc_time"] =(X_test_raw["date_forecast"] - X_test_raw["date_calc"]).astype('timedelta64[s]')

#fill nans
X_test_raw["snow_density:kgm3"] = X_test_raw["snow_density:kgm3"].apply(
    lambda a : np.isnan(a)
    ).map({True: 0, False: 1})
X_test_raw["ceiling_height_agl:m"] = X_test_raw["ceiling_height_agl:m"].apply(
    lambda a : -1000 if np.isnan(a) else a
)
X_test_raw["cloud_base_agl:m"] = X_test_raw["ceiling_height_agl:m"].apply(
    lambda a : -1000 if np.isnan(a) else a
)

#create seperate dataframes for measurments at minute 00, 15, 30 and 45
X_test00 = X_test_raw[X_test_raw["date_forecast"].apply(lambda time: time.minute == 0)].reset_index().iloc[:,1:]
X_test15 = X_test_raw[X_test_raw["date_forecast"].apply(lambda time: time.minute == 15)].reset_index().iloc[:,1:]
X_test30 = X_test_raw[X_test_raw["date_forecast"].apply(lambda time: time.minute == 30)].reset_index().iloc[:,1:]
X_test45 = X_test_raw[X_test_raw["date_forecast"].apply(lambda time: time.minute == 45)].reset_index().iloc[:,1:]

#remove redundant data
X_test15 = X_test15.iloc[:,2:-2]
X_test30 = X_test30.iloc[:,2:-2]
X_test45 = X_test45.iloc[:,2:-2]

#join observations into single sample
X_test_estimated = X_test00.join(X_test15, lsuffix="_00", rsuffix="_15").join(X_test30.join(X_test45, lsuffix="_30", rsuffix="_45"))

#rename column for merging with targets
X_test_estimated = X_test_estimated.rename(columns={"date_forecast" : "time"})

#parse dates
parse_dates = ['time']
X_test_targets = pd.read_csv("test.csv", parse_dates=parse_dates)

#merge weatherfeatures with corresponding target pv measurement
X_test = pd.merge(X_test_estimated, X_test_targets, on=["time", "location"], how="right").iloc[:,:-2]

#add day and hour feature columns
X_test["day"] = X_test["time"].dt.day_of_year
X_test["hour"] = X_test["time"].dt.hour

# Do not include the data because it could overfit the model
"""
X_test["location_A"] = X_test["location"].apply(lambda a : a == "A").map({True: 1, False: 0})
X_test["location_B"] = X_test["location"].apply(lambda a : a == "B").map({True: 1, False: 0})
X_test["location_C"] = X_test["location"].apply(lambda a : a == "C").map({True: 1, False: 0})
"""

# Therefore also drop location column
X_test = X_test.drop("location", axis=1)

#drop time and date_calc columns
X_test = X_test.iloc[:,2:]

#normalize data
X_test.iloc[:,:-4] = ((X_test.iloc[:,:-4]-dataMean[:-4])/dataStd[:-4]).fillna(value=0)

# drop features with high correlation
X_test = X_test.drop(to_drop, axis=1)

  X_test.iloc[:,:-4] = ((X_test.iloc[:,:-4]-dataMean[:-4])/dataStd[:-4]).fillna(value=0)


In [79]:
preds = fittedModel.predict(X_test)

display(preds)

array([-0.13922708, -6.09931622,  6.10450605, ..., 32.54562509,
        5.69187449,  3.41269792])

### Actual prediction

In [80]:
test_preds = pd.read_csv("sample_submission.csv")
test_preds["prediction"] = fittedModel.predict(X_test)
display(test_preds)
test_preds.to_csv("MLPRegressor_1.csv", index=False)

Unnamed: 0,id,prediction
0,0,-0.139227
1,1,-6.099316
2,2,6.104506
3,3,17.624717
4,4,417.324748
...,...,...
2155,2155,41.869631
2156,2156,36.503372
2157,2157,32.545625
2158,2158,5.691874
