In [None]:
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split, KFold
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor

In [None]:
import os

# Switch into the dataset folder so the bare CSV file names used by later
# cells resolve. The guard makes this cell safe to re-run: once the working
# directory is already the dataset folder, the relative path below would no
# longer exist and an unguarded chdir would raise FileNotFoundError.
DATA_DIR = '../input/solar-radiation-prediction'
if os.path.basename(os.getcwd()) != os.path.basename(DATA_DIR):
    os.chdir(DATA_DIR)

In [None]:
# Read the training and test splits from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv'))

# Data Exploration

In [None]:
# Peek at the first five rows of the raw training frame.
train.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Total number of missing cells across the whole training frame.
train.isna().sum().sum()

In [None]:
# One histogram per column that pandas can plot, laid out on an 8x8 inch figure.
train.hist(figsize=(8, 8))
plt.show()

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# At this point `train` still holds the string date/time columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# frame to numeric dtypes explicitly (this works on older pandas as well).
train.select_dtypes(include='number').corr()['Radiation'].sort_values(ascending=False)

In [None]:
# Distribution of the target. `sns.distplot` is deprecated; `histplot` with a
# KDE overlay and density scaling is its documented replacement. Drawing on the
# explicit axes also makes use of the `ax` created here.
f, ax = plt.subplots(figsize=(6, 6))
sns.histplot(train['Radiation'], kde=True, stat='density', ax=ax)
ax.set_xlim(1.13, 1602)  # same x-range as before
ax.set(title='Distribution of Radiation', xlabel='Radiation', ylabel='Density')
plt.show()

We realized that all the time measurements were recorded in a timezone 10 hours behind UTC, which corresponds to Hawaiian time (UTC−10): subtracting the local timestamps from the UTC timestamps below gives a constant offset of 10 hours.

In [None]:
def get_date(s):
    """Return the first whitespace-separated token of ``s``.

    Used to keep only the date part of a "date time ..." string.
    Raises IndexError if ``s`` contains no non-whitespace characters.
    """
    first_token = s.split(maxsplit=1)[0]
    return first_token

In [None]:
# Join the date part of 'Data' with the 'Time' string, then parse each row
# into a (timezone-naive) pandas Timestamp.
local_datetime_text = train['Data'].apply(get_date) + ' ' + train['Time']
datetime_local = local_datetime_text.apply(pd.Timestamp)

In [None]:
# Interpret UNIXTime as seconds since the epoch and convert to naive UTC
# timestamps. The vectorized pd.to_datetime replaces a row-wise call to
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
datetime_utc = pd.to_datetime(train['UNIXTime'], unit='s')

In [None]:
# Tabulate the distinct UTC-minus-local offsets across all rows.
utc_minus_local = datetime_utc - datetime_local
utc_minus_local.value_counts()

# Feature Engineering

We noticed that time was represented by several columns in the dataset. In order to use the Time, TimeSunRise and TimeSunSet measurements, we converted all of those time columns to the number of seconds elapsed since midnight of the current day.
We kept UNIXTime as an absolute representation and extracted a relative representation with respect to sunrise and sunset. Hence, we created a new column called TimeFromSunRise. This column holds the difference in seconds between the time of data collection and sunrise: a negative value represents a measurement taken before sunrise and a positive value represents one taken after it. We did the same with TimeFromSunSet. We then dropped the original columns, because all the relevant information is contained in the two newly created columns.

In [None]:
def time_to_seconds(s):
    """Convert an ``HH:MM:SS`` string to seconds elapsed since midnight.

    Returns a float. Raises ValueError if ``s`` does not match ``%H:%M:%S``.
    """
    parsed = datetime.datetime.strptime(s, "%H:%M:%S")
    # strptime fills in a placeholder date; subtracting that same day's
    # midnight leaves only the time-of-day component.
    midnight = parsed.replace(hour=0, minute=0, second=0)
    return (parsed - midnight).total_seconds()

In [None]:
# Turn the three clock-time string columns into seconds since midnight.
for clock_col in ('Time', 'TimeSunRise', 'TimeSunSet'):
    train[clock_col] = train[clock_col].apply(time_to_seconds)

# Signed offsets in seconds: negative before the event, positive after it.
train['TimeFromSunRise'] = train['Time'] - train['TimeSunRise']
train['TimeFromSunSet'] = train['Time'] - train['TimeSunSet']

# The raw time columns and the date string are no longer needed.
train.drop(columns=['Time', 'TimeSunRise', 'TimeSunSet', 'Data'], inplace=True)

# Model Selection and Tuning

We performed a preliminary evaluation of some of the possible models to get a benchmark and tree-based models seemed a suitable choice for the problem.

In [None]:
# Hold out 20% of the training data for a quick baseline comparison.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='Radiation'),
    train['Radiation'],
    test_size=0.2,
    random_state=42,
)

# Seed the stochastic learners so the printed MSEs are reproducible across
# runs; without a seed the random forest score changes on every execution.
regressors = [LinearRegression(),
              XGBRegressor(random_state=42),
              RandomForestRegressor(random_state=42)]

for model in regressors:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    error = mean_squared_error(y_test, predictions)
    print(f'MSE {type(model).__name__}: {error}')

Given the smaller MSE we obtained with XGBoost and Random Forest, we proceeded with the tuning of these two models. However, from the outputs below, we realised that the fifth and sixth features, corresponding to WindDirection(Degrees) and Speed, have a significantly lower impact on the result than the other features. We inferred that wind data may not be that relevant in estimating solar radiation, hence we dropped those columns.

In [None]:
# Fit a seeded random forest on all candidate features and show its
# importances labelled by column name (kept in column order), so the reader
# does not have to count array positions to identify a feature.
importance_inputs = train.drop(columns='Radiation')
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(importance_inputs, train['Radiation'])
pd.Series(random_forest.feature_importances_, index=importance_inputs.columns, name='importance')

In [None]:
# Fit a default XGBoost regressor on all candidate features and show its
# importances labelled by column name (kept in column order), so the reader
# does not have to count array positions to identify a feature.
importance_inputs = train.drop(columns='Radiation')
xgb = XGBRegressor()
xgb.fit(importance_inputs, train['Radiation'])
pd.Series(xgb.feature_importances_, index=importance_inputs.columns, name='importance')

In [None]:
# Model inputs: everything except the target and the two wind columns.
excluded_columns = ['Radiation', 'WindDirection(Degrees)', 'Speed']
target = train['Radiation']
features = train.drop(columns=excluded_columns)

The models will be trained on this dataset:

In [None]:
# First five rows of the final model inputs.
features.head(n=5)

In [None]:
# RANDOM FOREST TUNING:
# For min_samples_split / min_samples_leaf, float entries are fractions of the
# sample count and int entries are absolute counts (scikit-learn semantics).
param_grid = {'n_estimators': np.arange(500, 700, 10),
              'max_depth': np.arange(4, 21),
              'min_samples_split': [0.001, 0.01, 0.1, 2],
              'min_samples_leaf': [0.001, 0.01, 0.1, 1],
              'max_features': np.arange(3, 7),
              'n_jobs': [-1]
              }

# Seed both the forest and the sampler so the 25 sampled candidates and their
# scores are reproducible from one run to the next.
rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(rf,
                                   param_distributions=param_grid,
                                   n_iter=25,
                                   scoring='neg_mean_squared_error',
                                   n_jobs=-1,
                                   refit=False,
                                   random_state=42)
random_search.fit(features, target)

# refit=False means no best estimator is kept, so surface the winning
# configuration and its (negative-MSE) score explicitly.
random_search.best_params_, random_search.best_score_

In [None]:
# XGB TUNING:
# min_samples_split / min_samples_leaf are scikit-learn tree parameters that
# XGBoost does not recognise; they were removed from the search space because
# they only diluted the 30 sampled candidates without affecting the model.
param_grid = {'learning_rate': np.arange(0.01, 0.1, 0.01),
              'n_estimators': np.arange(2, 500, 10),
              'subsample': [0.7, 0.8, 0.9, 1.0],
              'max_depth': np.arange(4, 21),
              'n_jobs': [-1]
              }

xgb = XGBRegressor()
# Seed the sampler so the same candidates are drawn on every run.
random_search = RandomizedSearchCV(xgb,
                                   param_distributions=param_grid,
                                   n_iter=30,
                                   scoring='neg_mean_squared_error',
                                   n_jobs=-1,
                                   refit=False,
                                   random_state=42)
random_search.fit(features, target)

# refit=False means no best estimator is kept, so surface the winning
# configuration and its (negative-MSE) score explicitly.
random_search.best_params_, random_search.best_score_

In [None]:
# Tuned Models:
rf = RandomForestRegressor(n_estimators = 550,
                           max_depth = 19,
                           max_features = 3,
                           random_state = 42,
                           n_jobs = -1)

# Only parameters XGBoost actually understands are passed here. The previous
# `estimators`, `min_samples_leaf` and `min_samples_split` keywords are not
# XGBoost parameters and had no effect on the fitted model, so dropping them
# leaves the model unchanged: the number of trees stays at the library default.
# TODO(review): if a specific tree count was intended, set `n_estimators`
# explicitly from the search results.
xgb = XGBRegressor(learning_rate = 0.06,
                   max_depth = 11,
                   subsample = 0.7)

# Performance Evaluation

The evaluation of the performance was carried out through 5-fold cross-validation. The tuned models performed similarly on average, but slightly differently on individual folds. We resorted to ensemble models to reduce such variance. 

In [None]:
# Shuffled 5-fold splitter with a fixed seed, shared by every evaluation below.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Named base learners, in the (name, estimator) form the ensemble classes expect.
estimators = [('rf', rf),
              ('xgb', xgb)]

In [None]:
# Cross-validate each tuned base learner; scores are negative MSE per fold.
for _, estimator in estimators:
    scores = cross_val_score(
        estimator,
        features,
        target,
        scoring='neg_mean_squared_error',
        cv=kf,
        n_jobs=-1,
    )
    print(f'{type(estimator).__name__}: scores: {scores}, avg: {np.mean(scores)}')

# Final Model

We evaluated three ensemble estimators: 

*   Voting Regressor, averaging the predictions of base estimators
*   Stacking Regressor, with Ridge Regression as final estimator
*   Stacking Regressor, with Elastic Net Regression as final estimator
 
Final results were satisfactory: this additional layer of ensemble led to a significant improvement with respect to individual models. The difference among the three was minimal and we chose Voting Regressor for our final submission. 

In [None]:
# Candidate ensembles: plain averaging, stacking with the default final
# estimator, and stacking with an elastic-net final estimator.
elastic_net = ElasticNetCV()
ensemble_models = [
    VotingRegressor(estimators, n_jobs=-1),
    StackingRegressor(estimators, cv=kf, n_jobs=-1),
    StackingRegressor(estimators, elastic_net, cv=kf, n_jobs=-1),
]

# Same protocol as for the base learners: negative MSE on each fold of `kf`.
for model in ensemble_models:
    scores = cross_val_score(
        model,
        features,
        target,
        scoring='neg_mean_squared_error',
        cv=kf,
        n_jobs=-1,
    )
    print(f'{type(model).__name__}: scores: {scores}, avg: {np.mean(scores)}')

In [None]:
# Final model: average the base learners, trained on the full training set.
voting = VotingRegressor(estimators=estimators, n_jobs=-1)
voting.fit(X=features, y=target)

In [None]:
# Apply the same time transformations to the test set as to the training set.
for clock_col in ('Time', 'TimeSunRise', 'TimeSunSet'):
    test[clock_col] = test[clock_col].apply(time_to_seconds)

# Signed offsets in seconds: negative before the event, positive after it.
test['TimeFromSunRise'] = test['Time'] - test['TimeSunRise']
test['TimeFromSunSet'] = test['Time'] - test['TimeSunSet']

# Drop the raw time columns, the date string and the two wind columns that
# were excluded from the training features.
unused_columns = ['Time', 'TimeSunRise', 'TimeSunSet', 'Data', 'WindDirection(Degrees)', 'Speed']
test.drop(columns=unused_columns, inplace=True)

In [None]:
# Select the test columns by name in the exact order the model was trained on,
# so any extra or reordered column in the test file cannot silently misalign
# the inputs.
prediction = voting.predict(test[features.columns])

# NOTE(review): assumes test_target.csv holds only the target values, one row
# per test row and in the same order - confirm against the file.
test_target = pd.read_csv('test_target.csv')

# scikit-learn's signature is (y_true, y_pred); pass the ground truth first.
mean_squared_error(test_target, prediction)