# Solar Radiation Prediction

In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory, one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

*The dataset contains meteorological data from the HI-SEAS Habitat in Hawaii and the aim is to predict solar irradiance from other meteorological parameters contained within the dataset.*

***Importing Libraries***

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
from pytz import timezone
import pytz

***Reading the data***

In [15]:
# Path of the solar-radiation CSV inside the Kaggle input directory.
csv_path = '/kaggle/input/SolarEnergy/SolarPrediction.csv'
df = pd.read_csv(csv_path)

***Analyzing rows and columns***

In [16]:
# Display the first rows of the freshly loaded frame to inspect its columns.
df.head()

In [17]:
# Display the last rows of the frame as a check on the end of the file.
df.tail()

***Data Preprocessing Steps***

In [18]:
# Index the frame by measurement time expressed in Hawaii local time.
# UNIXTime is interpreted as seconds since the epoch (UTC) and then converted.
hawaii = timezone('Pacific/Honolulu')
utc_index = pd.DatetimeIndex(pd.to_datetime(df['UNIXTime'], unit='s')).tz_localize(pytz.utc)
df.index = utc_index.tz_convert(hawaii)

In [19]:

# Derive calendar and time-of-day features from the frame's DatetimeIndex.
# Integer accessors are used directly instead of formatting each timestamp to a
# string and parsing it back to an int.
local_time = df.index
df['MonthOfYear'] = local_time.month
df['DayOfYear'] = local_time.dayofyear
# '%U' numbers weeks with Sunday as the first day of the week; there is no
# equivalent DatetimeIndex accessor, so this one still goes through strftime.
df['WeekOfYear'] = local_time.strftime('%U').astype(int)
df['TimeOfDay(h)'] = local_time.hour
df['TimeOfDay(m)'] = local_time.hour * 60 + local_time.minute
df['TimeOfDay(s)'] = local_time.hour * 60 * 60 + local_time.minute * 60 + local_time.second

# Parse the sunrise/sunset columns with the '%H:%M:%S' format so that the
# .dt accessor can be used on them in later cells.
for column in ('TimeSunRise', 'TimeSunSet'):
    df[column] = pd.to_datetime(df[column], format='%H:%M:%S')


In [20]:
def _seconds_since_midnight(times):
    """Return hour*3600 + minute*60 + second for a datetime Series."""
    clock = times.dt
    return clock.hour * 60 * 60 + clock.minute * 60 + clock.second


# Day length is the sunset time minus the sunrise time, both in seconds of the day.
df['DayLength(s)'] = (
    _seconds_since_midnight(df['TimeSunSet'])
    - _seconds_since_midnight(df['TimeSunRise'])
)

In [21]:
# The raw date/time columns are no longer needed once the derived features exist.
raw_time_columns = ['Data', 'Time', 'TimeSunRise', 'TimeSunSet']
df = df.drop(columns=raw_time_columns)
df.head()

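*The preprocessing steps above convert the raw timestamps to Hawaii local time and derive calendar and time-of-day features (month, day of year, week of year, hour, minute and second of the day) together with the day length. These features are useful for the analysis and also make the data ready for visualization.*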

***Data Visualization***

In [22]:
def _mean_by(key):
    """Return the per-group mean of every column of df, with `key` kept as a column."""
    return df.groupby(key).mean().reset_index()


# Mean of every variable per month, week, day of year and hour of day.
grouped_m = _mean_by('MonthOfYear')
grouped_w = _mean_by('WeekOfYear')
grouped_d = _mean_by('DayOfYear')
grouped_h = _mean_by('TimeOfDay(h)')


In [23]:
def _ranked_barplot(ax, data, x, y, palette_name, title):
    """Draw `y` against `x` from `data` as bars on `ax`, coloured by value rank.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : DataFrame holding the `x` and `y` columns (one row per bar).
    x, y : column names for the bar positions and bar heights.
    palette_name : seaborn palette name; one colour is generated per row of `data`.
    title : title placed on `ax`.
    """
    ax.set_title(title)
    colours = sns.color_palette(palette_name, len(data))
    # The double argsort converts the values into 0-based ranks; indexing the
    # reversed palette by rank gives the smallest bar the last palette colour.
    rank = data[y].argsort().argsort()
    sns.barplot(x=x, y=y, data=data, palette=np.array(colours[::-1])[rank], ax=ax)


# One entry per subplot row: (column, y-limits or None, hourly palette, monthly palette).
panel_rows = [
    ('Radiation', None, 'pastel', 'husl'),
    ('Temperature', (45, 60), 'flare', 'YlOrRd_r'),
    ('Pressure', (30.36, 30.46), 'husl', 'Set2'),
    ('Humidity', (60, 85), 'light:#5A9', 'ch:s=.25,rot=-.25'),
]

f, axes = plt.subplots(4, 2, sharex='col', sharey='row', figsize=(14, 12))
((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = axes

# Fix the y-ranges before drawing; rows share their y-axis, so setting the
# limits on the left-hand axes is enough.
for (left_ax, _), (_, ylim, _, _) in zip(axes, panel_rows):
    if ylim is not None:
        left_ax.set_ylim(*ylim)

last_row = len(panel_rows) - 1
for row, (column, _, hour_palette, month_palette) in enumerate(panel_rows):
    hour_ax, month_ax = axes[row]
    _ranked_barplot(hour_ax, grouped_h, 'TimeOfDay(h)', column,
                    hour_palette, 'Mean {} by Hour'.format(column))
    _ranked_barplot(month_ax, grouped_m, 'MonthOfYear', column,
                    month_palette, 'Mean {} by Month'.format(column))
    # Columns share their x-axis, so only the bottom row keeps its x label.
    if row < last_row:
        hour_ax.set_xlabel('')
        month_ax.set_xlabel('')

plt.show()

In [24]:
# Leave the listed time encodings and the raw timestamp out of the correlation matrix.
excluded_columns = ['TimeOfDay(h)', 'TimeOfDay(m)', 'TimeOfDay(s)', 'UNIXTime', 'MonthOfYear', 'WeekOfYear']
corrmat = df.drop(columns=excluded_columns).corr()

f, ax = plt.subplots(figsize=(7, 7))
# The colour scale is clipped to [-0.8, 0.8].
sns.heatmap(corrmat, vmin=-.8, vmax=.8, square=True, ax=ax)
plt.show()

*The bar plots above visualize the hourly and monthly means of the derived variables. The heatmap above shows that temperature is strongly positively correlated, and humidity negatively correlated, with solar irradiance. Solar irradiance and temperature both peak at approximately noon. Additionally, the monthly means of both solar irradiance and temperature appear to decrease as winter approaches, except for a very slight increase in solar irradiance from September to October.*

***Separating the Independent and Dependent Variables***

In [25]:
# Candidate predictors (independent variables) and the target to predict.
feature_columns = [
    'Temperature',
    'Pressure',
    'Humidity',
    'WindDirection(Degrees)',
    'Speed',
    'DayOfYear',
    'TimeOfDay(s)',
]
X = df[feature_columns]
y = df['Radiation']

***Splitting the Dataset into train and test set***

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

***Feature Selection***

In [27]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Backward feature elimination: fit on all features, then repeatedly drop the
# feature with the lowest importance, refit, and record the 5-fold
# cross-validated r2 score of the remaining feature set.
regressor = XGBRegressor(n_estimators=1000, max_depth=3, subsample=0.8, random_state=0)
regressor.fit(X_train, y_train)
feature_importances = regressor.feature_importances_
X_train_opt = X_train.copy()

removed_series = []  # columns popped from X_train_opt, in removal order
models = []          # comma-separated names of the features kept in each round
r2s_opt = []         # mean cross-validated r2 for each round

# Five elimination rounds are run, one feature removed per round.
for _ in range(5):
    least_important = np.argmin(feature_importances)
    removed_series.append(X_train_opt.pop(X_train_opt.columns[least_important]))
    regressor.fit(X_train_opt, y_train)
    feature_importances = regressor.feature_importances_
    accuracies = cross_val_score(estimator=regressor,
                                 X=X_train_opt,
                                 y=y_train,
                                 cv=5,
                                 scoring='r2')
    r2s_opt.append(accuracies.mean())
    models.append(", ".join(X_train_opt.columns))

# DataFrame.append was removed in pandas 2.0; building the frame once from the
# collected Series gives the same layout (one row per removed feature).
removed_columns = pd.DataFrame(removed_series)

feature_selection = pd.DataFrame({'Features': models, 'r2 Score': r2s_opt})
feature_selection.head()

*Here we determine which features work best for the chosen model, XGBoost. The least important feature is removed one round at a time, the regressor is refit on the remaining features, and the cross-validated r2 score is used as the metric to decide which group of features is best.*

***Fitting the Regressor to the Key Features***

In [28]:
# Refit the regressor on the chosen feature subset, using the same columns
# for the training and test frames.
best_features = ['Temperature', 'Pressure', 'Humidity', 'DayOfYear', 'TimeOfDay(s)']
X_train_best = X_train[best_features]
X_test_best = X_test[best_features]
regressor.fit(X_train_best, y_train)

***Predicting the Test Set***

In [29]:
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

# Score the refit regressor on the held-out test set.
y_pred = regressor.predict(X_test_best)

# Results are stored under their own names so the imported metric functions are
# not overwritten by floats and stay callable in later cells.
explained_variance = explained_variance_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print('explained variance = {}'.format(explained_variance))
print('mse = {}'.format(mse))
print('r2 = {}'.format(r_squared))

The variables most relevant to the prediction of solar irradiance were found to be temperature, pressure, humidity, DayOfYear, and TimeOfDay(s), which is supported by the r2 scores of the feature groups evaluated during feature selection. The XGBoost regressor also performed well on the test data, with an r2 score of around 0.92 (92%). To create a more useful model, the regressor should be trained on data recorded over several years.
