In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.models import Sequential
from collections import Counter

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

### **Loading Data**

In [None]:
# Load the raw dataset. The path is relative, so the CSV must be in the working directory.
data = pd.read_csv("SolarPrediction.csv")

In [None]:
# Preview the first five rows of the raw data.
data.head(5)

In [None]:
# Summarise the column dtypes and non-null counts of the raw data.
data.info()

## **Data Wrangling**

In [None]:
# Work on a copy so the raw `data` frame is not modified by the wrangling steps.
df = data.copy()

Extract the date portion from the date-time string stored in the 'Data' column

In [None]:
# Keep only the first whitespace-separated token of every 'Data' entry.
df["Data"] = [stamp.split()[0] for stamp in df["Data"]]

In [None]:
# Check that 'Data' now holds only the leading token of the original string.
df.head()

In [None]:
# Parse each text column exactly once and reuse the result; converting the same
# column again for every derived field repeats the expensive string parsing.
date_parts = pd.to_datetime(df['Data'])
time_parts = pd.to_datetime(df['Time'])

# Calendar components taken from the 'Data' column.
df['Month'] = date_parts.dt.month
df['Day'] = date_parts.dt.day

# Clock components taken from the 'Time' column.
df['Hour'] = time_parts.dt.hour
df['Minute'] = time_parts.dt.minute
df['Second'] = time_parts.dt.second


In [None]:
# Inspect the newly added Month / Day / Hour / Minute / Second columns.
df.head()

Extract the sunrise and sunset hour and minute using regular expressions

In [None]:
# The leading digits give the hour; the digits enclosed by colons give the minute.
hour_regex = re.compile(r'^\d+')
minute_regex = re.compile(r'(?<=\:)\d+(?=\:)')

# Same extraction for both source columns; the prefix selects the output column names.
for source_col, prefix in (('TimeSunRise', 'rise'), ('TimeSunSet', 'set')):
    df[prefix + 'hour'] = df[source_col].map(lambda text: hour_regex.search(text).group(0)).astype(int)
    df[prefix + 'minute'] = df[source_col].map(lambda text: minute_regex.search(text).group(0)).astype(int)

In [None]:
# Inspect the risehour / riseminute / sethour / setminute columns.
df.head()

In [None]:
# Review dtypes and non-null counts after the feature extraction steps.
df.info()

Drop the columns that are no longer required

In [None]:
# Remove the raw timestamp/text columns in place; their information now lives in the derived columns.
obsolete_columns = ['UNIXTime', 'Data', 'Time', 'TimeSunRise', 'TimeSunSet']
df.drop(columns=obsolete_columns, inplace=True)

In [None]:
# Confirm the dropped columns are gone.
df.head()

In [None]:
# (rows, columns) of the wrangled frame.
df.shape

In [None]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

In [None]:
# Preview the frame again before separating features and target.
df.head()

In [None]:
# Show the Radiation values as a plain NumPy array.
df['Radiation'].to_numpy()

In [None]:
# 'Radiation' is the prediction target; every other column is an input feature.
target = df['Radiation']
input_features = df.drop(columns=['Radiation'])

### Feature Selection Using Correlation Matrix

In [None]:
# Pairwise correlation between all columns of df, using DataFrame.corr's default settings.
corr_matrix = df.corr()
corr_matrix

In [None]:
# Draw the correlation matrix computed in the previous cell rather than
# recomputing df.corr() a second time.
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True)
plt.title('Feature correlation matrix')
# Render the figure explicitly so the cell output is the plot, not the Axes repr.
plt.show()

### Feature Selection Using SelectKBest Method

In [None]:
# Univariate selector scored with chi2, configured to keep the 10 highest-scoring features.
bestfeatures = SelectKBest(score_func=chi2, k=10)

In [None]:
# Encode every distinct Radiation value as an integer class label.
# NOTE(review): train_Y is not referenced again in the visible cells - confirm whether it is still needed.
label_encoder = LabelEncoder()
train_Y = label_encoder.fit_transform(target)
train_Y

In [None]:
# Scale Radiation by 100 and truncate to an integer so it can be used as a discrete label.
target_cont = (df['Radiation'] * 100).astype('int64')
target_cont

In [None]:
# Rescale the features with MinMaxScaler before handing them to the chi2 selector.
feature_scaler = MinMaxScaler()
scaled_input_features = feature_scaler.fit_transform(input_features)

# Score every feature against the discretised target.
fit = bestfeatures.fit(scaled_input_features, target_cont)

In [None]:
# Wrap the selector scores and the feature names in single-column frames.
scores = pd.DataFrame(fit.scores_)
column = pd.DataFrame(input_features.columns)

In [None]:
# Place feature names and scores side by side (column-wise concatenation).
featureScores = pd.concat([column, scores], axis=1)

In [None]:
# Inspect the combined name/score frame.
featureScores

In [None]:
# Give the two columns descriptive names.
featureScores.columns = ['Features', 'feature_imp']

In [None]:
# Rank the features from highest to lowest score.
featureScores = featureScores.sort_values(by='feature_imp', ascending=False)
featureScores

In [None]:
# Bar chart of the SelectKBest (chi2) score of every feature.
plt.figure(figsize=(10, 6))
plt.bar(featureScores.Features, featureScores.feature_imp)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
# The plotted values come from SelectKBest, so the title names that method.
plt.title('Feature importance using SelectKBest (chi2)')
plt.show()

### Feature Selection Using Extra Tree Classifier

In [None]:
# Extra-trees ensemble used only to rank features by importance.
# random_state is fixed so the randomised trees, and therefore the importances,
# are identical on every run of the notebook.
model = ExtraTreesClassifier(n_estimators=10, verbose=2, random_state=1)
model.fit(scaled_input_features, target_cont)

In [None]:
# One row per feature, ordered from most to least important.
feature_importances = pd.DataFrame(
    {'feature_imp': model.feature_importances_},
    index=input_features.columns,
).sort_values(by='feature_imp', ascending=False)
feature_importances

In [None]:
# Bar chart of the Extra Trees importances, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(feature_importances.index, feature_importances['feature_imp'])
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Feature importance using Extra Tree Classifier')
plt.show()

### Feature Engineering With BoxCox, Log, Min-Max and Standard Transformation

In [None]:
features_to_transform = ['Temperature', 'Pressure', 'Humidity', 'Speed', 'WindDirection(Degrees)']

# For every feature draw five stacked histograms: the raw values and four candidate transformations.
for i in features_to_transform:
    fig, axes = plt.subplots(5, 1, figsize=(10, 5))

    raw_values = input_features[i]
    # The scalers expect a 2-D column vector.
    column_vector = np.array(raw_values).reshape(-1, 1)

    panels = (
        ('Normal', raw_values),
        # +1 shift keeps zero values valid for log and Box-Cox.
        ('Log', (raw_values + 1).transform(np.log)),
        ('BoxCox', stats.boxcox(raw_values + 1)[0]),
        ('Standard', StandardScaler().fit_transform(column_vector)),
        ('MinMax', MinMaxScaler().fit_transform(column_vector)),
    )

    for axis, (panel_label, panel_values) in zip(axes, panels):
        pd.DataFrame(panel_values).hist(ax=axis, bins=50)
        axis.set_ylabel(panel_label)


In [None]:
# Transformation chosen for each feature. Every column is named explicitly so the
# cell does not depend on a loop variable left over from another cell.
transform = {
    "Temperature": (input_features["Temperature"] + 1).transform(np.log),
    "Pressure": stats.boxcox(input_features["Pressure"] + 1)[0],
    "Humidity": stats.boxcox(input_features["Humidity"] + 1)[0],
    "Speed": (input_features["Speed"] + 1).transform(np.log),
    # StandardScaler needs a 2-D column vector; ravel() flattens the result back
    # to 1-D so it can be assigned as an ordinary DataFrame column.
    "WindDirection(Degrees)": StandardScaler()
    .fit_transform(np.array(input_features["WindDirection(Degrees)"]).reshape(-1, 1))
    .ravel(),
}

In [None]:
# Overwrite each listed column with its transformed values.
for feature_name, transformed_values in transform.items():
    input_features[feature_name] = transformed_values

In [None]:
# Preview the features after the transformations have been written back.
input_features.head()

### Preparing Data - Splitting and Standardisation

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    input_features, target, test_size=0.2, random_state=1
)

# Learn the scaling statistics from the training rows only, then apply them to both sets.
scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)


In [None]:
# (rows, features) of the scaled training and test matrices.
xtrain.shape, xtest.shape

## XGBoost (Boosting Algorithm)

In [None]:
from xgboost import XGBRegressor

In [None]:
#Declaring the parameters
params = {
    'learning_rate': 0.1,
    'max_depth': 8}

from xgboost import XGBRegressor
model = XGBRegressor(**params)

In [None]:
#training the model
model.fit(xtrain, ytrain)

In [None]:
# Predictions for the held-out test rows.
y_pred = model.predict(xtest)

In [None]:
# Root-mean-squared error of the XGBoost predictions on the test set.
xgb_rmse = np.sqrt(mean_squared_error(ytest, y_pred))
print('XGBoost model result: {0:0.4f}'.format(xgb_rmse))

In [None]:
# Test-set metrics for the XGBoost model.
r2 = r2_score(ytest, y_pred)
rmse = np.sqrt(mean_squared_error(ytest, y_pred))

print("Testing performance")
for metric_template, metric_value in (("RMSE: {:.2f}", rmse), ("R2: {:.2f}", r2)):
    print(metric_template.format(metric_value))

### MultiLayer Perceptron for prediction

In [None]:
# (rows, features) of the feature frame; the second value is the network's input width.
input_features.shape

In [None]:
# Rebuild the same 80/20 split (same seed) and standardise it for the network.
xtrain, xtest, ytrain, ytest = train_test_split(
    input_features, target, test_size=0.2, random_state=1
)

scaler = StandardScaler()
scaler.fit(xtrain)  # statistics come from the training rows only
xtrain, xtest = scaler.transform(xtrain), scaler.transform(xtest)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

In [None]:
# Take the input width from the training matrix instead of hard-coding it, so the
# network keeps matching the data if the feature set changes.
n_features = xtrain.shape[1]

# Three hidden ReLU layers (128 -> 64 -> 32), each followed by dropout, and a
# single linear output unit for the regression target.
model = Sequential()

model.add(Dense(128, activation='relu', input_dim=n_features))
model.add(Dropout(0.33))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.33))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.33))

model.add(Dense(1, activation='linear'))

# Optimise mean absolute error; mean squared error is tracked as an extra metric.
model.compile(metrics=['mse'], loss='mae', optimizer=Adam(learning_rate=0.001))
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


In [None]:
# Train the network and keep the returned History object: the plots below read
# it, and without this call `history` is undefined and the model is never fitted.
# Epochs, batch size and validation share are starting values - tune as needed.
history = model.fit(xtrain, ytrain, validation_split=0.1, epochs=50, batch_size=32)

# One figure per recorded series (loss/metric, plus their validation counterparts).
fit = history.history
for i in fit:
    plt.plot(fit[i])
    plt.title(i + ' over epochs')
    plt.ylabel(i)
    plt.xlabel('epochs')
    plt.show()

In [None]:
# evaluate() returns the loss first (MAE here), followed by the compiled metric (MSE).
scores = model.evaluate(xtest, ytest)
mae, mse = scores[0], scores[1]
print('Mean absolute error: ', mae)

In [None]:
# Cross-check the MAE with scikit-learn on the network's test-set predictions.
mlp_predictions = model.predict(xtest)
mean_absolute_error(ytest, mlp_predictions)