### Table of Contents

1. [Libraries](#libraries)

2. [Data Preprocessing](#DataPreprocessing)

3. [Models and Hyperparametrization](#ModelsandHyperparametrization) 

4. [Validating the Models with Metrics](#Metrics)
     

# 1. Libraries <a class="anchor"  id="libraries"></a>

In [14]:
# Base Libraries
import matplotlib.pyplot as plt  
import numpy as np
import os
import pandas as pd
import seaborn as sns
# Transformation
from sklearn import preprocessing
# Models
from scipy.stats import skew
from scipy.stats import kurtosis
from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Metrics
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

# 2. Data Preprocessing <a class="anchor"  id="DataPreprocessing"></a>

In [15]:
# Print the full path of every file found under the Kaggle input directory,
# so the dataset path used below can be checked against what is mounted.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Read the comma-separated solar power generation dataset into a DataFrame.
df = pd.read_csv(
    '/kaggle/input/solar-power-generation/BigML_Dataset_5f50a4cc0d052e40e6000034.csv',
    sep=',',
)

# Show floats with four decimal places in all later pandas displays.
pd.set_option('display.float_format', '{:.4f}'.format)
df.info()

In [16]:
# Print the (rows, columns) shape, then display the first rows of the frame.
print(df.shape)
df.head()

In [17]:
# Remove the 'Month' and 'Day' columns and store the 'Is Daylight' flag as an
# integer, in one chained expression that yields a new frame.
df = (
    df
    .drop(columns=['Month', 'Day'])
    .astype({'Is Daylight': int})
)
df.head(10)


In [18]:
# Display summary statistics for the frame's columns.
df.describe()

In [19]:
# Re-inspect dtypes and non-null counts after dropping 'Month'/'Day' and
# casting 'Is Daylight' to int.
df.info()

In [20]:
# Replace every missing value with 0 (rebinding `df` rather than mutating in place).
df = df.fillna(0)

In [21]:
# Count missing values per column; every count should be 0 after fillna(0).
df.isnull().sum()

In [22]:
# NOTE(review): `isna` is the same check as `isnull` (pandas aliases), so this
# cell repeats the previous null count.
df.isna().sum()

In [23]:
# Total power generated per year. The column is selected before aggregating so
# only 'Power Generated' is summed instead of every column in the frame.
subdata = df.groupby('Year')['Power Generated'].sum()

# A Series yields a single pie, so `subplots=True` is not needed.
pie_ax = subdata.plot(kind='pie', legend=True)
pie_ax.set_title('Total Power Generated by Year')
pie_ax.set_ylabel('')  # hide the default series-name label beside the pie
plt.show()

In [24]:
# Absolute pairwise correlations between the frame's columns.
abs_corr = df.corr().abs()

# Draw the matrix as an annotated heatmap on a 12x12-inch figure.
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(data=abs_corr, ax=ax, annot=True, linewidths=.5)

In [25]:
# One 50-bin histogram per column, laid out on a 20x15-inch figure.
hist_options = {'bins': 50, 'figsize': (20, 15)}
df.hist(**hist_options)
plt.show()

In [26]:
# Kernel density estimate of 'Power Generated'.
graph = sns.displot(data=df, x="Power Generated", kind='kde', color='blue')

# Text annotations as (vertical position in axes coordinates, label).
power_generated = df['Power Generated']
stat_labels = [
    (0.97, "Skewness: %f" % power_generated.skew()),
    (0.91, "Kurtosis: %f" % power_generated.kurt()),
]

# Write both statistics into the top-right corner of each facet axis.
for ax in graph.axes.ravel():
    for y_position, label in stat_labels:
        ax.text(
            x=0.97,
            y=y_position,
            s=label,
            transform=ax.transAxes,
            fontsize=10,
            verticalalignment='top',
            horizontalalignment='right',
        )

In [27]:
# Columns whose distributions are overlaid on one shared axis.
kde_columns = [
    'Power Generated',
    'Average Temperature (Day)',
    'Average Wind Speed (Day)',
    'Relative Humidity',
    'Average Wind Speed (Period)',
    'Average Wind Direction (Day)',
]

fig, ax1 = plt.subplots(ncols=1, figsize=(10, 8))
ax1.set_title('Original Distributions')

# Each curve is labelled with its full column name. The previous hand-written
# labels gave both wind-speed columns (Day and Period) the same legend entry
# 'Average Wind Speed', so the two curves could not be told apart.
for column in kde_columns:
    sns.kdeplot(df[column], ax=ax1, label=column)

ax1.legend()
ax1.set_xlabel('Value')
# A KDE plots an estimated density, not a count, so the axis is 'Density'.
ax1.set_ylabel('Density')

In [28]:
# Columns to rescale with min-max scaling (listed once and reused on both
# sides of the assignment).
columns_to_scale = [
    'Power Generated',
    'Average Temperature (Day)',
    'Average Wind Speed (Day)',
    'Relative Humidity',
    'Average Wind Speed (Period)',
    'Average Wind Direction (Day)',
]

# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split further down, and 'Power Generated' is scaled as well. Fitting before
# the split lets test rows influence the scaling — confirm this is acceptable.
scaler = preprocessing.MinMaxScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [29]:
# Columns whose distributions are overlaid on one shared axis.
kde_columns = [
    'Power Generated',
    'Average Temperature (Day)',
    'Average Wind Speed (Day)',
    'Relative Humidity',
    'Average Wind Speed (Period)',
    'Average Wind Direction (Day)',
]

fig, ax1 = plt.subplots(ncols=1, figsize=(10, 8))
ax1.set_title('After MinMaxScaler')

# Each curve is labelled with its full column name. The previous hand-written
# labels gave both wind-speed columns (Day and Period) the same legend entry
# 'Average Wind Speed', so the two curves could not be told apart.
for column in kde_columns:
    sns.kdeplot(df[column], ax=ax1, label=column)

ax1.legend()
ax1.set_xlabel('Value')
# A KDE plots an estimated density, not a count, so the axis is 'Density'.
ax1.set_ylabel('Density')

In [30]:
# Preview the first ten rows after the min-max scaling step.
df.head(10)

# 3. Models and Hyperparametrization <a class="anchor"  id="ModelsandHyperparametrization"></a>

In [31]:
# Select the target by name and use every other column as a feature.
# The original used positional slices (df.iloc[:, 0:10] / df.iloc[:, 10]),
# which silently pick the wrong target or drop features whenever the column
# layout differs from what was assumed (e.g. after 'Month'/'Day' are dropped).
# NOTE(review): this matches the original only if 'Power Generated' is the
# 11th and last column — confirm against df.info().
TARGET_COLUMN = 'Power Generated'
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

In [32]:
# Preview the first ten rows of the feature matrix.
X.head(10)

In [33]:
# Preview the first rows of the target series.
y.head()

In [46]:
# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# The original passed test_size=0.7, which fits every model on only 30% of
# the data. NOTE(review): presumably a 70/30 train/test split was intended — confirm.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## 3.1 Testing *k* Factors <a class="anchor"  id="TestingkFactors"></a>

In [47]:
# Sweep k = 1..20 for a KNN regressor and record the RMSLE of each fit.
# NOTE(review): k is scored against the test split, so the chosen k is tuned
# on the same rows later used for evaluation — consider a validation split.
k_values = range(1, 21)
rmsle_val = []

for k in k_values:
    knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    rmsle_val.append(rmsle)
    print('RMSLE value for k= ' , k , 'is:', rmsle)

# Pick the best k from the recorded scores. The original compared against a
# 1.0 sentinel, leaving `best_k` undefined (NameError) when no RMSLE fell
# below 1.0. argmin returns the first minimum, matching the original's
# strict `<` tie-breaking.
best_position = int(np.argmin(rmsle_val))
best_k = k_values[best_position]
best_rmsle = rmsle_val[best_position]

print(f"Best RMSLE: {best_rmsle}, Best k: {best_k}")

In [48]:
# Plot RMSLE against k. The scores were appended in order for k = 1, 2, ...,
# so the index is set to start at 1; the default 0-based index would shift
# every point one step left of its actual k.
curve = pd.DataFrame(
    {'RMSLE': rmsle_val},
    index=pd.RangeIndex(start=1, stop=len(rmsle_val) + 1, name='k'),
)
curve_ax = curve.plot(figsize=(8, 5))
curve_ax.set_xlabel('k (number of neighbors)')
curve_ax.set_ylabel('RMSLE')
plt.show()

## 3.2 Tuning Hyperparameters <a class="anchor"  id="TuningHyperparameters"></a>

In [49]:
# Candidate neighbor counts 2 through 11 for the grid search.
params = {'n_neighbors': list(range(2, 12))}

# 5-fold cross-validated search over n_neighbors on the training split.
knn = neighbors.KNeighborsRegressor()
model = GridSearchCV(estimator=knn, param_grid=params, cv=5)
model.fit(X_train, y_train)

# Display the best parameter combination found.
model.best_params_

## 3.3 Models <a class="anchor"  id="Models"></a>

### 3.3.1 K-Nearest Neighbor

In [50]:
# Fit a 5-neighbor KNN regressor on the training split (`fit` returns the
# estimator itself, so construction and fitting are chained).
knn = neighbors.KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Keep the test-split predictions for the metrics section, then show the score.
y_pred_knn = knn.predict(X_test)
knn.score(X_test, y_test)

### 3.3.2 Random Forest

In [51]:
# Fit a random forest regressor with a fixed seed on the training split
# (`fit` returns the estimator itself, so construction and fitting are chained).
forest_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)

# Keep the test-split predictions for the metrics section, then show the score.
y_pred_rf = forest_model.predict(X_test)
forest_model.score(X_test, y_test)

### 3.3.3 Ensemble Stacking

In [52]:
# Named base learners for the stack: the KNN and random forest regressors.
estimators = list(zip(("knn", "rf"), (knn, forest_model)))

# Fit the stacking regressor on the training split.
ensemble_stack = StackingRegressor(estimators=estimators).fit(X_train, y_train)

# Keep the test-split predictions for the metrics section, then show the score.
y_pred_stacking = ensemble_stack.predict(X_test)
ensemble_stack.score(X_test, y_test)

# 4. Validating the Models with Metrics <a class="anchor"  id="Metrics"></a>

In [53]:
# Evaluate the KNN predictions on the test split with six regression metrics.
knn_evs_valid = explained_variance_score(y_test, y_pred_knn, multioutput='uniform_average')
knn_max_error_valid = max_error(y_test, y_pred_knn)
knn_mape_valid = mean_absolute_percentage_error(y_test, y_pred_knn)
knn_rmsle_valid = np.sqrt(mean_squared_log_error(y_test, y_pred_knn))
knn_mse_valid = mean_squared_error(y_test, y_pred_knn)
knn_r2_valid = r2_score(y_test, y_pred_knn)

# Print the report, one "caption value" line per metric.
knn_report = (
    ('KNN - EVS      Valid:', knn_evs_valid),
    ('KNN - MaxError Valid:', knn_max_error_valid),
    ('KNN - MAPE     Valid:', knn_mape_valid),
    ('KNN - RMSLE    Valid:', knn_rmsle_valid),
    ('KNN - MSE      Valid:', knn_mse_valid),
    ('KNN - R2       Valid:', knn_r2_valid),
)
for caption, value in knn_report:
    print(caption, value)

In [54]:
# Evaluate the random forest predictions on the test split with six regression metrics.
rf_evs_valid = explained_variance_score(y_test, y_pred_rf, multioutput='uniform_average')
rf_max_error_valid = max_error(y_test, y_pred_rf)
rf_mape_valid = mean_absolute_percentage_error(y_test, y_pred_rf)
rf_rmsle_valid = np.sqrt(mean_squared_log_error(y_test, y_pred_rf))
rf_mse_valid = mean_squared_error(y_test, y_pred_rf)
rf_r2_valid = r2_score(y_test, y_pred_rf)

# Print the report, one "caption value" line per metric.
rf_report = (
    ('RF - EVS      Valid:', rf_evs_valid),
    ('RF - MaxError Valid:', rf_max_error_valid),
    ('RF - MAPE     Valid:', rf_mape_valid),
    ('RF - RMSLE    Valid:', rf_rmsle_valid),
    ('RF - MSE      Valid:', rf_mse_valid),
    ('RF - R2       Valid:', rf_r2_valid),
)
for caption, value in rf_report:
    print(caption, value)

In [55]:
# Evaluate the stacking predictions on the test split with the same metrics
# reported for the KNN and random forest models.
stacking_evs_valid = explained_variance_score(y_test, y_pred_stacking, multioutput='uniform_average')
stacking_max_error_valid = max_error(y_test, y_pred_stacking)
stacking_mape_valid = mean_absolute_percentage_error(y_test, y_pred_stacking)
stacking_mse_valid = mean_squared_error(y_test, y_pred_stacking)
stacking_r2_valid = r2_score(y_test, y_pred_stacking)

# RMSLE was previously commented out, so the stacking report could not be
# compared with the other two. mean_squared_log_error rejects negative values,
# so it is computed only when all targets and predictions are non-negative and
# reported as NaN otherwise.
# NOTE(review): presumably it was disabled because the stacked model can
# predict negative values — confirm.
if np.min(y_pred_stacking) < 0 or np.min(y_test) < 0:
    stacking_rmsle_valid = np.nan
else:
    stacking_rmsle_valid = np.sqrt(mean_squared_log_error(y_test, y_pred_stacking))

print('Stacking - EVS      Valid:', stacking_evs_valid)
print('Stacking - MaxError Valid:', stacking_max_error_valid)
print('Stacking - MAPE     Valid:', stacking_mape_valid)
print('Stacking - RMSLE    Valid:', stacking_rmsle_valid)
print('Stacking - MSE      Valid:', stacking_mse_valid)
print('Stacking - R2       Valid:', stacking_r2_valid)

In [56]:
def _actual_vs_predicted(predictions, prediction_column):
    """Pair each test target with its prediction in a two-column DataFrame.

    The first column is always named 'Test'; the second takes the name given
    in ``prediction_column``.
    """
    rows = list(zip(y_test, predictions))
    return pd.DataFrame(rows, columns=['Test', prediction_column])


# One comparison frame per model.
data_prediction_knn = _actual_vs_predicted(y_pred_knn, 'Prediction-Knn')
data_prediction_rf = _actual_vs_predicted(y_pred_rf, 'Prediction-RF')
data_prediction_stacking = _actual_vs_predicted(y_pred_stacking, 'Prediction-Stacking')


In [57]:
# Index each comparison frame by its 'Test' column, then place the three
# prediction columns side by side.
frames_by_test = [
    frame.set_index('Test')
    for frame in (data_prediction_knn, data_prediction_rf, data_prediction_stacking)
]
dfs = pd.concat(frames_by_test, axis=1, join='outer')

dfs.head(20)