In [120]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [33]:
# Load the raw data; na_values=0 makes pandas read zero entries as missing (NaN).
renewable_data = pd.read_csv(
    'Renewable Energy Data.csv',
    na_values=0,
)

In [34]:
# Preview the first five rows of the raw frame.
renewable_data.head(n=5)

Unnamed: 0,Year,Month,Sector,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel
0,1973,1,Commerical,,,,,0.57,,,,0.57,0.57,,,,
1,1973,1,Electric Power,,0.49,,,0.054,0.157,,,0.211,89.223,,,88.522,
2,1973,1,Industrial,1.04,,,,98.933,,,,98.933,99.973,,,,
3,1973,1,Residential,,,,,30.074,,,,,30.074,,,,
4,1973,1,Transportation,,,,,,,,,,,,,,


In [35]:
# Dimensions of the raw frame as (rows, columns).
renewable_data.shape

(3065, 17)

In [36]:
# Data type of each column as inferred when the CSV was read.
renewable_data.dtypes

Year                                    int64
Month                                   int64
Sector                                 object
Hydroelectric Power                   float64
Geothermal Energy                     float64
Solar Energy                          float64
Wind Energy                           float64
Wood Energy                           float64
Waste Energy                          float64
Fuel Ethanol, Excluding Denaturant    float64
Biomass Losses and Co-products        float64
Biomass Energy                        float64
Total Renewable Energy                float64
Renewable Diesel Fuel                 float64
Other Biofuels                        float64
Conventional Hydroelectric Power      float64
Biodiesel                             float64
dtype: object

In [37]:
# Show any fully duplicated rows (an empty frame means there are none).
duplicate_mask = renewable_data.duplicated()
renewable_data[duplicate_mask]

Unnamed: 0,Year,Month,Sector,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel


In [38]:
# Count of missing (NaN) entries in each column.
renewable_data.isna().sum(axis=0)

Year                                     0
Month                                    0
Sector                                   0
Hydroelectric Power                   2040
Geothermal Energy                     1189
Solar Energy                          1330
Wind Energy                           2274
Wood Energy                            613
Waste Energy                          1514
Fuel Ethanol, Excluding Denaturant    1514
Biomass Losses and Co-products        2548
Biomass Energy                         709
Total Renewable Energy                 613
Renewable Diesel Fuel                 2908
Other Biofuels                        2944
Conventional Hydroelectric Power      2452
Biodiesel                             2788
dtype: int64

In [39]:
# Total number of missing values: per-column counts summed over all columns.
missing_per_column = renewable_data.isna().sum()
missing_per_column.sum()

25436

In [41]:
# Percentage of all cells (rows x columns) that are missing, to one decimal place.
total_missing = renewable_data.isna().sum().sum()
round(total_missing / renewable_data.size * 100, 1)

48.8

In [42]:
# Descriptive analysis: count, mean, std, min, quartiles and max of each numeric
# column. The count row shows how many non-missing values each statistic is based on.
renewable_data.describe()

Unnamed: 0,Year,Month,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel
count,3065.0,3065.0,1025.0,1876.0,1735.0,791.0,2452.0,1551.0,1551.0,517.0,2356.0,2452.0,157.0,121.0,613.0,277.0
mean,1998.042414,6.491028,0.50762,1.872933,3.559654,16.59364,45.805511,11.501406,13.786864,28.662234,60.214981,88.590261,8.374064,0.804306,78.786869,10.552895
std,14.747378,3.456934,0.496446,1.602754,7.308859,32.704216,48.270194,8.311028,29.243134,27.588265,67.307908,69.040793,8.656671,1.036827,13.990247,8.638728
min,1973.0,1.0,-0.002,0.138,0.001,0.001,0.002,0.047,0.004,0.45,0.119,0.515,0.126,0.002,48.794,0.064
25%,1985.0,3.0,0.036,0.3925,0.0195,0.0265,6.08525,3.154,0.077,4.739,7.91725,26.60925,2.972,0.198,68.186,1.789
50%,1998.0,6.0,0.38,1.2425,0.327,0.382,24.681,12.74,1.191,10.692,30.3525,76.463,5.221,0.315,77.829,10.464
75%,2011.0,9.0,0.903,3.363,4.65,10.235,84.5945,18.437,5.8425,62.037,119.767,144.774,9.672,0.838,88.259,18.794
max,2024.0,12.0,2.047,5.951,64.04,157.409,183.628,32.875,104.42,75.373,233.2,308.175,38.344,4.101,117.453,27.871


In [None]:
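# Aside from the aggregate Total Renewable Energy column (mean ~88.6), Conventional
# Hydroelectric Power has the highest mean consumption (~78.8), followed by
# Biomass Energy (~60.2) and Wood Energy (~45.8). Means are computed over
# non-missing values only, so each is based on a different number of rows.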

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [99]:
# Predictive imputation using KNN: each missing value is filled in from the
# 3 most similar rows.
from sklearn.impute import KNNImputer

# Columns from position 3 onward are the numeric energy features; the first
# three columns are Year, Month and Sector.
energy_features = renewable_data.iloc[:, 3:]

imputer = KNNImputer(n_neighbors=3)
data_filled = imputer.fit_transform(energy_features)
# fit_transform returns a bare array, so restore the original column labels and
# row index here. Later cells can then select columns by name, and the
# column-wise concat with Year/Month aligns row for row. If the imputer ever
# dropped a column, the label count would no longer match and this would raise
# instead of silently mislabelling columns.
data_filled = pd.DataFrame(
    data_filled,
    columns=energy_features.columns,
    index=energy_features.index,
)
first_2_columns = renewable_data.iloc[:, :2]


In [104]:
# Give the imputed frame the same column labels as the energy features it was
# built from.
last_columns = renewable_data.iloc[:, 3:]
# Take the names from last_columns; the previously referenced `split_data` is
# not defined anywhere in this notebook and raised NameError on a fresh kernel.
Column_names = list(last_columns.columns)
# Map positional labels (0, 1, 2, ...) to the real names. rename() ignores keys
# that are not present, so re-running this cell, or running it on a frame that
# is already labelled, is a harmless no-op.
data_filled = data_filled.rename(columns=dict(enumerate(Column_names)))
data_filled

Unnamed: 0,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel
0,0.021333,0.506667,0.010333,0.010667,0.570000,1.886667,0.571000,4.690333,0.570000,0.570000,2.905000,0.251333,53.098333,0.088667
1,0.084667,0.490000,0.001667,0.003000,0.054000,0.157000,0.540667,4.690333,0.211000,89.223000,2.905000,0.251333,88.522000,0.088667
2,1.040000,0.205667,0.002000,0.035667,98.933000,14.965333,88.035333,5.867667,98.933000,99.973000,0.842667,0.363667,68.390667,10.049333
3,0.084667,0.699667,3.296333,0.027333,30.074000,6.401000,2.770333,4.690333,16.291667,30.074000,8.374064,0.804306,53.098333,10.552895
4,0.507620,1.872933,3.559654,16.593640,45.805511,11.501406,13.786864,28.662234,60.214981,88.590261,8.374064,0.804306,78.786869,10.552895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,0.073000,1.669000,4.267000,0.036000,7.053000,6.233000,2.441000,4.690333,15.728000,21.773000,3.206000,0.453667,52.922667,0.168667
3061,0.593000,4.667000,32.707000,119.265000,15.071000,13.873000,27.584333,45.214667,28.944000,257.661000,2.905000,0.251333,72.078000,0.954333
3062,0.308000,0.356000,0.987000,0.035000,104.878000,14.171000,1.533000,67.742000,188.325000,190.011000,34.380333,3.840333,73.006000,21.080667
3063,0.084667,3.354000,14.897000,0.018667,34.065000,0.689000,2.732000,4.690333,1.515667,52.316000,8.374064,0.804306,56.383000,10.552895


In [105]:
# Put Year and Month back alongside the imputed energy columns (side by side,
# aligned on the row index).
renewable_data_imputated = pd.concat(
    [first_2_columns, data_filled],
    axis='columns',
)

In [106]:
# Preview the first five rows of the imputed frame.
renewable_data_imputated.head(n=5)

Unnamed: 0,Year,Month,Hydroelectric Power,Geothermal Energy,Solar Energy,Wind Energy,Wood Energy,Waste Energy,"Fuel Ethanol, Excluding Denaturant",Biomass Losses and Co-products,Biomass Energy,Total Renewable Energy,Renewable Diesel Fuel,Other Biofuels,Conventional Hydroelectric Power,Biodiesel
0,1973,1,0.021333,0.506667,0.010333,0.010667,0.57,1.886667,0.571,4.690333,0.57,0.57,2.905,0.251333,53.098333,0.088667
1,1973,1,0.084667,0.49,0.001667,0.003,0.054,0.157,0.540667,4.690333,0.211,89.223,2.905,0.251333,88.522,0.088667
2,1973,1,1.04,0.205667,0.002,0.035667,98.933,14.965333,88.035333,5.867667,98.933,99.973,0.842667,0.363667,68.390667,10.049333
3,1973,1,0.084667,0.699667,3.296333,0.027333,30.074,6.401,2.770333,4.690333,16.291667,30.074,8.374064,0.804306,53.098333,10.552895
4,1973,1,0.50762,1.872933,3.559654,16.59364,45.805511,11.501406,13.786864,28.662234,60.214981,88.590261,8.374064,0.804306,78.786869,10.552895


In [47]:
from ydata_profiling import ProfileReport

In [108]:
# Build a profiling report of the imputed data and render it as notebook widgets.
profile = ProfileReport(
    renewable_data_imputated,
    title='Renewable Energy Consumption',
)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [114]:
# Solar Energy vs Wind Energy: simple linear regression predicting wind energy
# from solar energy on the imputed data.
X = renewable_data_imputated.loc[:, ['Solar Energy']]  # 2-D feature frame
y = renewable_data_imputated.loc[:, 'Wind Energy']     # 1-D target series

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R-squared:', r2)

Mean Squared Error: 185.82125573301863
R-squared: 0.5143116195215383


In [None]:
# Solar Energy and Wind Energy appear to be positively correlated: as more solar
# energy is consumed, more wind energy is consumed. Because many values were
# imputed, some data points do not follow the trend seen in the Solar Energy
# and Wind Energy graphs. The underlying correlation may therefore be stronger
# than the R-squared value (~0.51) implies, but this should be confirmed by
# refitting on the non-imputed rows only.