<a href="https://colab.research.google.com/github/richarddushime/EDA-and-Prediction-on-Global-Data-and-Sustainable-energy-/blob/main/EDA_and_Prediction_on_GDS_Energy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the Kaggle dataset this notebook works with (run this cell first).
# Once the data source has been imported the cell may be deleted.
# This environment is not Kaggle's own Python environment, so some
# libraries used by the notebook may be missing here.
import kagglehub

anshtanwar_global_data_on_sustainable_energy_path = kagglehub.dataset_download(
    'anshtanwar/global-data-on-sustainable-energy'
)

print('Data source import complete.')


# Global Data on Sustainable Energy From 2000 to 2020
This notebook presents a deep exploratory data analysis of global energy consumption over time, together with predictive models. It also includes a neural network model that predicts access to electricity over time.

The dataset is publicly available on Kaggle.

My Focus:
1. Deep Exploratory Data Analysis
2. Energy consumption prediction using a regression model
3. Energy Access Classification: Build a Neural Network Model to predict the Access to electricity over time

Enjoy, and I hope it helps!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")


In [None]:
# Load the dataset CSV into `df`.
# The download cell stores the dataset directory returned by kagglehub; prefer
# that location and fall back to the Kaggle-hosted input path, so the notebook
# loads the data both on Kaggle and in other environments.
from pathlib import Path

DATA_FILENAME = 'global-data-on-sustainable-energy (1).csv'
_dataset_dirs = [
    # Absent (None) when the download cell was deleted or not run.
    globals().get('anshtanwar_global_data_on_sustainable_energy_path'),
    '/kaggle/input/global-data-on-sustainable-energy',
]
_csv_candidates = [Path(d) / DATA_FILENAME for d in _dataset_dirs if d]

# Use the first candidate that exists; if none does, keep the original Kaggle
# path so pandas raises its usual FileNotFoundError.
csv_path = next((p for p in _csv_candidates if p.is_file()), _csv_candidates[-1])
df = pd.read_csv(csv_path)

In [None]:
# Show the first and the last record of the dataset, separated by a banner.
banner = "===================================="

print("Dataset Preview:")
print(df.iloc[:1])

# Banner, blank line, banner (one item per line)
print(banner, "", banner, sep="\n")

print(df.iloc[-1:])

In [None]:
# Overview of the dataset: column names, non-null counts and dtypes.
print("Dataset Information:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

In [None]:
# Count the missing entries in every column
null_counts = df.isna().sum()

print("Missing Values:")
print(null_counts)

In [None]:
# Descriptive statistics as computed by DataFrame.describe()
summary_stats = df.describe()

print("Summary Statistics:")
print(summary_stats)

In [None]:
# Flag rows that exactly repeat an earlier row, then count them
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()

print(f"Number of duplicate rows: {duplicates}")

In [None]:
# # Drop duplicates if any
# if duplicates > 0:
#     df = df.drop_duplicates()
#     print("Duplicates removed.")

In [None]:
# Share of missing values per column, in percent (report only; nothing is
# dropped or imputed here)
row_count = len(df)
missing_percentage = df.isnull().sum().div(row_count).mul(100)

print("Missing Value Percentage by Column:")
print(missing_percentage)

In [None]:
# Drop columns with >60% missing values
# threshold = 60
# cols_to_drop = missing_percentage[missing_percentage > threshold].index
# df = df.drop(columns=cols_to_drop)

# print(f"Columns dropped: {list(cols_to_drop)}")

In [None]:
# Univariate analysis: one histogram with a KDE overlay per numeric column
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

for col in numerical_columns:
    plt.figure(figsize=(8, 4))
    # histplot draws on the figure just created and hands back its Axes
    hist_ax = sns.histplot(df[col], kde=True, bins=30)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frequency")
    plt.show()

In [None]:
# Correlation heatmap over the numeric columns
numerical_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numerical_df.corr()

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Trend over time for selected indicators.
# A column that is not present in the dataframe is reported instead of being
# skipped silently, so a misspelt or renamed column name does not go unnoticed.
temporal_columns = [
    'Access to electricity (% of population)',
    'Renewable energy share in total final energy consumption (%)',
    'Value_co2_emissions (metric tons per capita)',
]

if 'Year' not in df.columns:
    print("Skipping trend plots: column 'Year' not found in the dataset.")
else:
    for col in temporal_columns:
        if col not in df.columns:
            print(f"Skipping trend plot: column '{col}' not found in the dataset.")
            continue
        plt.figure(figsize=(10, 5))
        # One line per indicator; rows sharing a Year are aggregated by seaborn.
        sns.lineplot(data=df, x='Year', y=col)
        plt.title(f"Trend Over Time: {col}")
        plt.xlabel("Year")
        plt.ylabel(col)
        plt.show()


In [None]:
# Yearly trend of primary energy consumption per capita (kWh/person),
# drawn only when both required columns exist.
energy_col = 'Primary energy consumption per capita (kWh/person)'

if {'Year', energy_col}.issubset(df.columns):
    plt.figure(figsize=(12, 6))
    energy_ax = sns.lineplot(data=df, x='Year', y=energy_col, errorbar=None)
    energy_ax.set_title("Trend: Primary Energy Consumption per Capita (2000-2020)")
    energy_ax.set_xlabel("Year")
    energy_ax.set_ylabel("Energy Consumption (kWh/person)")
    energy_ax.grid(True)
    plt.show()


In [None]:
# Yearly trend of financial flows to developing countries,
# drawn only when both required columns exist.
flows_col = 'Financial flows to developing countries (US $)'

if {'Year', flows_col}.issubset(df.columns):
    plt.figure(figsize=(12, 6))
    flows_ax = sns.lineplot(data=df, x='Year', y=flows_col)
    flows_ax.set_title("Trend: Financial Flows to Developing Countries for Clean Energy Projects")
    flows_ax.set_xlabel("Year")
    flows_ax.set_ylabel("Financial Flows (US $)")
    flows_ax.grid(True)
    plt.show()


In [None]:
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster

# Interactive map: one clustered circle marker per row that has coordinates
# and an access-to-electricity value.
access_col = 'Access to electricity (% of population)'
has_coordinates = 'Latitude' in df.columns and 'Longitude' in df.columns

if has_coordinates:
    map_data = df.dropna(subset=['Latitude', 'Longitude', access_col])

    m = folium.Map(location=[0, 0], zoom_start=2)
    marker_cluster = MarkerCluster().add_to(m)

    # Appearance shared by every marker
    marker_style = dict(radius=5, color='blue', fill=True, fill_opacity=0.6)

    for _, row in map_data.iterrows():
        marker = folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            popup=f"{row['Entity']}: {row[access_col]}%",
            **marker_style,
        )
        marker.add_to(marker_cluster)

    # Render the map inline in the notebook
    display(m)


In [None]:
# Keep only fully populated rows (dropna defaults: row-wise, drop on any NaN)
df_cleaned = df.dropna()

# Compare the shapes before and after cleaning
shape_before, shape_after = df.shape, df_cleaned.shape
print(f"Original DataFrame shape: {shape_before}")
print(f"Cleaned DataFrame shape: {shape_after}")


**Regression Model (Energy Consumption Prediction)**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Target column used for the classification task (Access to Electricity)
target_AE = 'Access to electricity (% of population)'

# Target column for the regression task (Energy Consumption Prediction)
target_ECP = "Primary energy consumption per capita (kWh/person)"

# Input columns shared by the models; the first entry is the same column
# as target_AE.
features = [
    target_AE,
    'Renewables (% equivalent primary energy)',
    'gdp_per_capita',
    'Value_co2_emissions_kt_by_country',
    'Year',
]


In [None]:
# Build one frame per task, keeping only rows where every input feature and
# that task's target are present.
regression_columns = features + [target_ECP]
classification_columns = features + [target_AE]

Energy_Consumption = df.dropna(subset=regression_columns)
Access_to_electricity = df.dropna(subset=classification_columns)

# print(f"Energy_Consumption DataFrame shape: {Energy_Consumption.shape}")
# print(f"Access_to_electricity DataFrame shape: {Access_to_electricity.shape}")

In [None]:
# Shape (rows, columns) of the full dataframe, for comparison with the
# task-specific frames built from it.
print(df.shape)

In [None]:
# Bin the access-to-electricity percentage into three ordered categories:
#   [0, 50] -> 'Low', (50, 80] -> 'Medium', (80, 100] -> 'High'
# include_lowest=True keeps a value of exactly 0 in 'Low'; without it the
# first bin is (0, 50] and 0 would become NaN.
# assign() returns a new frame, so the column is not written into an object
# derived from `df` (which would trigger pandas' chained-assignment warning).
Access_to_electricity = Access_to_electricity.assign(
    Access_Category=pd.cut(
        Access_to_electricity[target_AE],
        bins=[0, 50, 80, 100],
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )
)

In [None]:
# Regression inputs and target
X_reg, y_reg = Energy_Consumption[features], Energy_Consumption[target_ECP]

# 80/20 train/test split with a fixed seed
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Min-max scaling: the scaler is fitted on the training split only and then
# applied to both splits.
scaler = MinMaxScaler().fit(X_train_reg)
X_train_reg_scaled = scaler.transform(X_train_reg)
X_test_reg_scaled = scaler.transform(X_test_reg)

A **random forest** is an ensemble learning method that combines the predictions of multiple decision trees to produce a more accurate and stable prediction. It is a supervised learning algorithm that can be used for both classification and regression tasks; in this case, it is used for regression.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Random forest settings: 100 trees, fixed seed
rf_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_settings)

# Fit on the scaled training features
rf_model.fit(X_train_reg_scaled, y_train_reg)


In [None]:

# Predict on the held-out split
y_pred_reg = rf_model.predict(X_test_reg_scaled)

# Error metrics on the test set
mae = mean_absolute_error(y_test_reg, y_pred_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

# Print the report, one line per entry
report_lines = [
    "Random Forest Regressor Performance:",
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2) Score: {r2}",
]
print("\n".join(report_lines))


In [None]:
# Feature importances of the fitted random forest (energy consumption model)
importances = rf_model.feature_importances_

# Pair each feature with its importance, most important first
importances_df = pd.DataFrame(
    {'Feature': features, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
importance_ax = sns.barplot(x='Importance', y='Feature', data=importances_df)
importance_ax.set_title("Feature Importances for Energy Consumption Prediction")
plt.show()


In [None]:
# df.head()

In [None]:
# Indicators whose yearly averages are plotted
features_to_plot = [
    'Access to electricity (% of population)',
    'Access to clean fuels for cooking',
    'Renewables (% equivalent primary energy)',
    'Value_co2_emissions_kt_by_country',
    'Primary energy consumption per capita (kWh/person)',
    'gdp_per_capita',
]

# Mean of each indicator per year, with Year kept as a regular column
yearly_groups = df.groupby('Year', as_index=False)
data_by_year = yearly_groups[features_to_plot].mean()

In [None]:
# Set style for plots
sns.set(style="whitegrid")

# Plot each feature's yearly mean on a two-column grid. The number of rows is
# derived from the feature count, so the grid still fits when features are
# added or removed (three rows give the original 20x20 figure).
n_cols = 2
n_rows = max(1, -(-len(features_to_plot) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 20 * n_rows / 3))
axes = axes.flatten()

for ax, feature in zip(axes, features_to_plot):
    sns.lineplot(data=data_by_year, x='Year', y=feature, ax=ax)
    ax.set_title(f"Trend of {feature} Over Years", fontsize=14)
    ax.set_ylabel(feature, fontsize=12)
    ax.set_xlabel("Year", fontsize=12)

# Remove the grid cells that received no plot: these are the trailing axes
# beyond the last feature.
for unused_ax in axes[len(features_to_plot):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


**Access to Electricity**

In [None]:
# Split Access to Electricity: classifier inputs/labels and an 80/20
# train/test split with a fixed seed.
# NOTE(review): `features` contains target_AE
# ('Access to electricity (% of population)'), the very column that
# Access_Category was binned from, so the classifier receives its own target
# as an input (label leakage). Consider excluding target_AE from the
# classifier inputs; the feature_names passed to the SHAP summary plot would
# then need the same reduced list.
X_clf = Access_to_electricity[features]
y_clf = Access_to_electricity['Access_Category']
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
# Standardise the input features: the scaler is fitted on the training split
# only and then applied to both splits.
scaler = StandardScaler().fit(X_train_clf)
X_train_clf = scaler.transform(X_train_clf)
X_test_clf = scaler.transform(X_test_clf)

# Turn the category labels into integer class ids; the encoder learns the
# classes from the training labels.
label_encoder = LabelEncoder()
y_train_clf = label_encoder.fit_transform(y_train_clf)
y_test_clf = label_encoder.transform(y_test_clf)

In [None]:
# Classification network: three hidden Dense layers (64 -> 32 -> 16, ReLU)
# with Dropout after the first two for regularization, and a 3-way softmax
# output (one unit per access category).
clf_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_clf.shape[1],)),
    Dropout(0.3),  # Dropout layer to reduce overfitting
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(3, activation='softmax')  # Output layer for 3 categories
])

# Compile with Adam; labels are integer class ids, hence the sparse
# categorical cross-entropy loss.
optimizer = Adam(learning_rate=0.001)
clf_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Learning rate scheduler: halve the rate when val_loss has not improved for
# 5 epochs. min_lr is the floor for the reduced rate and must lie BELOW the
# initial rate (0.001); the previous floor of 0.01 was above it, so the
# callback could never reduce anything.
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
    min_lr=1e-5
)


In [None]:
# Fit the classifier; 20% of the training data is held out for validation
fit_options = dict(
    epochs=20,
    batch_size=16,
    validation_split=0.2,
    verbose=1,
    callbacks=[lr_scheduler],
)
history_clf = clf_model.fit(X_train_clf, y_train_clf, **fit_options)

# Loss and accuracy on the test split
loss_clf, accuracy_clf = clf_model.evaluate(X_test_clf, y_test_clf, verbose=1)
print(f"Test Accuracy: {accuracy_clf * 100:.2f}%")

# Class probabilities -> most likely class id -> original category label
y_pred_clf = clf_model.predict(X_test_clf)
predicted_class_ids = np.argmax(y_pred_clf, axis=1)
y_pred_labels = label_encoder.inverse_transform(predicted_class_ids)

# Compare the first ten predictions with the true labels
true_labels_head = label_encoder.inverse_transform(y_test_clf[:10])
print("Predicted Labels:", y_pred_labels[:10])
print("True Labels:", true_labels_head)


In [None]:
# Training vs. validation curves: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))

panels = [('accuracy', 'Accuracy'), ('loss', 'Loss')]
for position, (metric_key, metric_name) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history_clf.history[metric_key], label=f'Train {metric_name}')
    plt.plot(history_clf.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
    plt.title(f'{metric_name} over Epochs')

plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Generate confusion matrix.
# The label set is passed explicitly (LabelEncoder assigns ids 0..n_classes-1),
# so the matrix always has one row/column per known class. Without it, a class
# absent from both the test labels and the predictions would shrink the matrix
# and misalign it with the tick labels below.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test_clf, np.argmax(y_pred_clf, axis=1), labels=class_ids)

# Plot confusion matrix (rows: true class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
import shap

# SHAP explanation of the classifier with the model-agnostic KernelExplainer.
# KernelExplainer evaluates the model against every background row for each
# explained sample, so the background is limited to at most 100 rows drawn
# from the (scaled) training data rather than the whole test set, which is
# also the data being explained.
background = shap.sample(X_train_clf, 100, random_state=42)


def _predict_proba(batch):
    """Return class probabilities for `batch` without Keras progress output."""
    return clf_model.predict(batch, verbose=0)


explainer = shap.KernelExplainer(_predict_proba, background)
shap_values = explainer.shap_values(X_test_clf[:10])  # Explain for a subset

# Summary plot
shap.summary_plot(shap_values, X_test_clf[:10], feature_names=features)


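Access to electricity (% of population) is the most influential feature for the model's decisions. This is expected: the `Access_Category` labels were created by binning that same column, and the column is also one of the model's input features, so the model can read the answer almost directly from its input. The reported accuracy should be interpreted with that in mind.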

# Final Report: Global Data on Sustainable Energy (2000-2020)

## Overview
This report provides an in-depth analysis of sustainable energy indicators from 2000 to 2020. Key metrics such as electricity access, renewable energy usage, carbon emissions, and energy intensity are explored to track progress towards Sustainable Development Goal 7.

## Key Observations
1. **Access to Electricity**: Significant progress has been made globally, especially in developing countries.
2. **Clean Cooking Fuels**: Access to clean cooking fuels has improved steadily but remains a challenge in low-income regions.
3. **Renewable Energy**: Adoption of renewable energy has increased, reflecting global efforts to reduce reliance on fossil fuels.
4. **Carbon Emissions**: While carbon emissions have fluctuated, some reduction is observed in later years due to renewable energy growth.
5. **Energy Consumption**: Energy consumption per capita shows an upward trend, aligning with economic and population growth.
