In [13]:
!pip3 install --upgrade nbformat


Collecting nbformat
  Downloading nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat)
  Downloading fastjsonschema-2.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting jsonschema>=2.6 (from nbformat)
  Downloading jsonschema-4.22.0-py3-none-any.whl.metadata (8.2 kB)
Collecting jsonschema-specifications>=2023.03.6 (from jsonschema>=2.6->nbformat)
  Downloading jsonschema_specifications-2023.12.1-py3-none-any.whl.metadata (3.0 kB)
Collecting referencing>=0.28.4 (from jsonschema>=2.6->nbformat)
  Downloading referencing-0.35.1-py3-none-any.whl.metadata (2.8 kB)
Collecting rpds-py>=0.7.1 (from jsonschema>=2.6->nbformat)
  Downloading rpds_py-0.18.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.1 kB)
Downloading nbformat-5.10.4-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading fastjsonschema-2.19.1-py3-none-any.whl (2

In [2]:
# Import all the necessary libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.io as pio

# <center><b>Analysis of Renewable Energy Consumption Data</b></center>

---
# **Table of Contents**
---

1. [**Introduction**](#Section1)<br>
2. [**Problem Statement**](#Section2)<br>
3. [**Installing & Importing Libraries**](#Section3)<br>
  3.1 [**Installing Libraries**](#Section31)<br>
  3.2 [**Upgrading Libraries**](#Section32)<br>
  3.3 [**Importing Libraries**](#Section33)<br>
4. [**Data Acquisition & Description**](#Section4)<br>
5. [**Data Pre-Processing**](#Section5)<br>
6. [**Data Pre-Profiling**](#Section6)<br>
7. [**Data Post-Profiling**](#Section7)<br>
8. [**Exploratory Data Analysis**](#Section8)<br>
9. [**Summarization**](#Section9)<br>
  9.1 [**Conclusion**](#Section91)<br>
  9.2 [**Actionable Insights**](#Section92)<br>

---
<a name = Section1></a>
# **1. Introduction**
---

- Energy production, chiefly the burning of fossil fuels, accounts for roughly three-quarters of global greenhouse gas emissions. Not only is energy production the largest driver of climate change, but the burning of fossil fuels and biomass also comes at a large cost to human health: at least five million deaths are attributed to air pollution each year.

In [3]:
# Load the global energy-substitution table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../data/global-energy-substitution.csv')
df.head(n=5)

Unnamed: 0,Entity,Code,Year,"Other renewables (TWh, substituted energy)","Biofuels (TWh, substituted energy)","Solar (TWh, substituted energy)","Wind (TWh, substituted energy)","Hydropower (TWh, substituted energy)","Nuclear (TWh, substituted energy)","Gas (TWh, substituted energy)","Oil (TWh, substituted energy)","Coal (TWh, substituted energy)","Traditional biomass (TWh, substituted energy)"
0,World,OWID_WRL,1800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97.0,5556
1,World,OWID_WRL,1810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.0,5833
2,World,OWID_WRL,1820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,153.0,6111
3,World,OWID_WRL,1830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,264.0,6389
4,World,OWID_WRL,1840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,356.0,6944


In [8]:
# TODO: decide whether to host the data on a cloud-based service; ideally the raw data stays hidden AND is shared only through a view-only link

# Dimensions of the energy-substitution frame as (rows, columns)
df.shape

(75, 13)

In [3]:
# The first plot looks like an area chart, which needs one row per (year, energy source).
# Reshape the wide frame into that long format: 'Year' is kept as the identifier, each
# listed source column becomes a value of 'Energy Source', and its figure goes to 'Consumption (TWh)'.
melted_data = pd.melt(
    df,
    id_vars=['Year'],
    value_vars=[
        'Other renewables (TWh, substituted energy)',
        'Biofuels (TWh, substituted energy)',
        'Solar (TWh, substituted energy)',
        'Wind (TWh, substituted energy)',
        'Hydropower (TWh, substituted energy)',
        'Nuclear (TWh, substituted energy)',
        'Gas (TWh, substituted energy)',
        'Oil (TWh, substituted energy)',
        'Coal (TWh, substituted energy)',
        'Traditional biomass (TWh, substituted energy)',
    ],
    var_name='Energy Source',
    value_name='Consumption (TWh)',
)
melted_data.head(n=5)

Unnamed: 0,Year,Energy Source,Consumption (TWh)
0,1800,"Other renewables (TWh, substituted energy)",0.0
1,1810,"Other renewables (TWh, substituted energy)",0.0
2,1820,"Other renewables (TWh, substituted energy)",0.0
3,1830,"Other renewables (TWh, substituted energy)",0.0
4,1840,"Other renewables (TWh, substituted energy)",0.0


In [4]:
# Dimensions of the long-format frame as (rows, columns)
melted_data.shape

(750, 3)

In [7]:
# Build the area chart (one coloured band per energy source) and, in the same
# expression, apply the title, axis labels and legend heading.
# update_layout returns the figure it was called on, so `fig` is the styled figure.
fig = px.area(
    data_frame=melted_data,
    x='Year',
    y='Consumption (TWh)',
    color='Energy Source',
).update_layout(
    title='Global Primary Energy Consumption by Source',
    xaxis_title='Year',
    yaxis_title='Consumption (TWh)',
    legend_title='Energy Source',
)

# Render the figure in the notebook
fig.show()


# Create the line plot using Plotly Express


In [4]:
# Load the hydro share-of-energy table from disk and preview its first five rows
hydro_data = pd.read_csv(filepath_or_buffer='../data/hydro-share-energy.csv')
hydro_data.head(n=5)

Unnamed: 0,Entity,Code,Year,Hydro (% equivalent primary energy)
0,Africa,,1965,5.740281
1,Africa,,1966,6.113969
2,Africa,,1967,6.31658
3,Africa,,1968,6.994845
4,Africa,,1969,7.943916


In [8]:
# Dimensions of the hydro frame as (rows, columns)
hydro_data.shape

(4787, 4)

In [10]:
# Collect the 'Year' value of every hydro row as a plain Python list
# (one entry per row, so years repeat across entities).
hydro_yrs = hydro_data['Year'].tolist()

In [14]:
# Display the current contents of hydro_yrs.
# NOTE(review): this echoes the whole list; a slice such as hydro_yrs[:10] would keep the output short.
hydro_yrs

[1965,
 1966,
 1967,
 1968,
 1969,
 1970,
 1971,
 1972,
 1973,
 1974,
 1975,
 1976,
 1977,
 1978,
 1979,
 1980,
 1981,
 1982,
 1983,
 1984,
 1985,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022,
 1965,
 1966,
 1967,
 1968,
 1969,
 1970,
 1971,
 1972,
 1973,
 1974,
 1975,
 1976,
 1977,
 1978,
 1979,
 1980,
 1981,
 1982,
 1983,
 1984,
 1985,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022,
 1965,
 1966,
 1967,
 1968,
 1969,
 1970,
 1971,
 1972,
 1973,
 1974,
 1975,
 1976,
 1977,
 1978,
 1979,
 1980,
 1981,
 1982,
 1983,
 1984,
 1985,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,

In [6]:
# Load the nuclear primary-energy table from disk and preview its first five rows
nuclear_data = pd.read_csv(filepath_or_buffer='../data/nuclear-primary-energy.csv')
nuclear_data.head(n=5)

Unnamed: 0,Entity,Code,Year,Nuclear (% equivalent primary energy)
0,Africa,,1965,0.0
1,Africa,,1966,0.0
2,Africa,,1967,0.0
3,Africa,,1968,0.0
4,Africa,,1969,0.0


In [7]:
# Dimensions of the nuclear frame as (rows, columns)
nuclear_data.shape


(4651, 4)

In [12]:
# Collect the 'Year' value of every nuclear row as a plain Python list
nuclear_yrs = nuclear_data.loc[:, 'Year'].to_list()

In [13]:
# Print every year found in the hydro data that never appears in the nuclear data.
# Both lists hold one entry per row, so membership is checked against a set:
# each lookup is constant-time instead of a scan over the whole nuclear list.
# The printed output (including repeats) is the same as with the list lookup.
nuclear_yr_lookup = set(nuclear_yrs)
for year in hydro_yrs:
    if year not in nuclear_yr_lookup:
        print(year)

In [16]:
# Not every country has data for every year, so the comparison has to be narrowed
# to what both datasets share. As a first step, pull the 'Entity' value of every row
# out of each frame as a plain list (hydro first, then nuclear).
hydro_entity_vals, nuclear_entity_vals = (
    frame['Entity'].tolist() for frame in (hydro_data, nuclear_data)
)

In [18]:
# Display the entity label of every hydro row.
# NOTE(review): this echoes the whole list with repeats; consider showing only the distinct labels.
hydro_entity_vals

['Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (EI)',
 'Africa (

In [17]:
# Print every hydro entity label that never appears in the nuclear data.
# Both lists hold one label per row, so membership is checked against a set:
# each lookup is constant-time instead of a scan over the whole nuclear list.
# The printed output (including repeats) is the same as with the list lookup.
nuclear_entity_lookup = set(nuclear_entity_vals)
for entity in hydro_entity_vals:
    if entity not in nuclear_entity_lookup:
        print(entity)

In [21]:
# Inspect the data entity by entity to find which entity lacks a particular year's
# nuclear record: collect every (entity, year) pair present in the hydro data but
# absent from the nuclear data for that same entity.
missing_nuclear_data = []
# hydro_entity_vals holds one label per row, so de-duplicate it first (dict.fromkeys keeps
# first-seen order). Iterating the raw list would re-filter both frames once per row and
# append the same (entity, year) pair many times over.
for entity in dict.fromkeys(hydro_entity_vals):
    df_hydro_subset = hydro_data[hydro_data['Entity'] == entity]
    df_nuclear_subset = nuclear_data[nuclear_data['Entity'] == entity]
    # Per-entity names are used on purpose: rebinding hydro_yrs / nuclear_yrs here would
    # silently replace the full-year lists that other cells rely on.
    entity_hydro_yrs = df_hydro_subset['Year'].tolist()
    entity_nuclear_yrs = set(df_nuclear_subset['Year'].tolist())
    missing_nuclear_data.extend(
        (entity, year) for year in entity_hydro_yrs if year not in entity_nuclear_yrs
    )

In [25]:
# Reduce the (entity, year) pairs to the distinct entity labels they mention
country_miss = {country for country, _year in missing_nuclear_data}

In [26]:
# Display the distinct entity labels collected in country_miss
country_miss

{'Central America (EI)',
 'Eastern Africa (EI)',
 'Middle Africa (EI)',
 'Middle East (EI)',
 'Western Africa (EI)'}