In [60]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
#import scipy.stats as stats
import scipy.stats as stats
from scipy.stats import shapiro, kstest

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames 


# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [61]:
from src import support_EDA as sp 

## Global Renewable Energy Production (2000-2023)
[Source](https://www.kaggle.com/datasets/ahmedgaitani/global-renewable-energy?resource=download)

Dataset Units: Gwh

Columns:
- Year: The year of data collection (e.g., 2000, 2001, etc.).
- Country: The name of the country.
- SolarEnergy: Annual solar energy production in gigawatt-hours (GWh).
- WindEnergy: Annual wind energy production in gigawatt-hours (GWh).
- HydroEnergy: Annual hydro energy production in gigawatt-hours (GWh).
- OtherRenewableEnergy: Annual energy production from other renewable sources (e.g., geothermal, biomass) in gigawatt-hours (GWh).
- TotalRenewableEnergy: Total annual renewable energy production in gigawatt-hours (GWh).

In [3]:
df_general = pd.read_csv("data/raw/global_renewable_energy_production.csv")
df_general.head()

Unnamed: 0,Year,Country,SolarEnergy,WindEnergy,HydroEnergy,OtherRenewableEnergy,TotalRenewableEnergy
0,2000,USA,437.086107,1435.928598,1544.389701,319.396318,3736.800724
1,2001,USA,240.416776,402.792876,398.742141,439.779266,1481.731059
2,2002,USA,641.003511,1120.494351,334.99364,486.459433,2582.950935
3,2003,USA,849.198377,476.040844,609.102444,132.532029,2066.873694
4,2004,USA,373.818019,882.183361,1034.306532,181.053113,2471.361025


In [4]:
df_general.columns 

Index(['Year', 'Country', 'SolarEnergy', 'WindEnergy', 'HydroEnergy',
       'OtherRenewableEnergy', 'TotalRenewableEnergy'],
      dtype='object')

In [5]:
df_general.columns = ['Year', 'Country', 'Solar Energy', 'Wind Energy', 'Hydro Energy',
       'Other Renewable Energy', 'Total Renewable Energy']

In [6]:
df_general.columns

Index(['Year', 'Country', 'Solar Energy', 'Wind Energy', 'Hydro Energy',
       'Other Renewable Energy', 'Total Renewable Energy'],
      dtype='object')

In [7]:
df_general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    240 non-null    int64  
 1   Country                 240 non-null    object 
 2   Solar Energy            240 non-null    float64
 3   Wind Energy             240 non-null    float64
 4   Hydro Energy            240 non-null    float64
 5   Other Renewable Energy  240 non-null    float64
 6   Total Renewable Energy  240 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 13.3+ KB


In [8]:
df_general.isna().sum()/df_general.shape[0]*100

Year                      0.0
Country                   0.0
Solar Energy              0.0
Wind Energy               0.0
Hydro Energy              0.0
Other Renewable Energy    0.0
Total Renewable Energy    0.0
dtype: float64

In [9]:
df_general.duplicated().sum()

0

In [10]:
for col in df_general.columns:
    print(col)
    print(df_general[col].unique())
    print('-'*50)

Year
[2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023]
--------------------------------------------------
Country
['USA' 'China' 'India' 'Germany' 'UK' 'France' 'Brazil' 'Canada'
 'Australia' 'Japan']
--------------------------------------------------
Solar Energy
[437.08610696 240.4167764  641.00351057 849.19837672 373.81801866
 650.66760525 510.4629858  633.17311198 158.54643369 374.15239226
 209.83441136 332.90198344 592.03925141 945.54904741 179.64325185
 449.80956072 352.84105872 167.09557931 104.96990541 794.14331202
 876.79308329 379.88408954 898.49146832 784.70654376 570.45954644
 128.28626712 916.80982653 305.91834894 936.72788711 823.30486921
 826.69613965 305.14164629 106.25691748 207.8788306  566.91155957
 966.20256545 356.35644494 146.33087612 230.40538488 704.92199267
 431.00481945 181.26079305 136.6976274  560.88375247 721.84396429
 406.95971595 332.14746494 576.68552052 910.37625145 753.36011098
 6

In [11]:
df_general.to_csv("data/usable/global_renewable_energy_production.csv")


In [38]:
df_general_pivot = sp.pivot_df(df_general, ["Year", "Country"], "Energy type", "Production (GWh)")

df_general_pivot.sample(10)

Unnamed: 0,Year,Country,Energy type,Production (GWh)
605,2005,France,Hydro Energy,826.196948
42,2018,China,Solar Energy,136.697627
920,2008,Australia,Other Renewable Energy,323.021771
215,2023,Australia,Solar Energy,797.974855
1167,2015,Australia,Total Renewable Energy,2487.060428
939,2003,Japan,Other Renewable Energy,149.887473
374,2014,France,Wind Energy,521.552532
925,2013,Australia,Other Renewable Energy,229.175912
372,2012,France,Wind Energy,1324.139089
540,2012,India,Hydro Energy,1855.269463


In [39]:
df_general_pivot.to_csv("data/usable/global_renewable_energy_production_pivot.csv")

## Global primary energy consumption by source (1990-2023)
[Source](https://ourworldindata.org/grapher/global-energy-substitution?time=1990..2023)

- Dataset Units: Gwh
- Consumption

In [16]:
df_countries = pd.read_csv("data/raw/global-energy-substitution.csv")

In [17]:
df_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 13 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Entity                                         76 non-null     object 
 1   Code                                           76 non-null     object 
 2   Year                                           76 non-null     int64  
 3   Other renewables (TWh, substituted energy)     76 non-null     float64
 4   Biofuels (TWh, substituted energy)             76 non-null     float64
 5   Solar (TWh, substituted energy)                76 non-null     float64
 6   Wind (TWh, substituted energy)                 76 non-null     float64
 7   Hydropower (TWh, substituted energy)           76 non-null     float64
 8   Nuclear (TWh, substituted energy)              76 non-null     float64
 9   Gas (TWh, substituted energy)                  76 non-nu

In [18]:
df_countries.head()

Unnamed: 0,Entity,Code,Year,"Other renewables (TWh, substituted energy)","Biofuels (TWh, substituted energy)","Solar (TWh, substituted energy)","Wind (TWh, substituted energy)","Hydropower (TWh, substituted energy)","Nuclear (TWh, substituted energy)","Gas (TWh, substituted energy)","Oil (TWh, substituted energy)","Coal (TWh, substituted energy)","Traditional biomass (TWh, substituted energy)"
0,World,OWID_WRL,1800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97.0,5556
1,World,OWID_WRL,1810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.0,5833
2,World,OWID_WRL,1820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,153.0,6111
3,World,OWID_WRL,1830,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,264.0,6389
4,World,OWID_WRL,1840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,356.0,6944


In [19]:
df_countries.isna().sum()/df_countries.shape[0]*100

Entity                                           0.0
Code                                             0.0
Year                                             0.0
Other renewables (TWh, substituted energy)       0.0
Biofuels (TWh, substituted energy)               0.0
Solar (TWh, substituted energy)                  0.0
Wind (TWh, substituted energy)                   0.0
Hydropower (TWh, substituted energy)             0.0
Nuclear (TWh, substituted energy)                0.0
Gas (TWh, substituted energy)                    0.0
Oil (TWh, substituted energy)                    0.0
Coal (TWh, substituted energy)                   0.0
Traditional biomass (TWh, substituted energy)    0.0
dtype: float64

In [20]:
df_countries.duplicated().sum()

0

In [21]:
df_countries['Entity'].unique()

array(['World'], dtype=object)

In [22]:
df_world = df_countries[df_countries['Year'] >= 1990]

In [23]:
df_world['Year'].unique()

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022,
       2023], dtype=int64)

Vamos a transformar los datos a formato largo para tener las columnas en filas (y no una columna para cada tipo de energía.)

In [40]:
df_world_pivot = sp.pivot_df(df_world, ["Entity", "Code", "Year"], "Energy type", "Consumption (TWh)")

In [41]:
df_world_pivot.sample(10)

Unnamed: 0,Entity,Code,Year,Energy type,Consumption (TWh)
274,World,OWID_WRL,1992,"Coal (TWh, substituted energy)",25567.281
3,World,OWID_WRL,1993,"Other renewables (TWh, substituted energy)",418.26727
142,World,OWID_WRL,1996,"Hydropower (TWh, substituted energy)",7442.1943
80,World,OWID_WRL,2002,"Solar (TWh, substituted energy)",5.245938
155,World,OWID_WRL,2009,"Hydropower (TWh, substituted energy)",9066.074
77,World,OWID_WRL,1999,"Solar (TWh, substituted energy)",2.676927
63,World,OWID_WRL,2019,"Biofuels (TWh, substituted energy)",1119.6278
279,World,OWID_WRL,1997,"Coal (TWh, substituted energy)",26547.277
19,World,OWID_WRL,2009,"Other renewables (TWh, substituted energy)",1049.248
180,World,OWID_WRL,2000,"Nuclear (TWh, substituted energy)",7322.6826


In [26]:
df_world.to_csv('data/usable/global_energy_consumption.csv')

In [42]:
df_world_pivot.to_csv('data/usable/global_energy_consumption_pivot.csv', index=False)

## Dataset: Renewable Energy World Wide : 1965~2022 
[Source](https://www.kaggle.com/datasets/belayethossainds/renewable-energy-world-wide-19652022/data?select=01+renewable-share-energy.csv)

### `12-solar-energy-consumption.csv`:

Ideas:
- Hacer un ranking de países o continentes que más producen y más consumen (una otra o las dos)
- Electricity from solar (TWh): potencia consumida que se ha generado a partir de energía solar.
- `definición`: energía de origen solar consumida al año. Hay dos tecnologías para energía solar: (1) mayoritaria: fotovotaica que es con paneles solares y (2) termosolar: mucho menos utilizada.


In [28]:
df_solar_consump = pd.read_csv("data/raw/12-solar-energy-consumption.csv")

df_solar_consump.head()

Unnamed: 0,Entity,Code,Year,Electricity from solar (TWh)
0,Afghanistan,AFG,2000,0.0
1,Afghanistan,AFG,2001,0.0
2,Afghanistan,AFG,2002,0.0
3,Afghanistan,AFG,2003,0.0
4,Afghanistan,AFG,2004,0.0


In [29]:
df_solar_consump.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8683 entries, 0 to 8682
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Entity                        8683 non-null   object 
 1   Code                          7227 non-null   object 
 2   Year                          8683 non-null   int64  
 3   Electricity from solar (TWh)  8683 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 271.5+ KB


In [30]:
df_solar_consump.isna().sum()/df_solar_consump.shape[0]*100

Entity                           0.000000
Code                            16.768398
Year                             0.000000
Electricity from solar (TWh)     0.000000
dtype: float64

In [31]:
df_solar_consump['Entity'][df_solar_consump['Code'].isna()].unique()

array(['Africa', 'Africa (BP)', 'Africa (Ember)', 'Asia', 'Asia (Ember)',
       'Asia Pacific (BP)', 'CIS (BP)', 'Central America (BP)',
       'Eastern Africa (BP)', 'Europe', 'Europe (BP)', 'Europe (Ember)',
       'European Union (27)', 'European Union (27) (Ember)',
       'G20 (Ember)', 'G7 (Ember)', 'High-income countries',
       'Latin America and Caribbean (Ember)', 'Low-income countries',
       'Lower-middle-income countries', 'Middle Africa (BP)',
       'Middle East (BP)', 'Non-OECD (BP)', 'North America',
       'North America (BP)', 'North America (Ember)', 'OECD (BP)',
       'OECD (Ember)', 'Oceania', 'Oceania (Ember)', 'South America',
       'South and Central America (BP)', 'Upper-middle-income countries',
       'Western Africa (BP)'], dtype=object)

--> no nos cargamos datos, por si acaso, y ya luego filtraremos en Tableau

In [32]:
df_solar_consump[df_solar_consump["Code"].notna()].sample(10)

Unnamed: 0,Entity,Code,Year,Electricity from solar (TWh)
1890,Cyprus,CYP,1998,0.0
2344,Eritrea,ERI,2016,0.0
2085,Dominica,DMA,2010,0.0
7347,Sri Lanka,LKA,1969,0.0
275,Angola,AGO,2019,0.02
2073,Djibouti,DJI,2020,0.0
213,Algeria,DZA,2001,0.0
8082,United Arab Emirates,ARE,2000,0.0
1821,Croatia,HRV,2009,0.0
7320,Spain,ESP,2000,0.01


In [33]:
#vemos los que más consumen:
df_solar_consump[(df_solar_consump["Code"].notna()) & (df_solar_consump["Entity"] != "World") & (df_solar_consump["Year"] == 2021)].sort_values('Electricity from solar (TWh)', ascending=False).head(20)

Unnamed: 0,Entity,Code,Year,Electricity from solar (TWh)
1634,China,CHN,2021,327.0
8217,United States,USA,2021,164.42
4048,Japan,JPN,2021,88.7
3625,India,IND,2021,68.31
3026,Germany,DEU,2021,49.34
593,Australia,AUS,2021,28.04
7341,Spain,ESP,2021,27.1
8486,Vietnam,VNM,2021,25.77
3968,Italy,ITA,2021,25.04
7223,South Korea,KOR,2021,23.95


In [34]:
df_solar_consump['Electricity from solar (TWh)'].describe()

count    8683.000000
mean        5.277942
std        40.098862
min         0.000000
25%         0.000000
50%         0.000000
75%         0.010000
max      1040.500000
Name: Electricity from solar (TWh), dtype: float64

In [35]:
#creamos columna GWh para poder compararla luego con la capacity que está en esta unidad
df_solar_consump['Electricity from solar (GWh)'] = df_solar_consump['Electricity from solar (TWh)']/1000

In [43]:
df_solar_consump.head()

Unnamed: 0,Entity,Code,Year,Electricity from solar (TWh),Electricity from solar (GWh)
0,Afghanistan,AFG,2000,0.0,0.0
1,Afghanistan,AFG,2001,0.0,0.0
2,Afghanistan,AFG,2002,0.0,0.0
3,Afghanistan,AFG,2003,0.0,0.0
4,Afghanistan,AFG,2004,0.0,0.0


In [36]:
df_solar_consump.duplicated().sum()

0

In [37]:
df_solar_consump.isnull().sum()/df_solar_consump.shape[0]*100

Entity                           0.000000
Code                            16.768398
Year                             0.000000
Electricity from solar (TWh)     0.000000
Electricity from solar (GWh)     0.000000
dtype: float64

In [26]:
df_solar_consump.to_csv('data/usable/territorial_solar_consumption.csv')

In [46]:
df_solar_consump_pivot = sp.pivot_df(df_solar_consump, ['Entity', 'Code', 'Year'], 'Units', 'Electricity from solar')

df_solar_consump_pivot.sample(20)

Unnamed: 0,Entity,Code,Year,Units,Electricity from solar
4274,Latin America and Caribbean (Ember),,2005,Electricity from solar (TWh),0.02
10792,Dominican Republic,DOM,2012,Electricity from solar (GWh),0.0
8065,United Arab Emirates,ARE,1983,Electricity from solar (TWh),0.0
15044,Puerto Rico,PRI,2006,Electricity from solar (GWh),0.0
6620,Saint Lucia,LCA,2018,Electricity from solar (TWh),0.0
3948,Italy,ITA,2001,Electricity from solar (TWh),0.02
17149,Vietnam,VNM,2001,Electricity from solar (GWh),0.0
8131,United Kingdom,GBR,1992,Electricity from solar (TWh),0.0
7779,Togo,TGO,2021,Electricity from solar (TWh),0.02
6984,Slovenia,SVN,1992,Electricity from solar (TWh),0.0


In [47]:
df_solar_consump_pivot.to_csv('data/usable/territorial_solar_consumption_pivot.csv')

### `13-installed-solar-PV-capacity.csv`:

Ideas:
- Solar Capacity: no tenemos ni idea en qué unidades está esto. En principio decmos que son GWp (Giga Watt pico). Al estar por países/zonas, debería ser así, ya que los MW se quedarían cortos.


- `definición`: potencia solar fotovoltaica (no hay termosolar) instalada por territorio. No puede ser GWh , le sobra la hora porque estamos ante una medida de potencia no una medida de energía (la energía si lleva la hora).

In [48]:
df_solar_installed = pd.read_csv('data/raw/13-installed-solar-PV-capacity.csv')

In [49]:
df_solar_installed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1659 entries, 0 to 1658
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Entity          1659 non-null   object 
 1   Code            1243 non-null   object 
 2   Year            1659 non-null   int64  
 3   Solar Capacity  1659 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 52.0+ KB


In [50]:
#mirando los datos de españa de esta tabla y comparándolos con los datos de España del 2021, 
# llegamos a la conclusion de que esta medida son GW y que en en el 2021 en la fuente contrastada son 14,8 GW
df_solar_installed[df_solar_installed['Entity'] == 'Spain'].tail()

Unnamed: 0,Entity,Code,Year,Solar Capacity
1346,Spain,ESP,2017,4.723
1347,Spain,ESP,2018,4.763607
1348,Spain,ESP,2019,8.839253
1349,Spain,ESP,2020,10.285477
1350,Spain,ESP,2021,13.648477


In [51]:
#vemos los que más capacidad solar tienen instalada:
df_solar_installed[(df_solar_installed["Code"].notna()) & (df_solar_installed["Entity"] != "World") & (df_solar_installed["Year"] == 2021)].sort_values('Solar Capacity', ascending=False).head(20)

Unnamed: 0,Entity,Code,Year,Solar Capacity
389,China,CHN,2021,306.4028
1584,United States,USA,2021,93.713016
779,Japan,JPN,2021,74.191
571,Germany,DEU,2021,58.459
701,India,IND,2021,49.341508
753,Italy,ITA,2021,22.69204
181,Australia,AUS,2021,19.074
1298,South Korea,KOR,2021,18.160574
1632,Vietnam,VNM,2021,16.66049
545,France,FRA,2021,14.709225


In [52]:
df_solar_installed.head()

Unnamed: 0,Entity,Code,Year,Solar Capacity
0,Africa,,1996,0.001112
1,Africa,,1997,0.003137
2,Africa,,1998,0.005162
3,Africa,,1999,0.008199
4,Africa,,2000,0.010927


In [53]:
df_solar_installed.isna().sum()/df_solar_installed.shape[0]*100

Entity             0.000000
Code              25.075347
Year               0.000000
Solar Capacity     0.000000
dtype: float64

In [55]:
df_solar_installed.rename(columns = {'Solar Capacity': 'Solar Capacity (GW)'}, inplace = True)

In [56]:
df_solar_installed.to_csv('data/usable/territorial_solar_capacity.csv')

----> hacer en tableau: `Merge consumption// capacity solar`:

- potencia real consumida entre el máximo de capacidad solar de cada territorio.

- Para que tengan ambas unidades coincidentes: la potencia hay que dividira entre 1000 para sacar la proporción.

### `15-share-electricity-solar.csv`:

- Solar (% electricity): porcentaje sobre total de electricidad producida. Me comenta mi padre que podría ser de electricidad consumida en lugar de producida, pero que normalmente cuando se habla en genérico se entiende que es `producida o generada` y ya luego la consumida puede variar un poco en función de lo exportado/importado, pero suele ir bastante parejo con lo producido.

In [57]:
df_elec_solar = pd.read_csv('data/raw/15-share-electricity-solar.csv')

In [58]:
#vemos los que más % de electricidad solar producida son:
df_elec_solar[(df_elec_solar["Code"].notna()) & (df_elec_solar["Entity"] != "World") & (df_elec_solar["Year"] == 2021)].sort_values('Solar (% electricity)', ascending=False).head(20)

Unnamed: 0,Entity,Code,Year,Solar (% electricity)
1418,Cook Islands,COK,2021,40.0
4199,Namibia,NAM,2021,24.203821
4843,Palestine,PSE,2021,23.255814
6826,Yemen,YEM,2021,17.045456
1852,El Salvador,SLV,2021,16.920732
3215,Jordan,JOR,2021,15.996345
3665,Luxembourg,LUX,2021,14.634146
6646,Vanuatu,VUT,2021,14.285714
1278,Chile,CHL,2021,13.237639
6196,Tonga,TON,2021,12.5


In [49]:
#por qué no sale china???
df_elec_solar[df_elec_solar['Entity'] == 'China'].tail()

Unnamed: 0,Entity,Code,Year,Solar (% electricity)
1311,China,CHN,2017,1.796208
1312,China,CHN,2018,2.484341
1313,China,CHN,2019,3.002081
1314,China,CHN,2020,3.375448
1315,China,CHN,2021,3.854305


In [62]:
#Buscamos España igualmente, para hacer la comparación con datos reales de otras fuentes, y es correcto.
df_elec_solar[df_elec_solar['Entity'] == 'Spain'].tail()

Unnamed: 0,Entity,Code,Year,Solar (% electricity)
5833,Spain,ESP,2018,4.685891
5834,Spain,ESP,2019,5.574218
5835,Spain,ESP,2020,7.958264
5836,Spain,ESP,2021,10.007016
5837,Spain,ESP,2022,11.505917


In [63]:
df_elec_solar.head()

Unnamed: 0,Entity,Code,Year,Solar (% electricity)
0,Afghanistan,AFG,2000,0.0
1,Afghanistan,AFG,2001,0.0
2,Afghanistan,AFG,2002,0.0
3,Afghanistan,AFG,2003,0.0
4,Afghanistan,AFG,2004,0.0


In [64]:
df_elec_solar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Entity                 6871 non-null   object 
 1   Code                   5781 non-null   object 
 2   Year                   6871 non-null   int64  
 3   Solar (% electricity)  6871 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 214.8+ KB


In [65]:
df_elec_solar.isna().sum()/df_elec_solar.shape[0]*100

Entity                    0.000000
Code                     15.863775
Year                      0.000000
Solar (% electricity)     0.000000
dtype: float64

In [66]:
df_elec_solar.duplicated().sum()

0

In [67]:
df_elec_solar.to_csv('data/usable/territorial_solar_electricity.csv')

### `08-wind-generation.csv`:
 

- `definición`: energía eólica generada en un año y en un territorio específico

In [68]:
df_generation_wind = pd.read_csv('data/raw/08-wind-generation.csv')

In [69]:
#vemos los que más % de electricidad solar producida son:
df_generation_wind[(df_generation_wind["Code"].notna()) & (df_generation_wind["Entity"] != "World") & (df_generation_wind["Year"] == 2021)].sort_values('Electricity from wind (TWh)', ascending=False).head(10)

Unnamed: 0,Entity,Code,Year,Electricity from wind (TWh)
1646,China,CHN,2021,655.6
8210,United States,USA,2021,378.2
3038,Germany,DEU,2021,114.65
1113,Brazil,BRA,2021,71.5
3637,India,IND,2021,68.09
8153,United Kingdom,GBR,2021,65.02
7334,Spain,ESP,2021,62.06
2836,France,FRA,2021,36.83
1393,Canada,CAN,2021,35.21
7930,Turkey,TUR,2021,31.21


In [70]:
#China??:
df_generation_wind[df_generation_wind['Entity'] == "China"].tail()

Unnamed: 0,Entity,Code,Year,Electricity from wind (TWh)
1642,China,CHN,2017,304.6
1643,China,CHN,2018,365.8
1644,China,CHN,2019,405.3
1645,China,CHN,2020,466.5
1646,China,CHN,2021,655.6


In [71]:
df_generation_wind.head()

Unnamed: 0,Entity,Code,Year,Electricity from wind (TWh)
0,Afghanistan,AFG,2000,0.0
1,Afghanistan,AFG,2001,0.0
2,Afghanistan,AFG,2002,0.0
3,Afghanistan,AFG,2003,0.0
4,Afghanistan,AFG,2004,0.0


In [72]:
df_generation_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8676 entries, 0 to 8675
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Entity                       8676 non-null   object 
 1   Code                         7217 non-null   object 
 2   Year                         8676 non-null   int64  
 3   Electricity from wind (TWh)  8676 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 271.3+ KB


In [73]:
df_generation_wind.isna().sum()/df_generation_wind.shape[0]*100

Entity                          0.000000
Code                           16.816505
Year                            0.000000
Electricity from wind (TWh)     0.000000
dtype: float64

In [74]:
df_generation_wind['Year'].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021,
       1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
       1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
       1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
       1998, 1999, 2022], dtype=int64)

In [47]:
df_generation_wind.duplicated().sum()

0

In [75]:
df_generation_wind[df_generation_wind['Entity'] == "Spain"].tail()

Unnamed: 0,Entity,Code,Year,Electricity from wind (TWh)
7331,Spain,ESP,2018,50.9
7332,Spain,ESP,2019,55.65
7333,Spain,ESP,2020,56.44
7334,Spain,ESP,2021,62.06
7335,Spain,ESP,2022,61.85


In [76]:
df_generation_wind.to_csv('data/usable/territorial_wind_generation.csv')

Mismo número de nulos en Code que en los de energía solar. Los datos de España y las unidades cuadran con otras fuentes.

### `11-share-electricity-wind.csv`:
- Wind (% electricity): porcentaje total de viento sobre total de electricidad producida


In [77]:
df_elec_wind = pd.read_csv('data/raw/11-share-electricity-wind.csv')

In [78]:
df_elec_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Entity                6871 non-null   object 
 1   Code                  5781 non-null   object 
 2   Year                  6871 non-null   int64  
 3   Wind (% electricity)  6871 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 214.8+ KB


In [79]:
df_elec_wind.head()

Unnamed: 0,Entity,Code,Year,Wind (% electricity)
0,Afghanistan,AFG,2000,0.0
1,Afghanistan,AFG,2001,0.0
2,Afghanistan,AFG,2002,0.0
3,Afghanistan,AFG,2003,0.0
4,Afghanistan,AFG,2004,0.0


In [80]:
df_elec_wind.duplicated().sum()

0

In [81]:
df_elec_wind.isna().sum()/df_elec_wind.shape[0]*100

Entity                   0.000000
Code                    15.863775
Year                     0.000000
Wind (% electricity)     0.000000
dtype: float64

In [82]:
df_elec_wind.to_csv('data/usable/territorial_wind_electricity.csv')

In [83]:
df_elec_wind[(df_elec_wind['Entity'] == "Spain") & (df_elec_wind['Year'] == 2022)]

Unnamed: 0,Entity,Code,Year,Wind (% electricity)
5837,Spain,ESP,2022,21.716232


Los datos cuadran con otras fuentes.

Podría ser interesante ver la variación de generación en el tiempo (para esta y otras) sólo para los países que consideremos "clave" en la producción de energías renovables.

### `09-cumulative-installed-wind-energy-capacity-gigawatts`:

- `definición`: potencia eólica instalada por territorio en GWp.

- Máximo que se puede generar a cierta fecha: es decir, total de potencia instalada a lo largo del tiempo.
- preguntas de viento : cuánto produce, capacidad de producir.. nos centramos más en la capacidad y producción que en el consumo, porque no tenemos ese dato.

In [84]:
df_installed_wind = pd.read_csv('data/raw/09-cumulative-installed-wind-energy-capacity-gigawatts.csv')

In [85]:
df_installed_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1540 entries, 0 to 1539
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Entity         1540 non-null   object 
 1   Code           1143 non-null   object 
 2   Year           1540 non-null   int64  
 3   Wind Capacity  1540 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 48.3+ KB


In [86]:
df_installed_wind.isna().sum()/df_installed_wind.shape[0]*100

Entity            0.000000
Code             25.779221
Year              0.000000
Wind Capacity     0.000000
dtype: float64

In [87]:
df_installed_wind.duplicated().sum()

0

In [94]:
df_installed_wind.rename(columns={'Wind Capacity' : 'Wind Capacity (GWp)'}, inplace=True)

In [95]:
#Buscamos España igualmente, para hacer la comparación con datos reales:.
df_installed_wind[(df_installed_wind['Entity'] == 'Spain')]

Unnamed: 0,Entity,Code,Year,Wind Capacity (GWp)
1241,Spain,ESP,1997,0.512
1242,Spain,ESP,1998,0.723
1243,Spain,ESP,1999,1.408
1244,Spain,ESP,2000,2.206
1245,Spain,ESP,2001,3.397
1246,Spain,ESP,2002,4.891
1247,Spain,ESP,2003,5.945
1248,Spain,ESP,2004,8.317
1249,Spain,ESP,2005,9.918
1250,Spain,ESP,2006,11.722


In [96]:
df_installed_wind.head()

Unnamed: 0,Entity,Code,Year,Wind Capacity (GWp)
0,Africa,,1997,0.006
1,Africa,,1998,0.01
2,Africa,,1999,0.064
3,Africa,,2000,0.13905
4,Africa,,2001,0.13905


In [97]:
df_installed_wind.to_csv('data/usable/territorial_wind_cumulative_capacity.csv')

Parece que los datos y las unidades para España cuadran con otras fuentes. Los nulos de code parece que tienen sentido, no existe el código para áreas que no sean países.

## GDP (Gross Domestic Product) (1990-2023)
[Source](https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.PCAP.CD&country=#)

`GDP (Gross Domestic Product)`: GDP per capita is gross domestic product divided by midyear population. GDP is the sum of gross value added by all resident producers in the economy plus any product taxes and minus any subsidies not included in the value of the products. It is calculated without making deductions for depreciation of fabricated assets or for depletion and degradation of natural resources. Data are in current U.S. dollars.

Params:

- `time`: 1990 to 2023

- `country`: all

- `series`: NY.GDP.PCAP.CD = GDP per capita (current USD)

In [63]:
df_gdp = pd.read_csv("data/raw/gross-domestic-product.csv")

In [64]:
df_gdp.head(1)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],1991 [YR1991],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,GDP per capita (current US$),NY.GDP.PCAP.CD,Afghanistan,AFG,..,..,..,174.930991430166,138.706821676113,178.954088379235,198.871115728727,221.763653873013,254.184249149336,274.218553561575,376.223152003876,625.054941813938,565.569730408751,522.082215583898,525.469770891619,491.337221382603,496.6025042585,510.787063366811,356.496214115892,357.261152798144,415.707417059086


In [65]:
#Eliminamos columnas irrelevantes:
df_gdp.drop(columns = ['Series Name', 'Series Code'], inplace = True)

In [66]:
df_gdp.head(1)

Unnamed: 0,Country Name,Country Code,1990 [YR1990],1991 [YR1991],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,Afghanistan,AFG,..,..,..,174.930991430166,138.706821676113,178.954088379235,198.871115728727,221.763653873013,254.184249149336,274.218553561575,376.223152003876,625.054941813938,565.569730408751,522.082215583898,525.469770891619,491.337221382603,496.6025042585,510.787063366811,356.496214115892,357.261152798144,415.707417059086


In [67]:
#renombramos las columnas:
df_gdp.columns  = ['Country Name', 'Country Code', '1990', '1991',
       '1999', '2000', '2001', '2002',
       '2003', '2004', '2005', '2006',
       '2007', '2014', '2015', '2016',
       '2017', '2018', '2019', '2020',
       '2021', '2022', '2023']

In [68]:
df_gdp.head(1)

Unnamed: 0,Country Name,Country Code,1990,1991,1999,2000,2001,2002,2003,2004,2005,2006,2007,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Afghanistan,AFG,..,..,..,174.930991430166,138.706821676113,178.954088379235,198.871115728727,221.763653873013,254.184249149336,274.218553561575,376.223152003876,625.054941813938,565.569730408751,522.082215583898,525.469770891619,491.337221382603,496.6025042585,510.787063366811,356.496214115892,357.261152798144,415.707417059086


In [69]:
df_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Country Name  266 non-null    object
 1   Country Code  266 non-null    object
 2   1990          266 non-null    object
 3   1991          266 non-null    object
 4   1999          266 non-null    object
 5   2000          266 non-null    object
 6   2001          266 non-null    object
 7   2002          266 non-null    object
 8   2003          266 non-null    object
 9   2004          266 non-null    object
 10  2005          266 non-null    object
 11  2006          266 non-null    object
 12  2007          266 non-null    object
 13  2014          266 non-null    object
 14  2015          266 non-null    object
 15  2016          266 non-null    object
 16  2017          266 non-null    object
 17  2018          266 non-null    object
 18  2019          266 non-null    object
 19  2020    

In [70]:
lista_col =['1990', '1991', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2014', '2015', '2016',
       '2017', '2018', '2019', '2020', '2021', '2022', '2023']

for col in lista_col:
    df_gdp[col]= pd.to_numeric(df_gdp[col].astype(str).str.strip(), errors='coerce')

In [71]:
#hay duplicados, los eliminamos:
df_gdp.duplicated().sum()

4

In [72]:
df_gdp = df_gdp.drop_duplicates()

In [73]:
#Comprobamos si ahora hay duplicados:
df_gdp.duplicated().sum()

0

In [74]:
#Gestion de nulos:
df_gdp.isnull().sum()/df_gdp.shape[0]*100

Country Name     0.374532
Country Code     0.374532
1990            10.112360
1991             9.737828
1999             6.367041
2000             5.617978
2001             5.243446
2002             3.745318
2003             3.745318
2004             3.745318
2005             3.745318
2006             3.370787
2007             3.370787
2014             2.247191
2015             2.996255
2016             3.370787
2017             3.370787
2018             3.370787
2019             2.996255
2020             3.370787
2021             3.370787
2022             4.494382
2023             8.988764
dtype: float64

In [None]:
df_gdp_trans = sp.pivot_df(df_gdp, ['Country Name', 'Country Code'], 'Year', 'GDP($)')

In [30]:
df_gdp.to_csv('data/usable/gross-domestic-product.csv', index=False)

In [78]:
df_gdp_trans.to_csv('data/usable/gross-domestic-product-trans.csv', index=False)