# Countries Related Data:

----------------------------------------------------------------

## Opening the Data:

In [1]:
# import statements

import pandas as pd

In [2]:
# reading the csv files
# Forward slashes are used so the relative paths resolve on Windows, macOS and Linux;
# the previous backslash-separated paths only worked on Windows.

COUNTRY_DATA_DIR = "../0.Data/country related data"

country_region = pd.read_csv(f"{COUNTRY_DATA_DIR}/country_region.csv")
power_plants = pd.read_csv(f"{COUNTRY_DATA_DIR}/powerplants (global) - global_power_plants.csv")
income_classification = pd.read_csv(f"{COUNTRY_DATA_DIR}/world-bank-income-groups.csv")

## Country Region Data:

### General look:

In [3]:
# getting a random sample of five rows to look at
# (no random_state is passed, so the rows shown differ between runs):

country_region.sample(5)

Unnamed: 0,country,region
19,Czech Republic,Central Europe
14,Luxembourg,Western Europe
0,Denmark,Northern Europe
12,Portugal,Southern Europe
23,Bulgaria,Eastren Europe


In [4]:
# getting the number of rows and columns as a (rows, columns) tuple

country_region.shape

(27, 2)

In [5]:
# looking at some column information (column names, non-null counts and dtypes)

country_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  27 non-null     object
 1   region   27 non-null     object
dtypes: object(2)
memory usage: 564.0+ bytes


### Data Cleaning:

In [6]:
# checking if we have duplicates (counts rows that repeat an earlier row in every column):

country_region.duplicated().sum()

0

In [7]:
# checking if we have duplicate countries (counts repeated values in the 'country' column only):

country_region['country'].duplicated().sum()

0

In [8]:
# checking if we have any null values (number of nulls per column):

country_region.isna().sum()

country    0
region     0
dtype: int64

## Power Plants Data:

### General Look:

In [9]:
# random sample of five rows for a first look (no random_state, so the rows vary between runs)
power_plants.sample(5)

Unnamed: 0,country code,country_long,name of powerplant,capacity in MW,latitude,longitude,primary_fuel,secondary fuel,other_fuel2,other_fuel3,start date,owner of plant,generation_gwh_2021,geolocation_source,estimated_generation_gwh_2021
5381,CHN,China,Chayouhouqi Hongmu Phase 1,20.0,41.15,113.15,Solar,,,,,Inner Mongolia Datang International New Energy...,,,32.3
26833,USA,United States of America,Crystal,28.0,38.5106,-107.6253,Hydro,,,,1978.0,U S Bureau of Reclamation,133.569,U.S. Energy Information Administration,104.47
33614,USA,United States of America,USS JJ Solar CSG,1.0,45.6361,-94.0504,Solar,,,,2019.0,USS JJ Solar LLC,1.826,U.S. Energy Information Administration,
7447,CHN,China,Longtan,6300.0,25.0277,107.0431,Hydro,,,,2009.0,Longtan Hydropower Development Co. Ltd.,,,18392.19
24547,GBR,United Kingdom,Wathegar 2 Wind Farm,18.5,58.4352,-3.2159,Wind,,,,,BayWa r.e.,,UK Renewable Energy Planning Database,


In [10]:
# number of rows and columns as a (rows, columns) tuple
power_plants.shape

(34936, 15)

In [11]:
# column names, non-null counts and dtypes
power_plants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34936 entries, 0 to 34935
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   country code                   34936 non-null  object 
 1   country_long                   34936 non-null  object 
 2   name of powerplant             34936 non-null  object 
 3   capacity in MW                 34936 non-null  float64
 4   latitude                       34936 non-null  float64
 5   longitude                      34936 non-null  float64
 6   primary_fuel                   34936 non-null  object 
 7   secondary fuel                 1944 non-null   object 
 8   other_fuel2                    276 non-null    object 
 9   other_fuel3                    92 non-null     object 
 10  start date                     17447 non-null  float64
 11  owner of plant                 20868 non-null  object 
 12  generation_gwh_2021            9659 non-null  

### Data Cleaning:

In [12]:
# number of null values per column
power_plants.isna().sum()

country code                         0
country_long                         0
name of powerplant                   0
capacity in MW                       0
latitude                             0
longitude                            0
primary_fuel                         0
secondary fuel                   32992
other_fuel2                      34660
other_fuel3                      34844
start date                       17489
owner of plant                   14068
generation_gwh_2021              25277
geolocation_source                 419
estimated_generation_gwh_2021    18816
dtype: int64

We have a lot of null values! We will be dropping these columns, as they do not currently benefit us.

In [13]:
# number of rows that repeat an earlier row in every column
power_plants.duplicated().sum()

0

### Some Transformations:

I will merge this dataset with the country-region dataset so that we only look at the data for the countries we need.

In [14]:
# inner join: only plants whose 'country_long' matches a 'country' in country_region survive

merged_power_plants = power_plants.merge(
    country_region,
    how='inner',
    left_on='country_long',
    right_on='country',
)

In [15]:
# column names, non-null counts and dtypes after the merge
merged_power_plants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   country code                   6825 non-null   object 
 1   country_long                   6825 non-null   object 
 2   name of powerplant             6825 non-null   object 
 3   capacity in MW                 6825 non-null   float64
 4   latitude                       6825 non-null   float64
 5   longitude                      6825 non-null   float64
 6   primary_fuel                   6825 non-null   object 
 7   secondary fuel                 88 non-null     object 
 8   other_fuel2                    1 non-null      object 
 9   other_fuel3                    0 non-null      object 
 10  start date                     2155 non-null   float64
 11  owner of plant                 2615 non-null   object 
 12  generation_gwh_2021            0 non-null      f

In [16]:
# number of rows that repeat an earlier row in every column, after the merge
merged_power_plants.duplicated().sum()

0

In [17]:
# number of null values per column after the merge
merged_power_plants.isna().sum()

country code                        0
country_long                        0
name of powerplant                  0
capacity in MW                      0
latitude                            0
longitude                           0
primary_fuel                        0
secondary fuel                   6737
other_fuel2                      6824
other_fuel3                      6825
start date                       4670
owner of plant                   4210
generation_gwh_2021              6825
geolocation_source                  1
estimated_generation_gwh_2021    2362
country                             0
region                              0
dtype: int64

In [18]:
# dropping the columns we won't need: the mostly-null ones, plus 'country_long',
# which carries the same values as 'country' after the inner join

cleaned_power_plants = merged_power_plants.drop(
    columns=[
        'secondary fuel',
        'other_fuel2',
        'other_fuel3',
        'start date',
        'owner of plant',
        'generation_gwh_2021',
        'estimated_generation_gwh_2021',
        'country_long',
    ]
)

In [19]:
# dropping the rows that still contain a null value in any remaining column

cleaned_power_plants = cleaned_power_plants.dropna()

In [20]:
# confirming that no null values remain in any column
cleaned_power_plants.isna().sum()

country code          0
name of powerplant    0
capacity in MW        0
latitude              0
longitude             0
primary_fuel          0
geolocation_source    0
country               0
region                0
dtype: int64

In [21]:
# number of remaining rows (power plants) per country, most frequent first
cleaned_power_plants['country'].value_counts()

country
France            2155
Germany           1309
Spain              829
Portugal           469
Czech Republic     462
Italy              396
Poland             188
Finland            185
Sweden             168
Austria            103
Greece              90
Netherlands         71
Belgium             69
Romania             68
Ireland             59
Denmark             47
Bulgaria            43
Slovakia            30
Croatia             24
Hungary             18
Estonia             17
Slovenia             8
Lithuania            6
Latvia               5
Cyprus               3
Luxembourg           2
Name: count, dtype: int64

### Checking if the dataset will be beneficial to us:

In [22]:
# number of plants per primary fuel type in France
cleaned_power_plants.loc[cleaned_power_plants['country'] == 'France', 'primary_fuel'].value_counts()

primary_fuel
Solar             817
Wind              721
Hydro             429
Biomass           148
Nuclear            19
Gas                 9
Oil                 5
Coal                5
Geothermal          1
Wave and Tidal      1
Name: count, dtype: int64

In [23]:
# number of plants per primary fuel type in Denmark
cleaned_power_plants.loc[cleaned_power_plants['country'] == 'Denmark', 'primary_fuel'].value_counts()

primary_fuel
Wind          19
Solar         12
Coal          10
Gas            3
Geothermal     1
Biomass        1
Oil            1
Name: count, dtype: int64

In [24]:
# number of plants per primary fuel type in Latvia
cleaned_power_plants.loc[cleaned_power_plants['country'] == 'Latvia', 'primary_fuel'].value_counts()

primary_fuel
Hydro    3
Gas      2
Name: count, dtype: int64

In [25]:
# number of plants per primary fuel type in Belgium
cleaned_power_plants.loc[cleaned_power_plants['country'] == 'Belgium', 'primary_fuel'].value_counts()

primary_fuel
Gas        20
Wind       12
Hydro      11
Oil         9
Waste       8
Nuclear     3
Solar       3
Biomass     2
Coal        1
Name: count, dtype: int64

In [26]:
# number of plants per primary fuel type in Bulgaria
cleaned_power_plants.loc[cleaned_power_plants['country'] == 'Bulgaria', 'primary_fuel'].value_counts()

primary_fuel
Solar      23
Coal       11
Hydro       7
Nuclear     1
Wind        1
Name: count, dtype: int64

As you can see, most of the countries use solar or wind power, which are renewable energy sources that don't cause any pollution, so this dataset is not very beneficial to us.

## Income Data:

### General Look:

In [27]:
# random sample of five rows for a first look (no random_state, so the rows vary between runs)
income_classification.sample(5)

Unnamed: 0,Entity,Code,Year,World Bank's income classification
3365,Ireland,IRL,2023,High-income countries
1057,Burkina Faso,BFA,1990,Low-income countries
5440,Papua New Guinea,PNG,2014,Lower-middle-income countries
1688,Croatia,HRV,2004,Upper-middle-income countries
2759,Greenland,GRL,2009,High-income countries


In [28]:
# number of rows and columns as a (rows, columns) tuple
income_classification.shape

(7661, 4)

In [29]:
# column names, non-null counts and dtypes
income_classification.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7661 entries, 0 to 7660
Data columns (total 4 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Entity                              7661 non-null   object
 1   Code                                7661 non-null   object
 2   Year                                7661 non-null   int64 
 3   World Bank's income classification  7661 non-null   object
dtypes: int64(1), object(3)
memory usage: 239.5+ KB


### Data Cleaning:

In [30]:
# number of rows that repeat an earlier row in every column
income_classification.duplicated().sum()

0

In [31]:
# number of null values per column
income_classification.isna().sum()

Entity                                0
Code                                  0
Year                                  0
World Bank's income classification    0
dtype: int64

### Some Transformations:

In [32]:
# inner join: keep only the income rows whose 'Entity' matches a 'country' in country_region
income_merged = country_region.merge(
    income_classification,
    how='inner',
    left_on='country',
    right_on='Entity',
)

In [33]:
# 'Entity' carries the same values as 'country' after the inner join, so drop it.
# errors='ignore' keeps this cell re-runnable: income_merged is reassigned from itself,
# so on a second execution the column is already gone and a plain drop would raise KeyError.
income_merged = income_merged.drop(columns='Entity', errors='ignore')

In [34]:
# column names, non-null counts and dtypes after the merge
income_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 933 entries, 0 to 932
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   country                             933 non-null    object
 1   region                              933 non-null    object
 2   Code                                933 non-null    object
 3   Year                                933 non-null    int64 
 4   World Bank's income classification  933 non-null    object
dtypes: int64(1), object(4)
memory usage: 36.6+ KB


In [35]:
# number of rows that repeat an earlier row in every column, after the merge
income_merged.duplicated().sum()

0

In [36]:
# number of null values per column after the merge
income_merged.isna().sum()

country                               0
region                                0
Code                                  0
Year                                  0
World Bank's income classification    0
dtype: int64

In [37]:
# earliest and latest year present in the merged income data, one per line:

print(income_merged['Year'].min(), income_merged['Year'].max(), sep='\n')

1987
2023


In [38]:
# keeping only the years we want to use (2014 through 2023, both ends inclusive)

income_merged = income_merged[income_merged['Year'].between(2014, 2023)]

In [39]:
# earliest and latest year remaining after the filter, one per line
print(income_merged['Year'].min(), income_merged['Year'].max(), sep='\n')

2014
2023


Now, since I want to load the data incrementally into the warehouse, I will divide the data based on the year:

In [41]:
# Write one file per year (2014-2023) so the warehouse can be loaded incrementally.
# A loop replaces ten copy-pasted statements, and forward slashes keep the relative
# output path valid on Windows, macOS and Linux (the backslash form only worked on Windows).
# NOTE(review): file names are kept without a ".csv" extension and the DataFrame index is
# still written as the first column, exactly as before — confirm the loader expects both.
INCOME_OUTPUT_DIR = "../0.1.Data Used/income data"

for year in range(2014, 2024):
    income_merged[income_merged['Year'] == year].to_csv(f"{INCOME_OUTPUT_DIR}/income_{year}")