# Import Libraries

In [7]:
import pandas as pd
import sqlite3
import requests
from bs4 import BeautifulSoup

# Extract Data

In [59]:
# From csv
# The six CSV sources are parsed in the order listed, all with the same
# double-quote quoting rule, and unpacked into one DataFrame each.
(
    electricity_access_precent,
    rural_population_percent,
    gdp_data,
    mystery,
    population_data_from_csv,
    projects_data,
) = (
    pd.read_csv(csv_path, quotechar='"')
    for csv_path in (
        'DataSource/electricity_access_precent.csv',
        "DataSource/rural_population_percent.csv",
        'DataSource/gdp_data.csv',
        'DataSource/mystery.csv',
        'DataSource/population_data.csv',
        "DataSource/projects_data.csv",
    )
)


# From json
# The JSON copy of the population data is read with pandas' default settings.
population_data_from_json = pd.read_json(
    'DataSource/population_data.json'
)


# From db
# Load the whole population_data table from the SQLite file. The connection is
# closed in a `finally` clause so it is released even when the query raises
# (for example when the table is missing).
conn = sqlite3.connect('DataSource/population_data.db')
try:
    population_data_from_db = pd.read_sql_query("SELECT * FROM population_data;", conn)
finally:
    conn.close()


# From xml
# Read the file as raw bytes so the XML parser can determine the encoding
# itself (e.g. from the XML declaration) instead of decoding with whatever
# default text encoding the current platform happens to use.
with open("DataSource/population_data.xml", "rb") as f:
    xml_data = f.read()

soup = BeautifulSoup(xml_data, "xml")

# Every <record> element becomes one row: its <field> children are keyed by
# their "name" attribute and store the element text as the value. If a name
# repeats inside a record, the last occurrence wins.
data = [
    {field.get("name"): field.text for field in record.find_all("field")}
    for record in soup.find_all("record")
]

population_data_from_xml = pd.DataFrame(data)


# from API
# Download every page of World Bank indicator SP.POP.TOTL and flatten the
# combined records into a single DataFrame. The first element of each response
# supplies the page count; the second supplies that page's records.
url = "https://api.worldbank.org/v2/countries/all/indicators/SP.POP.TOTL/?format=json&per_page=1000"
data = []
page = 1
total_pages = None

while total_pages is None or page <= total_pages:
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params={"page": page}, timeout=30)
    # Report HTTP failures directly instead of as a confusing JSON/parse error.
    response.raise_for_status()
    response_data = response.json()
    # The code below indexes the payload as [metadata, records]; anything else
    # is reported explicitly rather than through a bare IndexError/KeyError.
    if not isinstance(response_data, list) or len(response_data) < 2:
        raise ValueError(f"Unexpected World Bank API response: {response_data!r}")
    if total_pages is None:
        total_pages = response_data[0]['pages']
    # `or []` tolerates a page whose record list comes back as null.
    data.extend(response_data[1] or [])
    page += 1

df_worldbank = pd.json_normalize(data)

# Same projects CSV that the CSV section of this cell already loads. This line
# was indented at top level, which Python rejects as an unexpected indent.
projects_data = pd.read_csv("DataSource/projects_data.csv", quotechar='"')


# Transform

#### - Cleaning data electricity_access_percent & rural_population_percent

In [60]:
def _to_long_format(frame, value_name):
    """Return *frame* reshaped to one row per country and year column.

    The "Unnamed: 62", "Indicator Name" and "Indicator Code" columns are
    dropped, remaining missing values are replaced with 0.0, the country
    columns are renamed to snake_case, and every other column is melted into
    a ``year`` column paired with a column named *value_name*.

    A new DataFrame is returned; *frame* itself is not modified.
    """
    return (
        frame.drop(columns=["Unnamed: 62", "Indicator Name", "Indicator Code"])
        .fillna(0.000)
        .rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'})
        .melt(id_vars=["country_name", "country_code"], var_name='year', value_name=value_name)
    )


### cleaning rural_population_percent
# Rows in which every column after 'Indicator Code' is null hold no values at
# all; remember their country names so the same countries can be removed from
# the electricity data below.
index_indicator_code = rural_population_percent.columns.get_loc('Indicator Code')
column_after_index = rural_population_percent.columns[index_indicator_code + 1:]
empty_row = rural_population_percent[rural_population_percent[column_after_index].isnull().all(axis=1)]
delete_country_names = empty_row.loc[:, 'Country Name'].tolist()
print(delete_country_names)

rural_population_percent = _to_long_format(
    rural_population_percent.drop(empty_row.index),
    'rural_population_percent',
)

### cleaning electricity_access_percent
# Remove the countries found above with a single boolean mask instead of
# re-filtering the whole frame once per country name.
electricity_access_precent = _to_long_format(
    electricity_access_precent[~electricity_access_precent['Country Name'].isin(delete_country_names)],
    'electricity_access_percent',
)

['Not classified', 'St. Martin (French part)', 'Kosovo']


#### - Merge electricity_access_percent & rural_population_percent

In [62]:
# Left join: every rural-population row is kept, and the electricity value is
# attached where country name, country code and year all match.
combine_electricity_rural = rural_population_percent.merge(
    electricity_access_precent,
    on=["country_name", "country_code", "year"],
    how='left',
)
combine_electricity_rural

Unnamed: 0,country_name,country_code,year,rural_population_percent,electricity_access_percent
0,Aruba,ABW,1960,49.224,0.0
1,Afghanistan,AFG,1960,91.779,0.0
2,Angola,AGO,1960,89.565,0.0
3,Albania,ALB,1960,69.295,0.0
4,Andorra,AND,1960,41.550,0.0
...,...,...,...,...,...
15133,Samoa,WSM,2017,81.170,0.0
15134,"Yemen, Rep.",YEM,2017,64.231,0.0
15135,South Africa,ZAF,2017,34.218,0.0
15136,Zambia,ZMB,2017,58.160,0.0


#### - Cleaning all population data (csv, json, db, xml)