In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime 

In [3]:
# Location of the exchange-rate CSV that the next cell downloads.
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-PY0221EN-Coursera/labs/v2/exchange_rate.csv'

In [4]:
# Download the exchange-rate CSV into a dataframe and display it.
ex_rate = pd.read_csv(filepath_or_buffer=url)
ex_rate

Unnamed: 0,Currency,Rate
0,EUR,0.93
1,GBP,0.8
2,INR,82.95


In [5]:
# Keep a local copy of the rates for the transformation step.
# index=False stops pandas from writing the row index as an extra unnamed
# first column, so the saved file contains only the Currency and Rate columns.
ex_rate.to_csv('exchange_rate.csv', index=False)

# Task 1: Logging function

In [6]:
# --- Pipeline configuration ---
# Page that the extraction step reads.
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'
# Column names of the extracted dataframe.
table_attribs = ["Name", "MC_USD_Billion"]

# Output targets: CSV file, SQLite database file and the table inside it.
csv_path = './Largest_banks_data.csv'
db_name = 'Banks.db'
table_name = 'Largest_banks'

In [7]:
def log_progress(message):
    ''' Append a timestamped message to the run log.

    Each call adds one line of the form ``<timestamp> : <message>`` to
    ``./code_log.txt``; the file is opened in append mode, so it is created
    on first use and earlier entries are kept. Function returns nothing.

    Parameters
    ----------
    message : str
        Text describing the stage of code execution that was reached.
    '''
    # %b (abbreviated month name) replaces %h: %h is a platform extension
    # that is not among Python's documented strftime directives, while %b is
    # and yields the same text wherever %h works.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)

    # Explicit encoding so the log is written the same way on every platform.
    with open("./code_log.txt", "a", encoding="utf-8") as log_file:
        log_file.write(f"{timestamp} : {message}\n")

In [8]:
# Record that setup is done before the ETL steps start.
log_progress(message='Preliminaries complete. Initiating ETL process')

# Task 2 : Extraction of data

In [37]:
# Parse every HTML table on the page and keep the first one.
df = pd.read_html(io=url)[0]
df

Unnamed: 0,Rank,Bank name,Market cap (US$ billion)
0,1,JPMorgan Chase,432.92
1,2,Bank of America,231.52
2,3,Industrial and Commercial Bank of China,194.56
3,4,Agricultural Bank of China,160.68
4,5,HDFC Bank,157.91
5,6,Wells Fargo,155.87
6,7,HSBC Holdings PLC,148.9
7,8,Morgan Stanley,140.83
8,9,China Construction Bank,139.82
9,10,Bank of China,136.81


In [38]:
def clean_data(df):
    ''' Return a copy of `df` without the 'Rank' column and with
    'Bank name' / 'Market cap (US$ billion)' renamed to
    'Name' / 'MC_USD_Billion'. The input dataframe is not modified. '''
    new_names = {
        'Bank name': 'Name',
        'Market cap (US$ billion)': 'MC_USD_Billion',
    }
    return df.drop(columns=['Rank']).rename(columns=new_names)

# Clean a copy so the raw table in `df` stays available.
df_clean = clean_data(df=df.copy())
df_clean

Unnamed: 0,Name,MC_USD_Billion
0,JPMorgan Chase,432.92
1,Bank of America,231.52
2,Industrial and Commercial Bank of China,194.56
3,Agricultural Bank of China,160.68
4,HDFC Bank,157.91
5,Wells Fargo,155.87
6,HSBC Holdings PLC,148.9
7,Morgan Stanley,140.83
8,China Construction Bank,139.82
9,Bank of China,136.81


In [39]:
def extract(url):
    ''' Read the first HTML table found at `url` and return it as a
    dataframe without the 'Rank' column, with 'Bank name' renamed to
    'Name' and 'Market cap (US$ billion)' renamed to 'MC_USD_Billion'. '''
    first_table = pd.read_html(url)[0]
    new_names = {
        'Bank name': 'Name',
        'Market cap (US$ billion)': 'MC_USD_Billion',
    }
    return first_table.drop(columns=['Rank']).rename(columns=new_names)

In [40]:
# Run the extraction and display the resulting table.
df = extract(url=url)
df

Unnamed: 0,Name,MC_USD_Billion
0,JPMorgan Chase,432.92
1,Bank of America,231.52
2,Industrial and Commercial Bank of China,194.56
3,Agricultural Bank of China,160.68
4,HDFC Bank,157.91
5,Wells Fargo,155.87
6,HSBC Holdings PLC,148.9
7,Morgan Stanley,140.83
8,China Construction Bank,139.82
9,Bank of China,136.81


In [94]:
def extract(url, table_attribs):
    ''' Scrape the first HTML table at `url` into a dataframe.

    The page is downloaded, parsed with BeautifulSoup, and every data row
    (a <tr> containing <td> cells) of the first <table> becomes one
    dataframe row. The first cell of each row is skipped and the following
    cells are assigned, in order, to the names in `table_attribs`; with
    ["Name", "MC_USD_Billion"] that is the second and third cell.

    Parameters
    ----------
    url : str
        Address of the page to download.
    table_attribs : list of str
        Column names of the returned dataframe.

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table, with the stripped cell text
        (strings) in the `table_attribs` columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains no <table> element.
    '''
    # A timeout keeps the notebook from hanging forever on a stalled server,
    # and an HTTP error is reported as such rather than parsed as a page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    first_table = soup.find_all('table')[0]

    # Collect plain records and build the dataframe once; concatenating a
    # one-row frame per iteration copies the whole frame every time.
    records = []
    for row in first_table.find_all('tr'):
        cells = row.find_all('td')
        # Header rows have no <td>; rows with too few cells cannot fill
        # every requested column, so both are skipped.
        if len(cells) <= len(table_attribs):
            continue
        values = (cell.text.strip() for cell in cells[1:])
        records.append(dict(zip(table_attribs, values)))

    return pd.DataFrame(records, columns=table_attribs)


In [95]:
# Run the extraction with the configured column names and display the result.
df = extract(url=url, table_attribs=table_attribs)
df

Unnamed: 0,Name,MC_USD_Billion
0,JPMorgan Chase,432.92
1,Bank of America,231.52
2,Industrial and Commercial Bank of China,194.56
3,Agricultural Bank of China,160.68
4,HDFC Bank,157.91
5,Wells Fargo,155.87
6,HSBC Holdings PLC,148.9
7,Morgan Stanley,140.83
8,China Construction Bank,139.82
9,Bank of China,136.81


# Task 3 : Transformation of data

In [97]:
# Load the locally saved exchange rates
rates = pd.read_csv(filepath_or_buffer='exchange_rate.csv')

# Build a {currency code: rate} lookup from the Currency and Rate columns
exchange_rate_dict = rates.set_index(keys='Currency').loc[:, 'Rate'].to_dict()

exchange_rate_dict


{'EUR': 0.93, 'GBP': 0.8, 'INR': 82.95}

In [101]:
# Make sure the USD figures are numeric; values that cannot be parsed become NaN
df['MC_USD_Billion'] = pd.to_numeric(df['MC_USD_Billion'], errors='coerce')

# Market cap in GBP: USD value times the GBP rate, rounded to 2 decimals
df['MC_GBP_Billion'] = df['MC_USD_Billion'].mul(exchange_rate_dict['GBP']).round(2)

# Market cap in EUR
df['MC_EUR_Billion'] = df['MC_USD_Billion'].mul(exchange_rate_dict['EUR']).round(2)

# Market cap in INR
df['MC_INR_Billion'] = df['MC_USD_Billion'].mul(exchange_rate_dict['INR']).round(2)

df


Unnamed: 0,Name,MC_USD_Billion,MC_GBP_Billion,MC_EUR_Billion,MC_INR_Billion
0,JPMorgan Chase,432.92,346.34,402.62,35910.71
1,Bank of America,231.52,185.22,215.31,19204.58
2,Industrial and Commercial Bank of China,194.56,155.65,180.94,16138.75
3,Agricultural Bank of China,160.68,128.54,149.43,13328.41
4,HDFC Bank,157.91,126.33,146.86,13098.63
5,Wells Fargo,155.87,124.7,144.96,12929.42
6,HSBC Holdings PLC,148.9,119.12,138.48,12351.26
7,Morgan Stanley,140.83,112.66,130.97,11681.85
8,China Construction Bank,139.82,111.86,130.03,11598.07
9,Bank of China,136.81,109.45,127.23,11348.39


In [102]:
def transform(df, csv_path='exchange_rate.csv'):
    ''' Add market-capitalisation columns in GBP, EUR and INR.

    The exchange rates are read from a CSV file with 'Currency' and 'Rate'
    columns. 'MC_USD_Billion' is converted to numeric (values that cannot
    be parsed become NaN) and multiplied by each rate, rounded to 2
    decimal places, giving the columns 'MC_GBP_Billion', 'MC_EUR_Billion'
    and 'MC_INR_Billion' (added in that order).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with an 'MC_USD_Billion' column. It is modified in place.
    csv_path : str, optional
        Path of the exchange-rate CSV file. Defaults to 'exchange_rate.csv'.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the three columns added.

    Raises
    ------
    KeyError
        If the rate file has no row for GBP, EUR or INR.
    '''
    # Build the {currency code: rate} lookup locally so the function does not
    # rely on a variable defined in another notebook cell.
    rates = pd.read_csv(csv_path)
    exchange_rate_dict = rates.set_index('Currency')['Rate'].to_dict()

    # Scraped values arrive as strings; coerce them to numbers first.
    df['MC_USD_Billion'] = pd.to_numeric(df['MC_USD_Billion'], errors='coerce')

    # Tuple order fixes the order of the new columns.
    for currency in ('GBP', 'EUR', 'INR'):
        converted = df['MC_USD_Billion'] * exchange_rate_dict[currency]
        df[f'MC_{currency}_Billion'] = converted.round(2)

    return df

# Task 4: Loading to CSV

In [103]:
def load_to_csv(df, csv_path):
    ''' Write the dataframe `df` to the file at `csv_path` in CSV
    format. Function returns nothing.'''
    df.to_csv(path_or_buf=csv_path)

In [104]:
# Save the transformed table to the configured CSV path.
load_to_csv(df=df, csv_path=csv_path)

# Task 5: Loading to Database

In [105]:
# Open a connection to the SQLite database file named in the configuration.
sql_connection = sqlite3.connect(database=db_name)


In [106]:
def load_to_db(df, sql_connection, table_name):
    ''' Store the dataframe `df` as the table `table_name` on
    `sql_connection`, replacing any existing table of that name. The
    dataframe index is not written. Function returns nothing.'''
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )

In [107]:
# Store the transformed table in the database.
load_to_db(df=df, sql_connection=sql_connection, table_name=table_name)

# Task 6: Function to Run queries on Database

In [108]:
def run_query(query_statement, sql_connection):
    ''' Print `query_statement`, execute it on `sql_connection`, and
    print the rows it returns. Function returns nothing. '''
    # Echo the statement first so it is shown even if execution fails.
    print(query_statement)
    print(pd.read_sql(sql=query_statement, con=sql_connection))

In [109]:
# Show every row and column of the loaded table.
query_statement = f"SELECT * from {table_name}"
run_query(query_statement=query_statement, sql_connection=sql_connection)

SELECT * from Largest_banks
                                      Name  MC_USD_Billion  MC_GBP_Billion  \
0                           JPMorgan Chase          432.92          346.34   
1                          Bank of America          231.52          185.22   
2  Industrial and Commercial Bank of China          194.56          155.65   
3               Agricultural Bank of China          160.68          128.54   
4                                HDFC Bank          157.91          126.33   
5                              Wells Fargo          155.87          124.70   
6                        HSBC Holdings PLC          148.90          119.12   
7                           Morgan Stanley          140.83          112.66   
8                  China Construction Bank          139.82          111.86   
9                            Bank of China          136.81          109.45   

   MC_EUR_Billion  MC_INR_Billion  
0          402.62        35910.71  
1          215.31        19204.58  
2    

In [110]:
# Average of the MC_GBP_Billion column over all rows of the table.
query_statement = f"SELECT AVG(MC_GBP_Billion) from {table_name}"
run_query(query_statement=query_statement, sql_connection=sql_connection)

SELECT AVG(MC_GBP_Billion) from Largest_banks
   AVG(MC_GBP_Billion)
0              151.987


In [111]:
# Names from the first five rows of the table.
query_statement = f"SELECT Name from {table_name} LIMIT 5"
run_query(query_statement=query_statement, sql_connection=sql_connection)

SELECT Name from Largest_banks LIMIT 5
                                      Name
0                           JPMorgan Chase
1                          Bank of America
2  Industrial and Commercial Bank of China
3               Agricultural Bank of China
4                                HDFC Bank


# Task 7 : Verify log entries

In [113]:
# Delete the log file that log_progress appends to (IPython `%rm` magic).
# NOTE(review): the section heading reads "Verify log entries", but this cell
# removes code_log.txt rather than displaying it — confirm this is intended.
%rm code_log.txt