In [1]:
import time
import math
import pandas as pd
import requests
import io

# Query settings substituted into the WITS page URL.
COUNTRY = "VNM"                  # value of the /country/ URL segment
TRADEFLOW = "Exports"            # "Exports" | "Imports"
PARTNER = "ALL"                  # value of the /partner/ URL segment
PRODUCT = "090111"               # HS code
YEARS = range(2005, 2024 + 1)    # 2005 → 2024, both ends included

def fetch_year(year: int) -> pd.DataFrame | None:
    url = f"https://wits.worldbank.org/trade/comtrade/en/country/{COUNTRY}/year/{year}/tradeflow/{TRADEFLOW}/partner/{PARTNER}/product/{PRODUCT}"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        html = requests.get(url, headers=headers, timeout=30).text
        tables = pd.read_html(io.StringIO(html))
        # Lấy bảng dài nhất (thường là bảng dữ liệu chính)
        df = max(tables, key=lambda t: t.shape[0]).copy()
    except Exception as e:
        print(f"[{year}] lỗi lấy/đọc bảng: {e}")
        return None

    # Chuẩn hóa cột
    df.columns = [str(c).strip().replace("\n", " ") for c in df.columns]

    # Thêm cột Year nếu trang không có
    if "Year" not in df.columns:
        df["Year"] = year

    # Chuyển các cột số (nếu có) về float
    for c in ["Trade Value 1000USD", "Quantity"]:
        if c in df.columns:
            df[c] = (df[c]
                     .astype(str)
                     .str.replace(",", "", regex=False)
                     .str.strip()
                     .replace({"": None, "nan": None}))
            # chỉ ép kiểu với chuỗi số hợp lệ
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df

# --- Download every requested year; failed or empty years are reported and skipped ---
all_frames = []
for y in YEARS:
    print(f"Đang lấy {y}…")
    df_y = fetch_year(y)
    if df_y is not None and len(df_y):
        all_frames.append(df_y)
        print(f"  -> {len(df_y)} dòng")
    else:
        print("  -> không thấy bảng hoặc rỗng")
    time.sleep(1.2)  # be polite to the server

if not all_frames:
    raise SystemExit("Không thu được năm nào.")

data = pd.concat(all_frames, ignore_index=True)

# --- Header clean-up ---
# Promote the first row to column labels only when the header was not parsed
# already; promoting unconditionally would discard a real data row.
if "Reporter" not in data.columns:
    header_row = data.iloc[0]
    # A column called "Year" at this point can only be the one fetch_year()
    # injected: its first cell holds a year value, not a label, so it keeps
    # its name instead of being renamed to e.g. 2005.
    promoted = {
        col: str(header_row[col]).strip()
        for col in data.columns
        if col != "Year"
    }
    data = data.rename(columns=promoted)
    # If the page has its own Year column, the injected one is now a duplicate
    # label; keep the first occurrence (the page's column).
    data = data.loc[:, ~data.columns.duplicated()].copy()

# Every yearly table repeats the header as a data row; drop all of those rows.
data = data[data["Reporter"] != "Reporter"].copy()

# --- Numeric columns ---
# Convert after the header is in place, so the columns can be found by name.
# Values that are not numbers become NaN.
for col in ("Trade Value 1000USD", "Quantity"):
    if col in data.columns:
        as_text = data[col].astype(str).str.replace(",", "", regex=False).str.strip()
        data[col] = pd.to_numeric(as_text, errors="coerce")

# --- Sort and save ---
sort_cols = [c for c in ["Year", "Partner", "Trade Value 1000USD"] if c in data.columns]
if sort_cols:
    data = data.sort_values(sort_cols).reset_index(drop=True)

# The year span in the file name follows YEARS instead of being hard-coded.
out_path = f"wits_{COUNTRY}_{TRADEFLOW}_{PRODUCT}_{min(YEARS)}_{max(YEARS)}.csv"
data.to_csv(out_path, index=False)
print(f"Đã lưu: {out_path} ({len(data)} dòng, {data['Year'].nunique()} năm)")

Đang lấy 2005…
  -> 89 dòng
Đang lấy 2006…
  -> 81 dòng
Đang lấy 2007…
  -> 90 dòng
Đang lấy 2008…
  -> 86 dòng
Đang lấy 2009…
  -> 78 dòng
Đang lấy 2010…
  -> 87 dòng
Đang lấy 2011…
  -> 70 dòng
Đang lấy 2012…
  -> 77 dòng
Đang lấy 2013…
  -> 72 dòng
Đang lấy 2014…
  -> 78 dòng
Đang lấy 2015…
  -> 78 dòng
Đang lấy 2016…
  -> 78 dòng
Đang lấy 2017…
  -> 79 dòng
Đang lấy 2018…
  -> 66 dòng
Đang lấy 2019…
  -> 68 dòng
Đang lấy 2020…
  -> 67 dòng
Đang lấy 2021…
  -> 67 dòng
Đang lấy 2022…
  -> 70 dòng
Đang lấy 2023…
  -> 72 dòng
Đang lấy 2024…
  -> 1 dòng
Đã lưu: wits_VNM_Exports_090111_2005_2024.csv (1434 dòng, 19 năm)


In [2]:
!pip install pymysql

Collecting pymysql
  Downloading pymysql-1.1.2-py3-none-any.whl.metadata (4.3 kB)
Downloading pymysql-1.1.2-py3-none-any.whl (45 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.1.2


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pymysql
from dotenv import load_dotenv
import os

In [4]:
# Read the database settings from the environment (populated from .env).
# NOTE(review): load_dotenv() is called without override, so a HOST/USER/...
# variable already present in the process environment presumably wins over the
# .env value — confirm this is intended (USER is commonly pre-set by the shell).
load_dotenv()
HOST = os.getenv("HOST")
PORT = int(os.getenv("PORT", "3306"))
USER = os.getenv("USER")
PASSWORD = os.getenv("PASSWORD")
DB = os.getenv("NAME")  # the database name is read from NAME, not DB
CA_PEM = os.getenv("CA_PEM")

# Report exactly which settings are missing, using the variable names that are
# actually read above.
required_env = {
    "HOST": HOST,
    "PORT": PORT,
    "USER": USER,
    "PASSWORD": PASSWORD,
    "NAME": DB,
    "CA_PEM": CA_PEM,
}
missing_env = [name for name, value in required_env.items() if not value]
if missing_env:
    raise SystemExit(
        f"Missing env vars: {', '.join(missing_env)}. "
        "Set HOST, PORT, USER, PASSWORD, NAME, CA_PEM in .env"
    )

In [5]:
# TLS options for the connection. pymysql's `ssl` dict is read for keys such as
# "ca" and "check_hostname"; a dict without a CA gives an encrypted connection
# whose server certificate is NOT verified.
# NOTE(review): assumes CA_PEM holds the path of a PEM CA file rather than the
# certificate text itself — confirm against the .env contents.
if CA_PEM and os.path.isfile(CA_PEM):
    # Verify the server certificate against the configured CA.
    ssl_options = {"ca": CA_PEM}
else:
    # Keep the connection encrypted, as before, but make the missing
    # verification visible instead of silent.
    print(f"Warning: CA file {CA_PEM!r} not found; server certificate will not be verified.")
    ssl_options = {"check_hostname": False}

connection = pymysql.connect(
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database=DB,
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
        ssl=ssl_options
    )

# Shared cursor used by the later cells of this notebook.
cursor = connection.cursor()
print("Database connected successfully!")

Database connected successfully!


# Task
Upload the `data` DataFrame to the `export_country` table in the connected database.

## Prepare data for database

### Subtask:
Select and rename columns in the `data` DataFrame to match potential database table column names.


**Reasoning**:
Select and rename the columns in the `data` DataFrame to prepare it for uploading to the database.



In [6]:
# Scraped column label -> snake_case name used by the database table.
column_map = {
    'Year': 'year',
    'Partner': 'partner',
    'Trade Value 1000USD': 'trade_value_1000usd',
    'Quantity': 'quantity',
    'Quantity Unit': 'quantity_unit',
}

# Select just those columns (in this order) and rename them; `data` itself is
# left untouched.
df_prepared = data[list(column_map)].rename(columns=column_map)

## Create database table

### Subtask:
Generate SQL code to create the `export_country` table in the database with appropriate column types based on the DataFrame.


**Reasoning**:
Generate SQL code to create the `export_country` table with appropriate column types and execute it using the cursor.



In [7]:
# Rebuild the table from scratch: drop any earlier version, then create it with
# one row per (year, partner).
drop_table_sql = """
DROP TABLE IF EXISTS export_country;
"""

create_table_sql = """
CREATE TABLE export_country (
    year INT NOT NULL,
    partner VARCHAR(255) NOT NULL,
    trade_value_1000usd DECIMAL(15, 2),
    quantity BIGINT,
    quantity_unit VARCHAR(50),
    PRIMARY KEY (year, partner)
);
"""

# Run the two statements in order, then commit.
for ddl_statement in (drop_table_sql, create_table_sql):
    cursor.execute(ddl_statement)
connection.commit()
print("Table 'export_country' created successfully.")

Table 'export_country' created successfully.


## Insert data into database

### Subtask:
Convert the DataFrame to a list of dictionaries or tuples and insert the data into the `export_country` table using the established database connection.


**Reasoning**:
Convert the DataFrame to a list of tuples and insert the data into the `export_country` table.



In [8]:
# One tuple per row, in the column order of the INSERT below. Missing values
# (NaN/None) are sent as None so they are stored as NULL; the driver refuses to
# send a float NaN.
data_to_insert = [
    tuple(None if pd.isna(value) else value for value in row)
    for row in df_prepared.itertuples(index=False, name=None)
]

insert_sql = """
INSERT INTO export_country (year, partner, trade_value_1000usd, quantity, quantity_unit)
VALUES (%s, %s, %s, %s, %s)
"""

# Insert everything in one transaction; on any failure undo the partial batch
# and let the error surface.
try:
    cursor.executemany(insert_sql, data_to_insert)
    connection.commit()
except Exception:
    connection.rollback()
    raise

print(f"Successfully inserted {len(data_to_insert)} rows into the export_country table.")

Successfully inserted 1434 rows into the export_country table.


## Verify data insertion

### Subtask:
Query the database to check if the data was inserted correctly.


**Reasoning**:
Execute SQL queries to count the rows and select the first few rows from the `export_country` table to verify data insertion.



In [9]:
# Row count. The cursor returns dict rows, so the single result column is
# looked up by its SQL expression text.
cursor.execute("SELECT COUNT(*) FROM export_country;")
count_record = cursor.fetchone()
row_count = count_record['COUNT(*)']
print(f"Number of rows in export_country table: {row_count}")

# Preview a handful of stored rows, one dict per line.
cursor.execute("SELECT * FROM export_country LIMIT 5;")
first_rows = cursor.fetchall()
print("First 5 rows in export_country table:")
for record in first_rows:
    print(record)

Number of rows in export_country table: 1434
First 5 rows in export_country table:
{'year': 2005, 'partner': 'Albania', 'trade_value_1000usd': Decimal('2447.54'), 'quantity': 1672150, 'quantity_unit': 'Kg'}
{'year': 2005, 'partner': 'Algeria', 'trade_value_1000usd': Decimal('7104.00'), 'quantity': 4853450, 'quantity_unit': 'Kg'}
{'year': 2005, 'partner': 'Argentina', 'trade_value_1000usd': Decimal('162.61'), 'quantity': 111098, 'quantity_unit': 'Kg'}
{'year': 2005, 'partner': 'Armenia', 'trade_value_1000usd': Decimal('85.74'), 'quantity': 58575, 'quantity_unit': 'Kg'}
{'year': 2005, 'partner': 'Australia', 'trade_value_1000usd': Decimal('11040.54'), 'quantity': 7542890, 'quantity_unit': 'Kg'}


In [18]:
# Relabel the unit column: every row's quantity_unit is set to 'Tons'.
# NOTE(review): the statement has no WHERE clause, so rows are relabelled
# whatever their previous unit was, and `quantity` itself is not changed here —
# the value conversion is a separate statement in another cell of this notebook.
# Running only one of the two leaves values and labels inconsistent.
update_unit_sql = """
UPDATE export_country
SET quantity_unit = 'Tons';
"""
cursor.execute(update_unit_sql)
connection.commit()
print("Quantity unit column updated to 'Tons'.")

Quantity unit column updated to 'Tons'.


In [17]:
# Divide every row's `quantity` by 1000 (the printed message describes this as
# a conversion to metric tons).
# NOTE(review): not idempotent — each execution divides again, so re-running
# this cell shrinks the stored values by a further factor of 1000. Re-create and
# reload the table before running it a second time.
# NOTE(review): `quantity` is declared BIGINT in this notebook's CREATE TABLE,
# so the quotient is stored as a whole number and fractional tons are lost.
update_sql = """
UPDATE export_country
SET quantity = quantity / 1000;
"""
cursor.execute(update_sql)
connection.commit()
print("Quantity column updated to metric tons.")

Quantity column updated to metric tons.


In [19]:
def show_top_partners_by_year(year, top_n=10):
    """Display the partners with the highest export value for one year.

    Queries ``export_country`` through the notebook's shared ``cursor``,
    ordered by ``trade_value_1000usd`` descending.

    Args:
        year: The year to filter the data (converted with ``int``).
        top_n: Maximum number of partners to show. Defaults to 10, the row
            limit this function has always applied.

    Returns:
        None. The rows are rendered with ``display``; a message is printed
        instead when the year has no rows.
    """
    # NOTE(review): the stored data appears to include an aggregate 'World'
    # partner row, which ranks first here — confirm whether it should be
    # excluded from a ranking of partners.
    sql_query = """
    SELECT partner, trade_value_1000usd, quantity
    FROM export_country
    WHERE year = %s
    ORDER BY trade_value_1000usd DESC
    LIMIT %s;
    """

    # Bound parameters instead of string formatting: the driver escapes the
    # values. LIMIT needs a real integer, hence the explicit int().
    cursor.execute(sql_query, (int(year), int(top_n)))
    results = cursor.fetchall()

    if not results:
        print(f"No data found for year {year}")
        return

    df_year_data = pd.DataFrame(results)
    # The title reports the number of rows actually shown.
    print(f"Top {len(results)} partners for year {year}:")
    display(df_year_data)


In [20]:
# Ask for the year interactively, then show that year's ranking.
raw_year = input("Enter a year: ")
year_input = int(raw_year)
show_top_partners_by_year(year_input)

Enter a year: 2023
Top 5 partners for year 2023:


Unnamed: 0,partner,trade_value_1000usd,quantity
0,World,2977954.67,1035180
1,Germany,440611.67,155843
2,Italy,318124.61,115835
3,United States,247811.86,84899
4,Japan,233694.7,76541
5,Russian Federation,192279.58,68949
6,Spain,160427.75,55763
7,Algeria,160171.54,53462
8,Belgium,138717.43,49351
9,Netherlands,107551.09,38337
