In [37]:
import pandas as pd
import numpy as np
import requests
import xmltodict
import json
import traceback
from pandas.tseries.offsets import MonthEnd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import platform
from sqlalchemy import create_engine
import pymysql
from tqdm import tqdm

In [38]:
# ÌïúÍ∏Ä Ìè∞Ìä∏ ÏÑ§Ï†ï
if platform.system() == 'Windows':
    plt.rc('font', family='Malgun Gothic')  # ÏúàÎèÑÏö∞ Í∏∞Î≥∏ ÌïúÍ∏Ä Ìè∞Ìä∏
elif platform.system() == 'Darwin':  # macOS
    plt.rc('font', family='AppleGothic')
else:  # Linux (Ïòà: Google Colab)
    plt.rc('font', family='NanumGothic')

# ÎßàÏù¥ÎÑàÏä§ Íπ®Ïßê Î∞©ÏßÄ
plt.rcParams['axes.unicode_minus'] = False

In [39]:
def get_month_date(start, end, freq):
    mydates = pd.period_range(start, end, freq=freq)
    return mydates

def get_period_list(start, end, last_month):

    cut_num = last_month-12
    end = end+1

    period_list = []
    for y in range(start, end):
        for m in range(1,13):
            if len(str(m)) < 2:
                m = str(0)+ str(m)
            else:
                m = str(m)
            #print(str(y)+str(m))
            ym =str(y)+str(m)
            period_list.append(ym)
    period_list = period_list[:cut_num]
    return period_list

## ÌíàÎ™©Î≥Ñ ÏàòÏ∂úÏûÖÏã§Ï†Å Î™©Î°ù Í≤ÄÏÉâ
## 1ÎÖÑ 12Í∞úÏõîÎßå ÌïúÎ≤àÏóê Î∞õÏùÑ Ïàò ÏûàÏùå
# start =ÏãúÏûëÏùº, end = ÎßàÏßÄÎßâÎÇ†, 1ÎÖÑÍ∏∞Í∞Ñ 12, Î∞òÎÖÑ 6, Î∂ÑÍ∏∞ 3, hscode = 6ÏûêÎ¶¨ ÌòπÏùÄ 10ÏûêÎ¶¨

def get_country_export_by_item(start_list, end_list, hs_code):
    df_list = []
    for i, start in enumerate(start_list):
        end = end_list[i]
        service_key = '2o6NG3ixxDgGQ9S4dWUgsMac9WlxfX46%2BJvFRsAlsXQ6xVi6CZewvNJvbHd4S7exkWwt3YWoKSdwvUNb46kSTQ%3D%3D'
        url = f'https://apis.data.go.kr/1220000/Itemtrade/getItemtradeList?serviceKey={service_key}&strtYymm={start}&endYymm={end}&hsSgn={hs_code}'

        try:
            req = requests.get(url)
            json_dict = json.loads(json.dumps(xmltodict.parse(req.text), indent=4))
            items = json_dict['response']['body']['items']
            if items is None:
                print(f"‚ö†Ô∏è No data for HS {hs_code} from {start} to {end}")
                continue
            target_df = pd.DataFrame(items['item'])
            df_list.append(target_df)
        except Exception as e:
            print(f"‚ùå API ÏöîÏ≤≠ Ïã§Ìå®: {hs_code} ({start} ~ {end})")
            print(traceback.format_exc())
            continue

    if df_list:
        return pd.concat(df_list, ignore_index=True)
    else:
        return pd.DataFrame()


def batch_export_by_hscode(cd_list, start_list, end_list, batch_size=20, region_name='Ï†ÑÍµ≠'):
    all_export_q = []
    all_export_m = []
    error_list = []

    for i in range(0, len(cd_list), batch_size):
        hs_code_list = cd_list[i:i+batch_size]
        data_by_hscode = {}

        for hs_code in hs_code_list:
            try:
                target_df = get_country_export_by_item(start_list, end_list, hs_code)
                if target_df.empty:
                    print(f"‚ö†Ô∏è {hs_code}Ïùò Ïú†Ìö®Ìïú Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÏäµÎãàÎã§.")
                    continue

                target_df = target_df[target_df['year'] != 'Ï¥ùÍ≥Ñ'].copy()
                target_df['root_hs_code'] = hs_code
                data_by_hscode[hs_code] = target_df

            except Exception as e:
                print(f"‚ùå {hs_code} Ï≤òÎ¶¨ Ï§ë Ïò§Î•ò Î∞úÏÉù:")
                print(traceback.format_exc())
                error_list.append(hs_code)
                continue

        if not data_by_hscode:
            print("‚ö†Ô∏è Î≥ëÌï©Ìï† Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÏäµÎãàÎã§. Í±¥ÎÑàÎúÅÎãàÎã§.")
            continue

        try:
            merged_df = pd.concat(data_by_hscode).reset_index(drop=True)

            # ÎÇ†Ïßú Ï≤òÎ¶¨
            merged_df['new_date'] = pd.to_datetime(merged_df['year'].str.replace('.', '-')) + MonthEnd(0)
            merged_df.set_index('new_date', inplace=True)
            merged_df['new_year'] = merged_df.index.year
            merged_df['new_quarter'] = merged_df.index.quarter
            merged_df['new_month'] = merged_df.index.month

            # Ïà´ÏûêÌòï Ïª¨Îüº Î≥ÄÌôò
            numeric_cols = ['balPayments', 'expDlr', 'expWgt', 'impDlr', 'impWgt']
            for col in numeric_cols:
                if col in merged_df.columns:
                    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')
                else:
                    merged_df[col] = 0.0

            # Ï†ïÎ¶¨Îêú Ïª¨Îüº
            clean_df = merged_df[['hsCode', 'new_year', 'new_quarter', 'new_month', 'statKor', 'balPayments', 'expDlr', 'expWgt', 'impDlr', 'impWgt']].copy()
            clean_df['root_hs_code'] = merged_df['root_hs_code'].values
            clean_df['region'] = region_name

            # ÏõîÎ≥Ñ ÏßëÍ≥Ñ
            export_df_by_m = clean_df.groupby(['root_hs_code', 'new_year', 'new_quarter', 'new_month']).agg({
                'balPayments': 'sum', 'expDlr': 'sum', 'impDlr': 'sum'
            }).reset_index()

            # Î∂ÑÍ∏∞Î≥Ñ ÏßëÍ≥Ñ
            export_df_by_q = clean_df.groupby(['root_hs_code', 'new_year', 'new_quarter']).agg({
                'balPayments': 'sum', 'expDlr': 'sum', 'impDlr': 'sum'
            }).reset_index()

            export_df_by_q['region'] = region_name
            export_df_by_m['region'] = region_name

            all_export_q.append(export_df_by_q)
            all_export_m.append(export_df_by_m)

        except Exception as e:
            print("‚ùå Î≥ëÌï©/Ï†ïÎ¶¨ Ï§ë Ïò§Î•ò Î∞úÏÉù")
            print(traceback.format_exc())
            error_list += hs_code_list

    final_q = pd.concat(all_export_q) if all_export_q else pd.DataFrame()
    final_m = pd.concat(all_export_m) if all_export_m else pd.DataFrame()

    return final_q, final_m, error_list

from pandas.tseries.offsets import MonthEnd

def add_yoy_growth(df: pd.DataFrame, steps: int) -> pd.DataFrame:
    """
    root_hs_codeÎ≥ÑÎ°ú expDlr, impDlrÏùò Ï†ÑÎÖÑÎèôÍ∏∞ÎåÄÎπÑ Ï¶ùÍ∞ÄÏú®(%)ÏùÑ Í≥ÑÏÇ∞ÌïòÍ≥†,
    'date' Ïª¨ÎüºÏùÑ ÏõîÎßê Í∏∞Ï§ÄÏúºÎ°ú Ï∂îÍ∞ÄÌïòÎäî Ìï®Ïàò.
    Ïõî Îã®ÏúÑ Îç∞Ïù¥ÌÑ∞Îäî steps=12, Î∂ÑÍ∏∞ Îã®ÏúÑ Îç∞Ïù¥ÌÑ∞Îäî steps=4Î°ú ÏÑ§Ï†ïÌï©ÎãàÎã§.
    """
    df = df.copy()

    if steps == 12:
        # Ïõî Í∏∞Ï§Ä: ÏõîÎßê ÎÇ†Ïßú ÏÉùÏÑ±
        df['date'] = pd.to_datetime(df['new_year'].astype(str) + '-' + df['new_month'].astype(str) + '-01') + MonthEnd(0)

    elif steps == 4:
        # Î∂ÑÍ∏∞ Í∏∞Ï§Ä: Î∂ÑÍ∏∞ ÎßàÏßÄÎßâ ÏõîÎ°ú Îß§Ìïë
        end_month = df['new_quarter'].map({1: '03', 2: '06', 3: '09', 4: '12'})
        df['date'] = pd.to_datetime(df['new_year'].astype(str) + '-' + end_month + '-01') + MonthEnd(0)

    else:
        raise ValueError("steps Í∞íÏùÄ 12(Ïõî Îã®ÏúÑ) ÎòêÎäî 4(Î∂ÑÍ∏∞ Îã®ÏúÑ)Ïó¨Ïïº Ìï©ÎãàÎã§.")

    # Ï†ïÎ†¨ Î∞è Ï†ÑÎÖÑÎèôÍ∏∞ÎåÄÎπÑ ÏÑ±Ïû•Î•† Í≥ÑÏÇ∞
    df = df.sort_values(['root_hs_code', 'date'])

    df['expDlr_yoy'] = df.groupby('root_hs_code')['expDlr'].transform(lambda x: x.pct_change(periods=steps))
    df['impDlr_yoy'] = df.groupby('root_hs_code')['impDlr'].transform(lambda x: x.pct_change(periods=steps))

    return df

def plot_column_by_hscode(df, hs_code, col_name, start_date=None, end_date=None):
    """
    ÌäπÏ†ï root_hs_codeÏóê ÎåÄÌï¥ year_monthÎ•º XÏ∂ï, ÏßÄÏ†ïÎêú Ïª¨Îüº(col_name)ÏùÑ YÏ∂ïÏúºÎ°ú ÌïòÎäî ÎùºÏù∏Ï∞®Ìä∏Î•º Í∑∏Î¶ΩÎãàÎã§.

    Parameters:
        df (pd.DataFrame): 'year_month', 'root_hs_code', col_name Ïª¨ÎüºÏù¥ Ìè¨Ìï®Îêú DataFrame
        hs_code (str): ÏãúÍ∞ÅÌôîÌï† root_hs_code
        col_name (str): YÏ∂ïÏóê ÏÇ¨Ïö©Ìï† Ïª¨Îüº Ïù¥Î¶Ñ
        start_date (str or pd.Timestamp): ÏãúÏûë ÎÇ†Ïßú (Ïòà: '2020-01-01')
        end_date (str or pd.Timestamp): Ï¢ÖÎ£å ÎÇ†Ïßú (Ïòà: '2024-12-31')
    """
    # ÌïÑÌÑ∞ÎßÅ Î∞è Ï†ïÎ†¨
    target_df = df[df['root_hs_code'] == hs_code].sort_values('year_month')

    if target_df.empty:
        print(f"‚ö†Ô∏è root_hs_code {hs_code}Ïóê Ìï¥ÎãπÌïòÎäî Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÏäµÎãàÎã§.")
        return

    if col_name not in target_df.columns:
        print(f"‚ùå '{col_name}' Ïª¨ÎüºÏù¥ DataFrameÏóê ÏóÜÏäµÎãàÎã§.")
        return

    # ÎÇ†Ïßú Î≤îÏúÑ ÌïÑÌÑ∞ÎßÅ
    if start_date:
        target_df = target_df[target_df['year_month'] >= pd.to_datetime(start_date)]
    if end_date:
        target_df = target_df[target_df['year_month'] <= pd.to_datetime(end_date)]

    if target_df.empty:
        print(f"‚ö†Ô∏è ÏßÄÏ†ïÌïú ÎÇ†Ïßú Î≤îÏúÑÏóê Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÏäµÎãàÎã§.")
        return

    # Plot
    plt.figure(figsize=(12, 6))
    plt.plot(target_df['year_month'], target_df[col_name], marker='o', label=col_name)

    # ÎßàÏßÄÎßâ Í∞íÏóê ÌÖçÏä§Ìä∏ ÌëúÏãú
    last_x = target_df['year_month'].iloc[-1]
    last_y = target_df[col_name].iloc[-1]
    plt.text(last_x, last_y, f"{last_y * 100:,.2f}%", fontsize=25, ha='left', va='bottom', color='red')

    plt.title(f"{col_name} Ï∂îÏù¥ (root_hs_code: {hs_code})")
    plt.xlabel("Year-Month")
    plt.ylabel(col_name)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def reshape_to_long(df: pd.DataFrame) -> pd.DataFrame:
    id_vars = ['date', 'root_hs_code']
    value_vars = ['balPayments', 'expDlr', 'impDlr', 'expDlr_yoy', 'impDlr_yoy']

    long_df = df.melt(id_vars=id_vars, value_vars=value_vars,
                      var_name='indicator', value_name='value')
    return long_df

def upload_long_format_to_db(df_long: pd.DataFrame, db_info: dict, table_name: str = 'korea_monthly_trade_data', chunk_size: int = 1000):
    """
    long-format ÌòïÌÉúÏùò Î¨¥Ïó≠ Îç∞Ïù¥ÌÑ∞Î•º MySQL/MariaDBÏóê ÏóÖÎ°úÎìúÌïòÎäî Ìï®Ïàò (tqdm Ìè¨Ìï®)

    Parameters:
        df_long (pd.DataFrame): 'date', 'root_hs_code', 'indicator', 'value' Ìè¨Ìï®
        db_info (dict): DB Ï†ëÏÜç Ï†ïÎ≥¥
        table_name (str): ÏóÖÎ°úÎìúÌï† ÌÖåÏù¥Î∏îÎ™Ö
        chunk_size (int): tqdm ÌëúÏãúÏö© ÏóÖÎ°úÎìú Ï≤≠ÌÅ¨ ÌÅ¨Í∏∞ (Í∏∞Î≥∏Í∞í: 1000)
    """

    # ‚úÖ ÎÇ†Ïßú Ìè¨Îß∑ Ï†ïÎ¶¨
    df_long['date'] = pd.to_datetime(df_long['date'])

    # ‚úÖ Í≤∞Ï∏°Ïπò Î∞è inf Ï≤òÎ¶¨
    df_long = df_long.replace([np.inf, -np.inf], np.nan)
    df_long = df_long.where(pd.notnull(df_long), None)

    # ‚úÖ SQLAlchemy Ïó∞Í≤∞
    engine = create_engine(
        f"mysql+pymysql://{db_info['user']}:{db_info['password']}@{db_info['host']}:{db_info['port']}/{db_info['database']}"
    )
    conn = engine.raw_connection()
    cursor = conn.cursor()

    # ‚úÖ ÌÖåÏù¥Î∏î ÏÉùÏÑ± ÏøºÎ¶¨
    create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        `date` DATE,
        `root_hs_code` VARCHAR(20),
        `indicator` VARCHAR(50),
        `value` FLOAT,
        PRIMARY KEY (`date`, `root_hs_code`, `indicator`)
    );
    """
    cursor.execute(create_table_sql)
    conn.commit()

    # ‚úÖ Í∏∞Ï°¥ Îç∞Ïù¥ÌÑ∞ Ï°∞Ìöå
    existing_query = f"SELECT `date`, `root_hs_code`, `indicator` FROM {table_name}"
    existing_df = pd.read_sql(existing_query, engine)
    
    # ‚úÖ ÎÇ†Ïßú ÌòïÏãùÏùÑ datetimeÏúºÎ°ú Î≥ÄÌôò
    existing_df['date'] = pd.to_datetime(existing_df['date'])
    
    # ‚úÖ Ï§ëÎ≥µ Ï†úÍ±∞
    merged = pd.merge(df_long, existing_df, on=['date', 'root_hs_code', 'indicator'], how='left', indicator=True)
    df_to_upload = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])


    # ‚úÖ ÏóÖÎ°úÎìú Ïã§Ìñâ with tqdm
    if not df_to_upload.empty:
        print(f"üöÄ ÏóÖÎ°úÎìú ÎåÄÏÉÅ {len(df_to_upload)}Í±¥ ‚Üí chunk_size={chunk_size}")
        for i in tqdm(range(0, len(df_to_upload), chunk_size), desc="Uploading"):
            chunk = df_to_upload.iloc[i:i + chunk_size]
            chunk.to_sql(name=table_name, con=engine, if_exists='append', index=False)
        print(f"‚úÖ Ï¥ù {len(df_to_upload)}Í±¥ ÏóÖÎ°úÎìú ÏôÑÎ£å")
    else:
        print("‚ö†Ô∏è ÏóÖÎ°úÎìúÌï† ÏÉàÎ°úÏö¥ Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÏäµÎãàÎã§.")

    # ‚úÖ Ïó∞Í≤∞ Ï¢ÖÎ£å
    cursor.close()
    conn.close()
    

In [41]:
data = pd.read_excel(r'C:\Users\MetaM\PycharmProjects\pythonProject3\HS_Code_500\HS_Code_500.xlsx')
cd_array = data['HS_Code'].unique()
cd_list = cd_array.tolist()
cd_list

[854232,
 271019,
 854231,
 890590,
 870323,
 847330,
 870380,
 271012,
 854239,
 852412,
 870322,
 870324,
 890190,
 890120,
 870340,
 850760,
 852990,
 870899,
 284190,
 330499,
 847989,
 290243,
 852351,
 853400,
 903149,
 848690,
 290220,
 721049,
 870332,
 870840,
 851779,
 382499,
 848620,
 870829,
 401110,
 720851,
 300215,
 850710,
 390120,
 710691,
 260300,
 852411,
 390210,
 852491,
 870321,
 851713,
 853710,
 760612,
 300214,
 840999,
 870850,
 291736,
 847990,
 300249,
 853890,
 721070,
 390130,
 842952,
 290122,
 720839,
 391990,
 840810,
 392690,
 841810,
 390140,
 880730,
 390230,
 843149,
 390330,
 900120,
 848640,
 732690,
 390740,
 848180,
 720917,
 390690,
 870360,
 290121,
 848071,
 854449,
 850423,
 840991,
 870421,
 740311,
 903180,
 850440,
 854370,
 392062,
 902129,
 720916,
 842890,
 853224,
 901890,
 840130,
 730890,
 852492,
 841112,
 820730,
 381800,
 842720,
 790111,
 848630,
 190230,
 853669,
 841989,
 392119,
 853690,
 210690,
 903289,
 270730,
 851762,
 

In [29]:
# cd_list = ['854231', '854232', '854239']
start_list = [ '200701', '200801', '200901', 
              '201001', '201101', '201201', '201301', '201401', 
              '201501', '201601', '201701', '201801', '201901',
              '202001', '202101', '202201', '202301', '202401', '202501']
end_list = [ '200712', '200812', '200912', 
              '201012', '201112', '201212', '201312', '201412', 
              '201512', '201612', '201712', '201812', '201912',
              '202012', '202112', '202212', '202312', '202412', '202512']

export_q, export_m, error_list = batch_export_by_hscode(cd_list, start_list, end_list)

‚ö†Ô∏è No data for HS 870380 from 200701 to 200712
‚ö†Ô∏è No data for HS 870380 from 200801 to 200812
‚ö†Ô∏è No data for HS 870380 from 200901 to 200912
‚ö†Ô∏è No data for HS 870380 from 201001 to 201012
‚ö†Ô∏è No data for HS 870380 from 201101 to 201112
‚ö†Ô∏è No data for HS 870380 from 201201 to 201212
‚ö†Ô∏è No data for HS 870380 from 201301 to 201312
‚ö†Ô∏è No data for HS 870380 from 201401 to 201412
‚ö†Ô∏è No data for HS 870380 from 201501 to 201512
‚ö†Ô∏è No data for HS 870380 from 201601 to 201612
‚ö†Ô∏è No data for HS 271012 from 200701 to 200712
‚ö†Ô∏è No data for HS 271012 from 200801 to 200812
‚ö†Ô∏è No data for HS 271012 from 200901 to 200912
‚ö†Ô∏è No data for HS 271012 from 201001 to 201012
‚ö†Ô∏è No data for HS 271012 from 201101 to 201112
‚ö†Ô∏è No data for HS 852412 from 200701 to 200712
‚ö†Ô∏è No data for HS 852412 from 200801 to 200812
‚ö†Ô∏è No data for HS 852412 from 200901 to 200912
‚ö†Ô∏è No data for HS 852412 from 201001 to 201012
‚ö†Ô∏è No data for HS 852412 fr

In [30]:
# export_df_by_m ÏùÄ ÏõîÎ≥Ñ ÏàòÏ∂úÏûÖ Îç∞Ïù¥ÌÑ∞ÎùºÍ≥† Í∞ÄÏ†ï
export_m_with_yoy = add_yoy_growth(export_m,steps=12)
export_q_with_yoy = add_yoy_growth(export_q,steps=4)

export_m_with_yoy_resize = export_m_with_yoy[['date', 'root_hs_code', 'balPayments', 'expDlr', 'impDlr', 'expDlr_yoy', 'impDlr_yoy']].copy()
export_q_with_yoy_resize = export_m_with_yoy[['date', 'root_hs_code', 'balPayments', 'expDlr', 'impDlr', 'expDlr_yoy', 'impDlr_yoy']].copy()

export_m_with_yoy_resize['root_hs_code'] = export_m_with_yoy_resize['root_hs_code'].astype(str)
export_q_with_yoy_resize['root_hs_code'] = export_q_with_yoy_resize['root_hs_code'].astype(str)

trade_data_monthly = reshape_to_long(export_m_with_yoy_resize)
trade_data_quarterly= reshape_to_long(export_q_with_yoy_resize)

In [35]:
# DB Ï†ïÎ≥¥
db_info = {
    'host': 'hystox74.synology.me',
    'port': 3307,
    'user': 'stox7412',
    'password': 'Apt106503!~',
    'database': 'investar'
}

# long-format Î≥ÄÌôò
df_wide = export_m_with_yoy_resize.copy()
df_long = reshape_to_long(df_wide)

# ÏóÖÎ°úÎìú Ïã§Ìñâ
upload_long_format_to_db(df_long, db_info)

üöÄ ÏóÖÎ°úÎìú ÎåÄÏÉÅ 493525Í±¥ ‚Üí chunk_size=1000


Uploading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 494/494 [01:06<00:00,  7.41it/s]

‚úÖ Ï¥ù 493525Í±¥ ÏóÖÎ°úÎìú ÏôÑÎ£å





In [36]:
upload_long_format_to_db(trade_data_quarterly, db_info, table_name = 'korea_quarterly_trade_data')

üöÄ ÏóÖÎ°úÎìú ÎåÄÏÉÅ 494625Í±¥ ‚Üí chunk_size=1000


Uploading: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 495/495 [00:47<00:00, 10.41it/s]

‚úÖ Ï¥ù 494625Í±¥ ÏóÖÎ°úÎìú ÏôÑÎ£å



