In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from DATA.stock_invest_function import *


In [2]:
def calculate_correlation_between_dfs(df1, df2, start_date=None, end_date=None, method='pearson', min_periods=4):
    """
    Îëê Í∞úÏùò ÏãúÍ≥ÑÏó¥ DataFrameÏùò ÏÉÅÍ¥ÄÍ¥ÄÍ≥ÑÎ•º Í≥ÑÏÇ∞ÌïòÎêò, Ïú†Ìö® Í¥ÄÏ∏°ÏπòÍ∞Ä min_periodsÎ≥¥Îã§ ÎßéÏùÑ Í≤ΩÏö∞Îßå ÏàòÌñâ

    Parameters:
    ...
    - min_periods (int): ÏµúÏÜå Ïú†Ìö® Îç∞Ïù¥ÌÑ∞ Ïàò

    Returns:
    - pd.DataFrame: ÏÉÅÍ¥ÄÍ≥ÑÏàò Îß§Ìä∏Î¶≠Ïä§
    """
    if start_date:
        df1 = df1[df1.index >= pd.to_datetime(start_date)]
        df2 = df2[df2.index >= pd.to_datetime(start_date)]
    if end_date:
        df1 = df1[df1.index <= pd.to_datetime(end_date)]
        df2 = df2[df2.index <= pd.to_datetime(end_date)]

    combined = pd.merge(df1, df2, left_index=True, right_index=True, how='inner', suffixes=('_firm', '_hs'))

    corr_matrix = pd.DataFrame(index=df1.columns, columns=df2.columns, dtype=float)

    for firm in df1.columns:
        for hs in df2.columns:
            x = combined[firm]
            y = combined[hs]
            valid = x.notna() & y.notna()
            if valid.sum() >= min_periods:
                corr_matrix.loc[firm, hs] = x[valid].corr(y[valid], method=method)
            else:
                corr_matrix.loc[firm, hs] = np.nan  # ÎòêÎäî 0

    return corr_matrix

def get_top_correlated_hscode(corr_matrix, symbol, top_n=5, threshold=None, ascending=False):
    """
    ÌäπÏ†ï Í∏∞ÏóÖ(Symbol)Ïóê ÎåÄÌï¥ ÏÉÅÍ¥ÄÍ¥ÄÍ≥ÑÍ∞Ä ÎÜíÏùÄ HS ÏΩîÎìúÎ•º Ï∂îÏ∂úÌïòÎäî Ìï®Ïàò

    Parameters:
    - corr_matrix (pd.DataFrame): Symbol x HS_Code ÌòïÌÉúÏùò ÏÉÅÍ¥ÄÍ¥ÄÍ≥Ñ ÌñâÎ†¨
    - symbol (str): ÎåÄÏÉÅ Symbol (Ïòà: '000080')
    - top_n (int): ÏÉÅÏúÑ NÍ∞ú Ï∂îÏ∂ú (thresholdÏôÄ Ìï®Íªò ÏÇ¨Ïö© Ïãú Î¨¥ÏãúÎê† Ïàò ÏûàÏùå)
    - threshold (float or None): ÏÉÅÍ¥ÄÍ≥ÑÏàò ÌïòÌïúÍ∞í (Ïòà: 0.5 Ïù¥ÏÉÅÎßå Î≥¥Í∏∞). ÏÑ§Ï†ï Ïãú top_nÎ≥¥Îã§ Ïö∞ÏÑ†Ìï®
    - ascending (bool): ÏÉÅÍ¥ÄÍ≥ÑÏàò Í∏∞Ï§Ä Ïò§Î¶ÑÏ∞®Ïàú Ï†ïÎ†¨ Ïó¨Î∂Ä (Í∏∞Î≥∏Í∞í: False = ÎÜíÏùÄ Í∞í Ïö∞ÏÑ†)

    Returns:
    - pd.DataFrame: root_hs_code Î∞è ÏÉÅÍ¥ÄÍ≥ÑÏàòÎ•º Ìè¨Ìï®Ìïú ÏÉÅÏúÑ NÍ∞ú HS ÏΩîÎìú
    """

    if symbol not in corr_matrix.index:
        raise ValueError(f"Symbol '{symbol}' not found in correlation matrix.")

    symbol_corr = corr_matrix.loc[symbol].dropna()

    if threshold is not None:
        symbol_corr = symbol_corr[symbol_corr >= threshold]

    top_correlated = symbol_corr.sort_values(ascending=ascending).head(top_n)

    return top_correlated.reset_index().rename(columns={'index': 'root_hs_code', symbol: 'correlation'})

def get_top_correlated_symbols(corr_matrix, hs_code, top_n=5, threshold=None, ascending=False):
    """
    ÌäπÏ†ï HS ÏΩîÎìúÏóê ÎåÄÌï¥ ÏÉÅÍ¥ÄÍ¥ÄÍ≥ÑÍ∞Ä ÎÜíÏùÄ Í∏∞ÏóÖ SymbolÏùÑ Ï∂îÏ∂úÌïòÎäî Ìï®Ïàò

    Parameters:
    - corr_matrix (pd.DataFrame): Symbol x HS_Code ÌòïÌÉúÏùò ÏÉÅÍ¥ÄÍ¥ÄÍ≥Ñ ÌñâÎ†¨
    - hs_code (str or int): ÎåÄÏÉÅ HS ÏΩîÎìú (Ïòà: '151550')
    - top_n (int): ÏÉÅÏúÑ NÍ∞ú Ï∂îÏ∂ú
    - threshold (float or None): ÏÉÅÍ¥ÄÍ≥ÑÏàò ÌïòÌïúÍ∞í (Ïòà: 0.5 Ïù¥ÏÉÅÎßå Î≥¥Í∏∞)
    - ascending (bool): Ï†ïÎ†¨ Î∞©Ìñ• (False: ÎÜíÏùÄ ÏÉÅÍ¥Ä Ïö∞ÏÑ†)

    Returns:
    - pd.DataFrame: symbol Î∞è correlation Ï†ïÎ≥¥Î•º Îã¥ÏùÄ ÏÉÅÏúÑ NÍ∞ú Í≤∞Í≥º
    """

    if hs_code not in corr_matrix.columns:
        raise ValueError(f"HS code '{hs_code}' not found in correlation matrix columns.")

    hs_corr = corr_matrix[hs_code].dropna()

    if threshold is not None:
        hs_corr = hs_corr[hs_corr >= threshold]

    top_symbols = hs_corr.sort_values(ascending=ascending).head(top_n)

    return top_symbols.reset_index().rename(columns={'index': 'symbol', hs_code: 'correlation'})


In [3]:
db_info = {
    'host': 'hystox74.synology.me',
    'port': 3307,
    'user': 'stox7412',
    'password': 'Apt106503!~',
    'database': 'investar'
}

# SQLAlchemy ÏóîÏßÑ ÏÉùÏÑ±
engine = create_engine(
    f"mysql+pymysql://{db_info['user']}:{db_info['password']}@{db_info['host']}:{db_info['port']}/{db_info['database']}"
)

# ÌÖåÏù¥Î∏î Ïù¥Î¶Ñ
table_name = 'target_hs_code'

# Í≥†Ïú†Ìïú hs_code Í∞í Ï∂îÏ∂ú ÏøºÎ¶¨ Ïã§Ìñâ
query = f"SELECT DISTINCT hs_code FROM {table_name}"
unique_hs_codes_df = pd.read_sql(query, con=engine)
hs_codes  = unique_hs_codes_df['hs_code'].unique().tolist()

indicator = 'expDlr'

df_real = fetch_trade_data_multi_hscode(db_info, hs_codes, indicator)

# Î∂ÑÍ∏∞ Ï†ïÎ≥¥ Ï∂îÍ∞Ä
df_real['quarter'] = df_real['date'].dt.to_period('Q')

# Í∑∏Î£πÎ≥ÑÎ°ú Î∂ÑÍ∏∞Î≥Ñ Ìï©ÏÇ∞
df_quarterly = (
    df_real
    .groupby(['root_hs_code', 'quarter'])['value']
    .sum()
    .reset_index()
)

# üëâ Î∂ÑÍ∏∞ ÏõîÎßêÎ°ú Î≥ÄÌôò (Ïòà: 2007Q1 ‚Üí 2007-03-31)
df_quarterly['date'] = df_quarterly['quarter'].dt.to_timestamp(how='end')

# üëâ 'quarter' Ïª¨Îüº Ï†úÍ±∞
df_quarterly.drop(columns=['quarter'], inplace=True)

# 1Îã®Í≥Ñ: Î¨∏ÏûêÏó¥Î°ú ÏßÅÏ†ë Î≥ÄÌôòÌïòÎ†§Î©¥ to_datetime Ïù¥ÌõÑÏóê Î∞îÎ°ú strftime
df_quarterly['date'] = pd.to_datetime(df_quarterly['date']).dt.strftime('%Y-%m-%d')

def create_yoy_growth_pivot(df_quarterly, start_date=None, end_date=None):
    """
    Ï†ÑÎÖÑ ÎèôÎ∂ÑÍ∏∞ ÎåÄÎπÑ Ï¶ùÍ∞ÄÏú®ÏùÑ pivot ÌòïÌÉúÎ°ú Î≥ÄÌôòÌïòÍ≥† Î∂ÑÏÑùÍ∏∞Í∞ÑÏùÑ ÏÑ§Ï†ïÌï† Ïàò ÏûàÎäî Ìï®Ïàò

    Parameters:
    - df_quarterly (DataFrame): 'root_hs_code', 'date', 'yoy_growth' Ìè¨Ìï®Îêú Îç∞Ïù¥ÌÑ∞
    - start_date (str or None): Î∂ÑÏÑù ÏãúÏûëÏùº (Ïòà: '2015-01-01')
    - end_date (str or None): Î∂ÑÏÑù Ï¢ÖÎ£åÏùº (Ïòà: '2023-12-31')

    Returns:
    - pivot_df (DataFrame): Ìñâ: date, Ïó¥: root_hs_code, Í∞í: yoy_growth
    """
    # Pivot
    pivot_df = df_quarterly.pivot(
        index='date',
        columns='root_hs_code',
        values='yoy_growth'
    ).sort_index()

    # inf Í∞í NaN Ï≤òÎ¶¨
    pivot_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Î∂ÑÏÑù Í∏∞Í∞Ñ Ïä¨ÎùºÏù¥Ïã± (ÎÇ†ÏßúÍ∞Ä Î¨∏ÏûêÏó¥Ïù¥Î©¥ datetimeÏúºÎ°ú Î≥ÄÌôò)
    pivot_df.index = pd.to_datetime(pivot_df.index)

    if start_date:
        pivot_df = pivot_df[pivot_df.index >= pd.to_datetime(start_date)]
    if end_date:
        pivot_df = pivot_df[pivot_df.index <= pd.to_datetime(end_date)]

    return pivot_df


# Ï†ÑÎÖÑ ÎèôÎ∂ÑÍ∏∞ Í∞í (4Í∞ú Î∂ÑÍ∏∞ Ï†Ñ Í∞í) Í≥ÑÏÇ∞
df_quarterly['yoy_value'] = (
    df_quarterly
    .sort_values(['root_hs_code', 'date'])
    .groupby('root_hs_code')['value']
    .shift(4)
)

# ‚ùó yoy_growth Í≥ÑÏÇ∞
df_quarterly['yoy_growth'] = (
    (df_quarterly['value'] - df_quarterly['yoy_value']) / df_quarterly['yoy_value']
) * 100

quarterly_trade_data = create_yoy_growth_pivot(df_quarterly, start_date='2008-03', end_date='2025-03')

In [4]:
# DB Ï†ëÏÜç Ï†ïÎ≥¥ ÏÑ§Ï†ï
db_info = {
    'user': 'stox7412',         # Ïòà: 'root'
    'password': 'Apt106503!~', # Ïòà: '1234'
    'host': 'hystox74.synology.me',         # Ïòà: 'localhost' ÎòêÎäî IP
    'port': '3307',              # Í∏∞Î≥∏ Ìè¨Ìä∏Îäî Î≥¥ÌÜµ 3306
    'database': 'investar'        # Ïòà: 'trade_data'
}

fs_df = fetch_table_data(db_info, "Korea_FS_data")
fs_df.rename(columns={'Date': 'date'}, inplace=True)

# 1. indicator ÌïÑÌÑ∞ÎßÅ
target_indicator = 'Îß§Ï∂úÏï°(Ï≤úÏõê)'
filtered_df = fs_df[fs_df['indicator'] == target_indicator].copy()

# 2. ÎÇ†Ïßú Ï†ïÏ†ú Î∞è Ï†ïÎ†¨
filtered_df['date'] = pd.to_datetime(filtered_df['date'])
filtered_df.sort_values(by='date', inplace=True)

# 3. value Ïª¨ÎüºÏù¥ ÏûàÎäîÏßÄ ÌôïÏù∏ Î∞è ÌÉÄÏûÖ Í∞ïÏ†ú
if 'value' not in filtered_df.columns:
    raise KeyError("'value' Ïª¨ÎüºÏù¥ ÏóÜÏäµÎãàÎã§.")

filtered_df['value'] = pd.to_numeric(filtered_df['value'], errors='coerce')

# 4. ÌîºÎ≤ó ÌÖåÏù¥Î∏î ÏÉùÏÑ± (Ìñâ: date, Ïó¥: Symbol, Í∞í: value)
pivot_df = filtered_df.pivot_table(
    index='date',
    columns='Symbol',
    values='value',
    aggfunc='first'  # Ï§ëÎ≥µ Î∞©ÏßÄ
)

# 5. Ï†ÑÎÖÑ ÎèôÎ∂ÑÍ∏∞ ÎåÄÎπÑ Î≥ÄÌôîÏú® Í≥ÑÏÇ∞ (4Î∂ÑÍ∏∞ Ï†Ñ ÎåÄÎπÑ)
fs_yoy_growth_df = pivot_df.pct_change(periods=4) * 100

‚úÖ 'Korea_FS_data' ÌÖåÏù¥Î∏îÏóêÏÑú 537251Í±¥Ïùò Îç∞Ïù¥ÌÑ∞Î•º Í∞ÄÏ†∏ÏôîÏäµÎãàÎã§.


  fs_yoy_growth_df = pivot_df.pct_change(periods=4) * 100


In [5]:
correlation_result = calculate_correlation_between_dfs(
    fs_yoy_growth_df,
    quarterly_trade_data,
    start_date='2015-03-31',
    end_date='2025-03-31'
)

# ÏÉÅÏúÑ Î™á Í∞ú ÌôïÏù∏
correlation_result.head()

  X -= avg[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]


root_hs_code,121120,121221,151550,151590,170199,190230,190590,200599,200830,200899,...,903149,903180,903190,903289,940130,940199,940330,940540,950300,970191
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A000080,-0.203083,-0.12568,0.432674,-0.150449,0.14058,0.187691,0.137274,-0.008202,0.250289,0.646038,...,0.170516,-0.311499,-0.107542,-0.111817,-0.069895,0.055264,0.243781,-0.409647,-0.248206,0.208478
A000100,-0.272104,0.191272,-0.033654,-0.008584,0.14551,0.344208,-0.003791,0.205258,0.07667,-0.271464,...,-0.028939,0.165674,0.032395,0.203358,-0.091537,0.082916,-0.22399,-0.009093,-0.0116,-0.060364
A000120,0.012167,-0.087662,0.100831,0.066985,-0.495339,-0.116643,-0.34302,0.010813,-0.112004,-0.123431,...,-0.14577,0.100873,0.13556,0.14252,0.188131,0.582145,0.087016,0.340004,0.013623,-0.280244
A000150,0.083425,0.069001,0.044823,-0.087636,0.077243,0.073057,0.114927,-0.230696,-0.02533,-0.089083,...,0.070969,0.089904,-0.064571,-0.135937,0.042311,-0.797012,0.38341,0.009418,0.040064,0.731852
A000210,0.092882,0.065463,-0.217648,-0.002267,-0.057748,-0.181472,-0.263607,-0.382903,-0.164437,-0.682346,...,0.360745,0.261045,-0.032267,0.130797,-0.12861,-0.415848,0.429447,-0.472817,0.208341,0.721586


In [32]:
top_hs_codes = get_top_correlated_hscode(
    corr_matrix=correlation_result,  # Ïù¥Ï†ÑÏóê ÎßåÎì† ÏÉÅÍ¥ÄÍ¥ÄÍ≥Ñ ÌñâÎ†¨
    symbol='A051600',
    top_n=20,
    threshold=0.3  # ÏÑ†ÌÉùÏÇ¨Ìï≠
)

print(top_hs_codes)

   root_hs_code  correlation
0        300249     0.815665
1        970191     0.706812
2        854143     0.658273
3        294130     0.520792
4        854159     0.496898
5        853529     0.487111
6        870340     0.480725
7        845090     0.464730
8        620240     0.448979
9        870324     0.445493
10       842410     0.443128
11       300215     0.427555
12       950300     0.413712
13       870333     0.402147
14       870322     0.388791
15       870323     0.388301
16       610910     0.374557
17       630710     0.373197
18       902620     0.357883
19       681511     0.347502


In [34]:
top_symbols = get_top_correlated_symbols(
    corr_matrix=correlation_result,
    hs_code='854143',
    top_n=50,
    threshold=0.1  # ÏÑ†ÌÉùÏÇ¨Ìï≠
)

print(top_symbols)

     Symbol  correlation
0   A091810     0.984356
1   A247540     0.982537
2   A089590     0.981808
3   A066970     0.980794
4   A005850     0.975777
5   A023160     0.974569
6   A373220     0.971639
7   A028050     0.957922
8   A004370     0.956530
9   A114090     0.956050
10  A039130     0.954362
11  A272450     0.948988
12  A006360     0.945089
13  A018880     0.939133
14  A012330     0.938521
15  A316140     0.934411
16  A000270     0.933952
17  A280360     0.933695
18  A241560     0.933338
19  A005380     0.931006
20  A122870     0.924568
21  A005610     0.917304
22  A086790     0.912177
23  A024110     0.909354
24  A051910     0.908392
25  A006400     0.904835
26  A055550     0.903643
27  A034020     0.895021
28  A139130     0.894774
29  A248070     0.884445
30  A105560     0.884195
31  A138930     0.882672
32  A049770     0.882132
33  A064960     0.878298
34  A035420     0.876593
35  A175330     0.876397
36  A035250     0.872722
37  A189300     0.872482
38  A000150     0.869052
