In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text

In [2]:
path = r"C:\Users\82108\OneDrive\바탕 화면\investment\investment_strategy\DATA\KSE_FS_Dataguide.xlsx"

raw_df = pd.read_excel(path)

# 기준 행
row_header_1 = 8  # 항목명
row_header_2 = 9  # 단위, 코드, 분류 등

# 복합 컬럼명 생성
def combine_headers(col):
    item = str(raw_df.loc[row_header_1, col]) if pd.notna(raw_df.loc[row_header_1, col]) else ""
    unit = str(raw_df.loc[row_header_2, col]) if pd.notna(raw_df.loc[row_header_2, col]) else ""
    combined = f"{unit.strip()}: {item.strip()}" if unit and item else item or unit
    return combined if combined else col  # fallback

# 새로운 컬럼 리스트 생성
new_columns = [combine_headers(col) for col in raw_df.columns]

# 컬럼명 적용
df_cleaned = raw_df.copy()
df_cleaned.columns = new_columns
df_cleaned = df_cleaned.iloc[10:].reset_index(drop=True)

# "Name"을 "company_name"으로 변경
df_cleaned = df_cleaned.rename(columns={"Name": "company_name"})

# 확인
print(df_cleaned.columns.tolist())

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\82108\\OneDrive\\바탕 화면\\investment\\investment_strategy\\DATA\\KSE_FS_Dataguide.xlsx'

In [47]:
# ======================
# 1. 주요 지표 컬럼 이름 찾기
# ======================
def find_column(df, keyword):
    for col in df.columns:
        if keyword in col and '(천원)' in col:
            return col
    return None

# 주요 항목들 컬럼명 식별
col_sales = find_column(df_cleaned, '매출액')
col_gross = find_column(df_cleaned, '매출총이익')
col_operating = find_column(df_cleaned, '영업이익')
col_continuing = find_column(df_cleaned, '계속사업이익')
col_net_income = find_column(df_cleaned, '당기순이익')
col_noncurrent_liab = find_column(df_cleaned, '비유동부채')
col_equity = None
for col in df_cleaned.columns:
    if "지배" in col and "지분" in col:
        col_equity = col
        break

# ======================
# 2. 숫자형으로 변환
# ======================
for col in [col_sales, col_gross, col_operating, col_continuing, col_net_income, col_noncurrent_liab, col_equity]:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

# ======================
# 3. sort와 groupby 이용하여 YoY 증가율 계산
# ======================
df_cleaned = df_cleaned.sort_values(by=["Symbol", "회계년", "주기"])

# 증가율 계산 함수
def calc_yoy(df, col):
    return df[col].pct_change(periods=4)

# YoY 증가율 계산 (4분기 전과 비교)
df_cleaned["YoY_매출액"] = df_cleaned.groupby("Symbol")[col_sales].pct_change(periods=4, fill_method=None)
df_cleaned["YoY_매출총이익"] = df_cleaned.groupby("Symbol")[col_gross].pct_change(periods=4, fill_method=None)
df_cleaned["YoY_영업이익"] = df_cleaned.groupby("Symbol")[col_operating].pct_change(periods=4, fill_method=None)
df_cleaned["YoY_계속사업이익"] = df_cleaned.groupby("Symbol")[col_continuing].pct_change(periods=4, fill_method=None)
df_cleaned["YoY_당기순이익"] = df_cleaned.groupby("Symbol")[col_net_income].pct_change(periods=4, fill_method=None)

# ======================
# 4. 수익성 비율 계산
# ======================
# 수익성 비율
df_cleaned["매출총이익률"] = df_cleaned[col_gross] / df_cleaned[col_sales]
df_cleaned["영업이익률"] = df_cleaned[col_operating] / df_cleaned[col_sales]
df_cleaned["순이익률"] = df_cleaned[col_net_income] / df_cleaned[col_sales]

# 재무 안정성 비율
df_cleaned["비유동부채비율"] = df_cleaned[col_noncurrent_liab] / df_cleaned[col_equity]

# ======================
# 결과 미리보기
# ======================
display_cols = [
    "Symbol", "회계년", "주기", col_sales,
    "YoY_매출액", "YoY_영업이익", "영업이익률", "순이익률", "비유동부채비율"
]
print(df_cleaned[display_cols].dropna().head(10))

        Symbol   회계년  주기      매출액(천원)   YoY_매출액  YoY_영업이익     영업이익률      순이익률  \
14788  A000080  2005  1Q  178510186.0  0.119081  0.534541  0.328599  0.248931   
14789  A000080  2005  2Q  178139523.0  0.090801  0.311304  0.312639  0.994690   
14790  A000080  2005  3Q  185444855.0 -0.006677 -0.385215  0.206434  1.251166   
14791  A000080  2005  4Q  187615442.0  0.022223  0.143054  0.308197  0.312691   
14792  A000080  2006  1Q  180254189.0  0.009770 -0.371722  0.204454  0.143913   
14793  A000080  2006  2Q  174980431.0 -0.017734 -0.356911  0.204685  0.398813   
14794  A000080  2006  3Q  165448485.0 -0.107829 -0.365278  0.146865  0.130170   
14795  A000080  2006  4Q  180111181.0 -0.039998 -0.551457  0.143999 -0.007284   
14796  A000080  2007  1Q  165170484.0 -0.083680 -0.112984  0.197916  0.158397   
14797  A000080  2007  2Q  158097052.0 -0.096487 -0.121034  0.199124  0.153641   

        비유동부채비율  
14788 -1.254256  
14789 -1.290585  
14790  3.276397  
14791  2.658431  
14792  1.983518  


In [60]:
from calendar import monthrange

def create_date_column(df):
    # 각 월의 마지막 날짜 계산 함수
    def get_last_day_of_month(year, month):
        return monthrange(int(year), int(month))[1]

    # 날짜 열 생성
    df['Date'] = df.apply(
        lambda row: pd.to_datetime(
            f"{int(row['회계년'])}-{int(row['결산월'])}-{get_last_day_of_month(row['회계년'], row['결산월'])}"
        ),
        axis=1
    )

    # 주기가 분기형이면, 날짜를 분기 말일로 보정
    if '주기' in df.columns:
        quarterly_mask = df['주기'].astype(str).str.contains("Q")
        for idx in df[quarterly_mask].index:
            base_date = df.loc[idx, 'Date']
            df.loc[idx, 'Date'] = pd.date_range(end=base_date, periods=4, freq='3ME')[-1]

    return df

In [61]:
df_cleaned = create_date_column(df_cleaned)
df_cleaned

Unnamed: 0,Symbol,company_name,결산월,회계년,주기,매출액(천원),매출총이익(천원),영업이익(천원),계속사업이익(천원),당기순이익(천원),...,YoY_매출액,YoY_매출총이익,YoY_영업이익,YoY_계속사업이익,YoY_당기순이익,매출총이익률,영업이익률,순이익률,비유동부채비율,Date
14784,A000080,하이트진로,12,2004,1Q,1.595150e+08,71046357.00,38225258.00,0.000000e+00,5.270310e+05,...,,,,,,0.445390,0.239634,0.003304,-1.098554,2004-12-31
14785,A000080,하이트진로,3,2004,2Q,1.633107e+08,76477002.00,42471708.00,-9.903940e+08,-9.903940e+08,...,,,,,,0.468291,0.260067,-6.064475,-1.230107,2004-03-31
14786,A000080,하이트진로,6,2004,3Q,1.866913e+08,93642775.00,62269166.00,1.261560e+08,1.261560e+08,...,,,,,,0.501591,0.333541,0.675746,-1.291410,2004-06-30
14787,A000080,하이트진로,9,2004,4Q,1.835367e+08,90334442.00,50586054.00,8.642380e+08,-3.884177e+08,...,,,,,,0.492187,0.275618,-2.116295,-1.266575,2004-09-30
14788,A000080,하이트진로,3,2005,1Q,1.785102e+08,86502963.00,58658222.00,0.000000e+00,4.443663e+07,...,0.119081,0.217557,0.534541,,83.315021,0.484583,0.328599,0.248931,-1.254256,2005-03-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20235,A475150,SK이터닉스,12,2024,4Q,1.530490e+08,33452026.38,25147515.16,1.977347e+07,1.977347e+07,...,,,,,,0.218571,0.164310,0.129197,0.796187,2024-12-31
20236,A475150,SK이터닉스,3,2025,1Q,2.593315e+07,5867003.05,1062016.29,9.793571e+05,9.793571e+05,...,4.384762,3.256739,1.543975,-1.674839,-1.674839,0.226236,0.040952,0.037765,0.656433,2025-03-31
20237,A475150,SK이터닉스,6,2025,2Q,,,,,,...,,,,,,,,,,2025-06-30
20238,A475150,SK이터닉스,9,2025,3Q,,,,,,...,,,,,,,,,,2025-09-30


In [56]:
from calendar import monthrange

def create_date_column(df):
    # 각 월의 마지막 날짜 계산 함수
    def get_last_day_of_month(year, month):
        return monthrange(int(year), int(month))[1]

    # 날짜 열 생성
    df['Date'] = df.apply(
        lambda row: pd.to_datetime(
            f"{int(row['회계년'])}-{int(row['결산월'])}-{get_last_day_of_month(row['회계년'], row['결산월'])}"
        ),
        axis=1
    )

    # 주기가 분기형이면, 날짜를 분기 말일로 보정 (선택적 처리)
    if '주기' in df.columns:
        quarterly_mask = df['주기'].astype(str).str.contains("Q")
        for idx in df[quarterly_mask].index:
            base_date = df.loc[idx, 'Date']
            df.loc[idx, 'Date'] = pd.date_range(end=base_date, periods=4, freq='3M')[-1]

    return df


In [62]:
df_cleaned = create_date_column(df_cleaned)

In [76]:
def convert_to_long_format(df):
    # 제거할 열
    drop_cols = ["결산월", "회계년", "주기"]

    # ID 변수 (고정값 유지할 열들)
    id_vars = ["Symbol", "company_name", "Date"]

    # 나머지는 전부 indicator 대상 열로 melt 처리
    value_vars = [col for col in df.columns if col not in id_vars + drop_cols]

    # melt 실행
    df_long = pd.melt(df,
                      id_vars=id_vars,
                      value_vars=value_vars,
                      var_name="indicator",
                      value_name="value")

    return df_long

In [78]:
df_long = convert_to_long_format(df_cleaned)

# inf → NaN, 그 뒤 객체형 변환 보정
df_long = df_long.replace([np.inf, -np.inf], np.nan).infer_objects(copy=False)

# 또는 특정 열만 안전하게 처리
df_long["value"] = pd.to_numeric(df_long["value"], errors="coerce")
df_long = df_long.dropna(subset=["value"])


df_long

  df_long = df_long.replace([np.inf, -np.inf], np.nan).infer_objects(copy=False)


Unnamed: 0,Symbol,company_name,Date,indicator,value
0,A000080,하이트진로,2004-12-31,매출액(천원),1.595150e+08
1,A000080,하이트진로,2004-03-31,매출액(천원),1.633107e+08
2,A000080,하이트진로,2004-06-30,매출액(천원),1.866913e+08
3,A000080,하이트진로,2004-09-30,매출액(천원),1.835367e+08
4,A000080,하이트진로,2005-03-31,매출액(천원),1.785102e+08
...,...,...,...,...,...
768232,A475150,SK이터닉스,2024-03-31,비유동부채비율,9.776132e-01
768233,A475150,SK이터닉스,2024-06-30,비유동부채비율,9.372560e-01
768234,A475150,SK이터닉스,2024-09-30,비유동부채비율,9.005042e-01
768235,A475150,SK이터닉스,2024-12-31,비유동부채비율,7.961872e-01


In [79]:
def upload_fs_data_to_db(df_long, db_info, table_name="Korea_FS_data"):
    """
    DB에 long format 재무 데이터를 업로드하며 중복은 제거함.

    Parameters:
        df_long (pd.DataFrame): 업로드 대상 데이터프레임
        db_info (dict): DB 연결정보 {'user':, 'password':, 'host':, 'port':, 'database':}
        table_name (str): 업로드할 테이블명 (기본값: 'Korea_FS_data')
    """

    # SQLAlchemy 엔진 생성
    engine = create_engine(
        f"mysql+pymysql://{db_info['user']}:{db_info['password']}@{db_info['host']}:{db_info['port']}/{db_info['database']}"
    )

    # 기존 데이터 불러오기
    try:
        query = f"SELECT Symbol, Date, company_name FROM {table_name}"
        existing = pd.read_sql(query, engine)
        existing["Date"] = pd.to_datetime(existing["Date"])
    except Exception as e:
        print("⚠️ 기존 테이블이 없거나 조회 실패 → 신규 테이블로 간주:", e)
        existing = pd.DataFrame(columns=["Symbol", "Date", "company_name"])

    # 날짜 형식 통일
    df_long["Date"] = pd.to_datetime(df_long["Date"])

    # 기준 key로 중복 제거
    merge_keys = ["Symbol", "Date", "company_name"]
    merged = pd.merge(df_long, existing, on=merge_keys, how="left", indicator=True)
    df_new = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])

    # 신규 데이터 존재 시만 업로드
    if not df_new.empty:
        df_new.to_sql(name=table_name, con=engine, index=False, if_exists='append')
        print(f"✅ 신규 {len(df_new)}개 row 업로드 완료")
    else:
        print("✅ 업로드할 신규 데이터가 없습니다.")

In [80]:
db_info = {
    "user": 'stox7412',
    "password": 'Apt106503!~',
    "host": '192.168.0.230',
    "port": 3307,
    "database": "investar"
}

upload_fs_data_to_db(df_long, db_info)

✅ 업로드할 신규 데이터가 없습니다.
