# 04. 재무제표 API 이용해보기

### dart-fss를 사용하기 위한 세팅 후, 기업 코드 불러오기

In [1]:
!pip install dart-fss -q

In [2]:
# pandas와 dart_fss모듈을 import합니다.
import pandas as pd
import dart_fss

from google.colab import drive
import os

from datetime import datetime

### dart-fss를 사용하기 위한 세팅 후, 기업 코드 불러오기

In [3]:
API_KEY = '69db60f6b16e7a7e91ae38ced61b62c7f914f789'
FS_DAY = '재무제표기준일'
REPORT_DAY = '레포트기준일'

In [4]:
dart_fss.set_api_key(api_key=API_KEY)

corp_list = dart_fss.corp.get_corp_list()
# 회사 이름으로 기업 코드 찾기
corp_code = corp_list.find_by_corp_name('삼성전자', exactly=True)[0].corp_code
corp_code



'00126380'

### 전자공시 시스템 DART에서 분기별 보고서 다운로드

In [5]:
# 현재 작업경로 확인
print(os.getcwd())

# drive모듈을 이용해서 드라이브 마운트하기
drive.mount('/content/drive')

# 다운받은 보고서 파일을 저장할 경로 지정
path = "/content/drive/MyDrive/temp2"

/content
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### DART의 재무제표 추출 메서드로 재무제표 다운로드 후 저장하고 로드하기

In [6]:
# 분기 키워드를 리스트로 저장, 이후 순회하며 사용
periods = ['annual', 'half', 'quarter']

# 주기별 보고서를 다운받아 로컬에 저장 후 dataframe으로 반환
def dir_exists(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path, exist_ok=True)

def save_report_if_not_exists(corp_code, period, start_date, dir_path):
    file_name = f"{corp_code}_{period}_{start_date[:4]}.xlsx"
    file_path = os.path.join(dir_path, file_name)

    if not os.path.exists(file_path):
        report = dart_fss.fs.extract(corp_code, start_date, separate=False, report_tp=[period])
        report.save(file_name, dir_path)

    return file_path

def download_reports(corp_code, start_date, dir_path):
    reports = {}
    dir_exists(dir_path)

    for period in periods:
        file_path = save_report_if_not_exists(corp_code, period, start_date, dir_path)
        reports[period] = pd.read_excel(file_path, sheet_name=None)

    return reports

In [7]:
start_yr = 2021
start_date = str(start_yr) + '0101'
fs_reports = download_reports(corp_code, start_date, path)

### 엑셀에 포함되어 있던 여러 시트 중 필요한 시트만 추려내기

In [8]:
# 보고서 데이터프레임에서 필요한 데이터가 있는 시트만 추려내는 함수
def extract_data_sheets(reports):
  # 매개변수 reports는 각 분기의 레포트를 받음(fs_df['annual'])
  return{
      # 필요한 시트를 딕셔너리로 반환 {보고서이름: 보고서내용}
      sheet_name[-2:]: sheet_data
      for sheet_name, sheet_data in reports.items()
      if sheet_name.startswith('Data') and sheet_name[-3] == '_'
  }

In [9]:
# 필요한 시트를 분기 키워드에 맞게 딕셔너리로 반환
def get_fs_dict(fs_reports):
  return {
      # 분기별 시트를 딕셔너리로 반환 {분기: 보고서딕셔너리}
      period: extract_data_sheets(reports)
      for period, reports in fs_reports.items()
  }

In [10]:
# 분기별 재무제표 데이터를 딕셔너리로 정리
fs_df = get_fs_dict(fs_reports)

# 호출 방법 (bs: 재무상태표, is: 손익계산서, cf: 현금흐름표)
fs_df['annual']['bs'].head()

Unnamed: 0.1,Unnamed: 0,"[D210000] 재무상태표, 유동/비유동법 - 연결 | Statement of financial position, current/non-current - Consolidated financial statements (Unit: KRW)",Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,20231231,20221231,20211231,20201231,20191231,20181231
0,,concept_id,label_ko,label_en,class0,class1,class2,class3,class4,"('연결재무제표',)","('연결재무제표',)","('연결재무제표',)","('연결재무제표',)","('연결재무제표',)","('연결재무제표',)"
1,,,,,,,,,,,,,,,
2,0.0,ifrs-full_CurrentAssets,유동자산,Current assets,ifrs-full:StatementOfFinancialPositionAbstract,자산 [개요],유동자산,,,195936557000000,218470581000000,218163185000000,198215579000000,181385260000000,174697424000000
3,1.0,ifrs-full_CashAndCashEquivalents,현금및현금성자산,Cash and cash equivalents,ifrs-full:StatementOfFinancialPositionAbstract,자산 [개요],유동자산,현금및현금성자산,,69080893000000,49680710000000,39031415000000,29382578000000,26885999000000,30340505000000
4,2.0,ifrs-full_ShorttermDepositsNotClassifiedAsCash...,단기금융상품,Short-term financial instruments,ifrs-full:StatementOfFinancialPositionAbstract,자산 [개요],유동자산,단기금융상품,,22690924000000,65102886000000,81708986000000,92441703000000,76252052000000,65893797000000


### 데이터프레임의 인덱스와 컬럼 정리하기

In [11]:
# 재무제표 타입마다 다른 라벨의 수를 구하는 함수.
def get_last_label_position(col_list):
  end_idx, bef = 0, 0
  for i in range(1, len(col_list)):
    front_letters = col_list[i][:2]
    if ('Un' == front_letters) and bef == 0:
      end_idx += 1
      bef = 1
    elif ('Un' == front_letters) and bef == 1:
      end_idx += 1
    elif ('Un' != front_letters) and bef == 0:
      end_idx += 1
      continue
    else:
      break
  return end_idx

In [12]:
def get_last_label_position(col_list):
  last_idx = 0
  for i in range(1, len(col_list)):
    if col_list[i][:2] == 'Un':
      last_idx += 1
    elif last_idx > 0:
      # 이미 'Un'을 만난 후 다른 값을 만나면 중단
      break
    else:
      last_idx += 1
  return last_idx

In [13]:
# 불필요한 라벨 삭제, 행열 전치
def arrange_labels(df):
  last_label_idx = get_last_label_position(df.columns)

  # 칼럼 이름 바꾸기
  df.rename(columns={df.columns[idx]: df.iloc[0, idx] for idx in range(last_label_idx + 1)}, inplace=True)

  # 라벨의 라벨 역할을 하는 0, 1번째 행 삭제
  df.drop([0,1], axis=0, inplace=True)

  # 사용하지 않는 라벨 칼럼 삭제
  df.drop(df.columns[0:2], axis=1, inplace=True)
  df.drop(df.columns[1:last_label_idx-1], axis=1, inplace=True)

  # 인덱스 설정 및 전치
  df.set_index(df.columns[0], inplace=True)
  df = df.transpose()

  return df

In [14]:
fs_types = fs_df['annual'].keys()

for period in periods:
  for fs_type in fs_types:
    fs_df[period][fs_type] = arrange_labels(fs_df[period][fs_type])

### 재무제표 종류별로 필요한 데이터만 필터링

In [15]:
# 재무제표별로 필요한 데이터를 리스트로 보관
type_labels = {'bs': ['자산총계', '부채총계', '자본총계', '유동자산', '비유동자산', '유동부채', '비유동부채'],
               'is_annual': ['영업수익', '매출총이익', '영업이익', '당기순이익(손실)'],
               'is_half': ['수익(매출액)', '매출총이익', '영업이익', '당기순이익(손실)'],
               'is_quarter': ['매출액', '매출총이익', '영업이익'],
               'cf': ['영업활동현금흐름', '투자활동현금흐름', '재무활동현금흐름'],
               'cf_': ['영업활동 현금흐름', '투자활동 현금흐름', '재무활동 현금흐름']}

In [16]:
for period in periods:
  for fs_type in fs_types:
    if (fs_type=='cf') and not (type_labels['cf'][0] in fs_df[period][fs_type].columns):
      # 지표에 공백이 들어간 현금흐름표의 경우 알맞은 지표로 필터링
      fs_df[period][fs_type] = fs_df[period][fs_type].loc[:, type_labels['cf_']]
    elif (fs_type=='is'):
      type_key = 'is_' + period
      fs_df[period][fs_type] = fs_df[period][fs_type].loc[:, type_labels[type_key]]
    else:
      fs_df[period][fs_type] = fs_df[period][fs_type].loc[:, type_labels[fs_type]]

In [17]:
# 손익계산서에서 주기마다 다른 '영업수익'의 이름 통일
fs_df['half']['is'].rename(columns={'수익(매출액)':'영업수익'}, inplace=True)
fs_df['quarter']['is'].rename(columns={'매출액':'영업수익'}, inplace=True)

### 시계열 피처 만들기

In [18]:
# 재무제표 데이터의 기준일을 index에서 시계열 피처로
for period in periods:
  for fs_type in fs_types:
    fs_df[period][fs_type].reset_index(inplace=True)
    fs_df[period][fs_type].rename(columns={'index':REPORT_DAY}, inplace=True)

### 분기별로, 재무제표별로 나눠진 데이터셋을 하나로 병합하기 위한 기초 데이터프레임 만들기

In [19]:
# 지정된 기간동안의 재무제표 데이터에 대한 기준날짜를 시계열 피처로 하는 빈 데이터프레임 생성
def create_quarter_df(start_yr):
  start_yr_of_data = start_yr - 1
  current_year = datetime.now().year
  current_month = datetime.now().month
  terms_of_yr = ['0331', '0630', '0930', '1231']
  terms_list = []

  terms_list.append(pd.to_datetime(str(start_yr_of_data)+'1231'))
  for yr in range(start_yr_of_data + 1, current_year + 1):
    for term in terms_of_yr:
      if (yr < current_year or (yr == current_year and int(term[:2]) < current_month - 3)):
        terms_list.append(pd.to_datetime(str(yr) + term))
      elif (yr == current_year):
        break

  df = pd.DataFrame({FS_DAY: terms_list})
  return df

In [20]:
df = create_quarter_df(start_yr)

### 각 재무제표마다 지정된 년도 이후의 데이터만 남기기

In [21]:
def get_year(x):
  return x.year

for period in periods:
  is_valid = pd.to_datetime(fs_df[period]['bs'][REPORT_DAY]).apply(get_year) >= start_yr
  if (period == 'annual'):
    is_valid = pd.to_datetime(fs_df[period]['bs'][REPORT_DAY]).apply(get_year) >= (start_yr - 1)
  fs_df[period]['bs'] = fs_df[period]['bs'][is_valid]
  fs_df[period]['bs'].insert(0, FS_DAY, pd.to_datetime(fs_df[period]['bs'][REPORT_DAY]))

### 기초 데이터프레임에 BS 데이터 병합하기

In [22]:
# 1, 3, 4분기 bs데이터 from quarter 병합
df = pd.merge(df, fs_df['quarter']['bs'], on=FS_DAY, how='outer')

# 4분기 bs데이터 from annual 병합 후 중복 삭제
df = pd.merge(df, fs_df['annual']['bs'], how='outer').sort_values(FS_DAY)
df = df.drop_duplicates(subset=FS_DAY, keep='last')

# 인덱스 리셋
df = df.reset_index(drop=True)

# 2분기 bs데이터 from half 병합 => 2분기 날짜가 중복생성됨
df = pd.merge(df, fs_df['half']['bs'].sort_values(FS_DAY).reset_index(drop=True), how='outer').sort_values(FS_DAY)

# 중복 날짜 중 새로 추가된 데이터를 남김
df = df.drop_duplicates(subset=FS_DAY, keep='last')

# 인덱스 리셋
df = df.reset_index(drop=True)

# 레포트 기준일 삭제
df = df.drop(REPORT_DAY, axis=1)

### IS 데이터 병합하기

In [23]:
def get_start_month(x):
  return x[5]

def get_end_date(x):
  return x[9:]

def get_partial_fs(fs_df, period, report_type):
  # 손익계산서 기준기간이 1월 1일부터인 항목만 필터링
  is_from_jan = (fs_df[period][report_type][REPORT_DAY].apply(get_start_month)=='1')
  partial_is = fs_df[period][report_type][is_from_jan]

  # 기준기간의 종료일(=레포트기준일)만 찾아서 재무제표기준일 피처로
  partial_is.insert(0, FS_DAY, pd.to_datetime(partial_is[REPORT_DAY].apply(get_end_date)))

  # 재무제표기준일이 분석 시작년도 이후인 데이터만 필터링
  year_to_get = start_yr
  if (period == 'annual'):
    year_to_get = start_yr -1
  partial_is = partial_is[partial_is[FS_DAY].apply(get_year) >= year_to_get]

  return partial_is

In [24]:
is_13 = get_partial_fs(fs_df, 'quarter', 'is')
is_2 = get_partial_fs(fs_df, 'half', 'is')
is_4 = get_partial_fs(fs_df, 'annual', 'is')

# 각 분기 IS 데이터 병합, 정렬, 재인덱스
is_combined = pd.concat([is_13, is_2, is_4]).sort_values(FS_DAY).reset_index(drop=True)
is_combined = is_combined.drop(REPORT_DAY, axis=1)
is_combined

df = pd.merge(df, is_combined, how='outer')

### CF 데이터 병합하기

In [25]:
# 반기 보고서만 피처이름이 다른 것 보정
fs_df['half']['cf'].rename(columns={'영업활동 현금흐름':'영업활동현금흐름',
                                    '투자활동 현금흐름':'투자활동현금흐름',
                                    '재무활동 현금흐름':'재무활동현금흐름'}, inplace=True)

In [26]:
def get_partial_cf(period, fs_df):
  # 손익계산서 기준기간이 1월 1일부터인 항목만 필터링
  cf_from_jan = (fs_df[period]['cf'][REPORT_DAY].apply(get_start_month)=='1')
  partial_cf = fs_df[period]['cf'][cf_from_jan]

  # 기준기간의 종료일(=레포트기준일)만 찾아서 재무제표기준일 피처로
  partial_cf.insert(0, FS_DAY, pd.to_datetime(partial_cf[REPORT_DAY].apply(get_end_date)))

  year_to_get = start_yr
  if (period == 'annual'):
    year_to_get = start_yr -1
  partial_cf = partial_cf[partial_cf[FS_DAY].apply(get_year) >= year_to_get]

  return partial_cf

In [27]:
cf_13 = get_partial_fs(fs_df, 'quarter', 'cf').sort_values(FS_DAY)
cf_2 = get_partial_fs(fs_df, 'half', 'cf').sort_values(FS_DAY)
cf_4 = get_partial_fs(fs_df, 'annual', 'cf').sort_values(FS_DAY)

# 각 분기 CF 데이터 병합, 정렬, 재인덱스
cf_combined = pd.concat([cf_13, cf_2, cf_4]).sort_values(FS_DAY).reset_index(drop=True)
cf_combined = cf_combined.drop(REPORT_DAY, axis=1)
cf_combined

df = pd.merge(df, cf_combined, how='outer')

### 결측치 처리하기: 당기순이익

In [28]:
fs_reports = download_reports(corp_code, start_date, path)

quarter_is = fs_reports['quarter']['Data_is'].copy()

# 컬럼 이름 바꾸기
last_label = 6
for idx in range(0, last_label):
  old_nm = quarter_is.columns[idx]
  new_nm = quarter_is.iloc[0, idx]
  quarter_is.rename(columns={old_nm:new_nm}, inplace=True)

# 0, 1번째 행 삭제
quarter_is = quarter_is.iloc[2:]

# 라벨 칼럼 삭제
quarter_is = quarter_is.iloc[:, 2:]
quarter_is = quarter_is.drop(quarter_is.columns[1:last_label-1], axis=1)

# 라벨을 인덱스로해서 행열전환
quarter_is.set_index('label_ko', inplace=True)
quarter_is = quarter_is.transpose()

quarter_is = quarter_is.loc[:, ['법인세비용차감전순이익(손실)', '법인세비용(수익)']]
quarter_is.reset_index(inplace=True)

 # 손익계산서 기준기간이 1월 1일부터인 항목만 필터링
is_from_jan = (quarter_is['index'].apply(get_start_month)=='1')
quarter_is_partial = quarter_is[is_from_jan]

# 기준기간의 종료일(=레포트기준일)만 찾아서 재무제표기준일 피처로
quarter_is_partial.insert(0, FS_DAY, pd.to_datetime(quarter_is_partial['index'].apply(get_end_date)))
quarter_is_partial.drop(['index'], axis=1, inplace=True)

# 재무제표기준일이 분석 시작년도 이후인 데이터만 필터링
quarter_is_partial = quarter_is_partial[quarter_is_partial[FS_DAY].apply(get_year) >= start_yr]

quarter_is_partial['당기순이익(손실)'] = quarter_is_partial['법인세비용차감전순이익(손실)'] - quarter_is_partial['법인세비용(수익)']
quarter_is_partial.drop(['법인세비용차감전순이익(손실)', '법인세비용(수익)'], axis=1, inplace=True)

# 당기순이익을 채울 데이터프레임을 재무제표기준일 순으로 정렬한 후
quarter_is_partial.sort_values(FS_DAY, inplace=True)

# df의 '당기순이익(손실)'열에 결측치가 있는 행의 index를 가져와서 re-index한다.
quarter_is_partial = quarter_is_partial.set_index(df[df['당기순이익(손실)'].isna()].index, drop=True)

# 기존 데이터프레임의 결측치 채우기
# 결측치를 채운다.
df['당기순이익(손실)'] = df['당기순이익(손실)'].fillna(quarter_is_partial['당기순이익(손실)'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quarter_is_partial.drop(['index'], axis=1, inplace=True)


### 결측치 처리하기: 비어있는 특정일자

In [29]:
# 결측치가 있는 행들의 인덱스를 리스트로 반환하고, 결측치가 있는 칼럼을 출력하는 함수
def get_na_idx(df):
  # na값 찾기
  na_values = df.isna()

  # na값이 있는 칼럼 찾아서 출력: 어떤 항목을 계산해야하는지 파악하기 위함
  cols_with_na = na_values.any(axis=0)
  cols_with_na_names = df.columns[cols_with_na].tolist()
  print("결측치가 있는 칼럼:", cols_with_na_names)

  # na값이 있는 행의 index 찾아서 리스트로 반환
  rows_with_na = na_values.any(axis=1)
  rows_with_na_indices = df[rows_with_na].index.tolist()
  return rows_with_na_indices

In [30]:
# 총계항목(whole)에 대한 특정부분(part) 비율의 평균을 구해 총계항목의 값을 채우는 함수
def fill_whole_w_mean(df, na_idx, part, rest, whole):
  mean_ratio = (df[part] / df[whole]).mean()
  df[whole][na_idx] = round(df[part][na_idx] / mean_ratio)
  df[rest][na_idx] = df[whole][na_idx] - df[part][na_idx]

# 총계항목(whole)에 대한 특정부분(part) 비율의 평균을 구해 특정부분의 값을 채우는 함수
def fill_part_w_mean(df, na_idx, part, rest, whole):
  mean_ratio = (df[part] / df[whole]).mean()
  df[part][na_idx] = round(df[whole][na_idx] / mean_ratio)
  df[rest][na_idx] = df[whole][na_idx] - df[part][na_idx]

# 결측치가 있는 행의 인덱스를 parameter로 받아 결측치를 채우는 함수
def fill_bs_na(df, na_idx):
  # 자본총계에 대한 자산총계 비율의 평균값으로 결측치 채우기
  fill_whole_w_mean(df, na_idx, '자본총계', '부채총계', '자산총계')

  # 유동자산에 대한 자산총계 비율의 평균값으로 결측치 채우기
  fill_part_w_mean(df, na_idx, '유동자산', '비유동자산', '자산총계')

  # 유동부채에 대한 부채총계 비율의 평균값으로 결측치 채우기
  fill_part_w_mean(df, na_idx, '유동부채', '비유동부채', '부채총계')

In [31]:
# 결측치가 있는 행들에 대해 작성해둔 함수 적용
na_idxs = get_na_idx(df)
for idx in na_idxs:
  fill_bs_na(df, idx)

결측치가 있는 칼럼: ['자산총계', '부채총계', '유동자산', '비유동자산', '유동부채', '비유동부채']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[whole][na_idx] = round(df[part][na_idx] / mean_ratio)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[rest][na_idx] = df[whole][na_idx] - df[part][na_idx]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[part][na_idx] = round(df[whole][na_idx] / mean_ratio)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [33]:
df

Unnamed: 0,재무제표기준일,자산총계,부채총계,자본총계,유동자산,비유동자산,유동부채,비유동부채,영업수익,매출총이익,영업이익,당기순이익(손실),영업활동현금흐름,투자활동현금흐름,재무활동현금흐름
0,2020-12-31,378235718000000,102287702000000,275948016000000,198215579000000,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,26407832000000,65287009000000,-53628591000000,-8327839000000
1,2021-03-31,392826274000000,118557683000000,274268591000000,209155353000000,183670921000000,90109462000000,28448221000000,65388503000000,23888518000000,9382868000000,12359605000000,13803078000000,-1577609000000,-594323000000
2,2021-06-30,384777669000000,102453403000000,282324266000000,191118524000000,193659145000000,72461451000000,29991952000000,129060088000000,50494172000000,21949613000000,16776171000000,25889535000000,-5746952000000,-19642937000000
3,2021-09-30,410420718000000,113654600000000,296766118000000,212793019000000,197627699000000,81871964000000,31782636000000,203039275000000,81574488000000,37767144000000,48908386000000,44470987000000,-22696442000000,-20190702000000
4,2021-12-31,426621158000000,121721227000000,304899931000000,218163185000000,208457973000000,88117133000000,33604094000000,279604799000000,113193457000000,51633856000000,39907450000000,65105448000000,-33047763000000,-23991033000000
5,2022-03-31,439326959000000,124036040000000,315290919000000,232369082000000,206957877000000,90463701000000,33572339000000,77781498000000,30709441000000,14121409000000,18815052000000,10453069000000,-636511000000,-495346000000
6,2022-06-30,448040650000000,120133986000000,327906664000000,236287491000000,211753159000000,83362268000000,36771718000000,154985105000000,61643300000000,28218454000000,22423433000000,24589135000000,-19929267000000,-6457643000000
7,2022-09-30,470278409000000,125371520000000,344906889000000,250880637000000,219397772000000,85285669000000,40085851000000,231766785000000,90352743000000,39070499000000,31812631000000,43568446000000,-27952827000000,-14684755000000
8,2022-12-31,448424507000000,93674903000000,354749604000000,218470581000000,229953926000000,78344852000000,15330051000000,302231360000000,112189590000000,43376630000000,55654077000000,62181346000000,-31602804000000,-19390049000000
9,2023-03-31,479103881232172,119304465232172,359799416000000,966806340408729,-487702459176557,157527623748813,-38223158516641,63745371000000,17738278000000,640178000000,1574600000000,6291774000000,16471470000000,-979799000000


# 05. 주가 데이터 라이브러리 이용해보기

In [34]:
!pip install finance-datareader -q

In [35]:
import FinanceDataReader as fdr

### DART에서 기업의 주식종목코드 가져온 후 주가데이터 가져오기

In [37]:
start_date

'20210101'

In [36]:
stock_code = corp_list.find_by_corp_name('삼성전자', exactly=True)[0].stock_code
samsung_stock_history = fdr.DataReader(stock_code, start=start_date)
samsung_stock_history.reset_index(inplace=True)
samsung_stock_history

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change
0,2021-01-04,81000,84400,80200,83000,38655276,0.024691
1,2021-01-05,81600,83900,81600,83900,35335669,0.010843
2,2021-01-06,83300,84500,82100,82200,42089013,-0.020262
3,2021-01-07,82800,84200,82700,82900,32644642,0.008516
4,2021-01-08,83300,90000,83000,88800,59013307,0.071170
...,...,...,...,...,...,...,...
874,2024-07-19,85600,86100,84100,84400,18569122,-0.028769
875,2024-07-22,84400,84900,82600,83000,18987560,-0.016588
876,2024-07-23,84200,84700,83400,83900,15766389,0.010843
877,2024-07-24,82900,83300,81900,82000,16939083,-0.022646


# 06. 불러온 재무제표와 주가데이터를 하나의 데이터셋으로 만들어 구글 드라이브에 저장하기

### 주가 데이터와 재무제표 데이터 merge 하기

In [42]:
df

Unnamed: 0,재무제표기준일,자산총계,부채총계,자본총계,유동자산,비유동자산,유동부채,비유동부채,영업수익,매출총이익,영업이익,당기순이익(손실),영업활동현금흐름,투자활동현금흐름,재무활동현금흐름
0,2020-12-31,378235718000000,102287702000000,275948016000000,198215579000000,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,26407832000000,65287009000000,-53628591000000,-8327839000000
1,2021-03-31,392826274000000,118557683000000,274268591000000,209155353000000,183670921000000,90109462000000,28448221000000,65388503000000,23888518000000,9382868000000,12359605000000,13803078000000,-1577609000000,-594323000000
2,2021-06-30,384777669000000,102453403000000,282324266000000,191118524000000,193659145000000,72461451000000,29991952000000,129060088000000,50494172000000,21949613000000,16776171000000,25889535000000,-5746952000000,-19642937000000
3,2021-09-30,410420718000000,113654600000000,296766118000000,212793019000000,197627699000000,81871964000000,31782636000000,203039275000000,81574488000000,37767144000000,48908386000000,44470987000000,-22696442000000,-20190702000000
4,2021-12-31,426621158000000,121721227000000,304899931000000,218163185000000,208457973000000,88117133000000,33604094000000,279604799000000,113193457000000,51633856000000,39907450000000,65105448000000,-33047763000000,-23991033000000
5,2022-03-31,439326959000000,124036040000000,315290919000000,232369082000000,206957877000000,90463701000000,33572339000000,77781498000000,30709441000000,14121409000000,18815052000000,10453069000000,-636511000000,-495346000000
6,2022-06-30,448040650000000,120133986000000,327906664000000,236287491000000,211753159000000,83362268000000,36771718000000,154985105000000,61643300000000,28218454000000,22423433000000,24589135000000,-19929267000000,-6457643000000
7,2022-09-30,470278409000000,125371520000000,344906889000000,250880637000000,219397772000000,85285669000000,40085851000000,231766785000000,90352743000000,39070499000000,31812631000000,43568446000000,-27952827000000,-14684755000000
8,2022-12-31,448424507000000,93674903000000,354749604000000,218470581000000,229953926000000,78344852000000,15330051000000,302231360000000,112189590000000,43376630000000,55654077000000,62181346000000,-31602804000000,-19390049000000
9,2023-03-31,479103881232172,119304465232172,359799416000000,966806340408729,-487702459176557,157527623748813,-38223158516641,63745371000000,17738278000000,640178000000,1574600000000,6291774000000,16471470000000,-979799000000


In [38]:
samsung_stock_history.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change
0,2021-01-04,81000,84400,80200,83000,38655276,0.024691
1,2021-01-05,81600,83900,81600,83900,35335669,0.010843
2,2021-01-06,83300,84500,82100,82200,42089013,-0.020262
3,2021-01-07,82800,84200,82700,82900,32644642,0.008516
4,2021-01-08,83300,90000,83000,88800,59013307,0.07117


In [40]:
# prompt: stock의 Index를 참고하여 재무제표기준일 컬럼을 추가해줘.
# 재무제표기준일 컬럼은 분기 기준으로 값이 부여되어야해.
# 2020-12-31, 2021-03-31, 2021-06-30, 2021-09-30, 2021-12-31, 2022-03-31..2024-03-31 처럼 말이야
# 만약 stock의 Index가 2021-02-28 이라면 해당 날짜의 바로 이전 분기인 2020-12-31 값을 부여해줘
# merge한 후의 rows 수와 merge 전의 samsung_stock_history의 rows 수는 같아야해

# 재무제표 기준일 컬럼 생성
samsung_stock_history[FS_DAY] = samsung_stock_history.Date.apply(lambda x: pd.to_datetime(str(x.year - 1) + '-12-31') if x.month <= 3 else
                                                                               pd.to_datetime(str(x.year) + '-03-31') if x.month <= 6 else
                                                                               pd.to_datetime(str(x.year) + '-06-30') if x.month <= 9 else
                                                                               pd.to_datetime(str(x.year) + '-09-30'))

# 주가 데이터와 재무제표 데이터 merge
df_merged = pd.merge(samsung_stock_history, df, on=FS_DAY, how='left')

# merge 전후 행 수 확인
print("Merge 전 samsung_stock_history 행 수:", len(samsung_stock_history))
print("Merge 후 df_merged 행 수:", len(df_merged))


Merge 전 samsung_stock_history 행 수: 879
Merge 후 df_merged 행 수: 879


In [41]:
df_merged

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change,재무제표기준일,자산총계,부채총계,...,비유동자산,유동부채,비유동부채,영업수익,매출총이익,영업이익,당기순이익(손실),영업활동현금흐름,투자활동현금흐름,재무활동현금흐름
0,2021-01-04,81000,84400,80200,83000,38655276,0.024691,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
1,2021-01-05,81600,83900,81600,83900,35335669,0.010843,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
2,2021-01-06,83300,84500,82100,82200,42089013,-0.020262,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
3,2021-01-07,82800,84200,82700,82900,32644642,0.008516,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
4,2021-01-08,83300,90000,83000,88800,59013307,0.071170,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,2024-07-19,85600,86100,84100,84400,18569122,-0.028769,2024-06-30,,,...,,,,,,,,,,
875,2024-07-22,84400,84900,82600,83000,18987560,-0.016588,2024-06-30,,,...,,,,,,,,,,
876,2024-07-23,84200,84700,83400,83900,15766389,0.010843,2024-06-30,,,...,,,,,,,,,,
877,2024-07-24,82900,83300,81900,82000,16939083,-0.022646,2024-06-30,,,...,,,,,,,,,,


In [43]:
df_merged.tail(30)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change,재무제표기준일,자산총계,부채총계,...,비유동자산,유동부채,비유동부채,영업수익,매출총이익,영업이익,당기순이익(손실),영업활동현금흐름,투자활동현금흐름,재무활동현금흐름
849,2024-06-14,79700,80500,79000,79600,22926612,0.012723,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
850,2024-06-17,79200,79500,78000,78100,16123051,-0.018844,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
851,2024-06-18,78700,80200,78600,79800,18207598,0.021767,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
852,2024-06-19,81100,82500,80500,81200,24168863,0.017544,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
853,2024-06-20,81500,82200,81200,81600,20288913,0.004926,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
854,2024-06-21,80700,80800,80000,80000,17907523,-0.019608,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
855,2024-06-24,79700,80900,79500,80600,15454227,0.0075,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
856,2024-06-25,80600,81800,80100,80800,19088458,0.002481,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
857,2024-06-26,80100,81400,79900,81300,17783242,0.006188,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0
858,2024-06-27,81300,81600,80500,81600,11739720,0.00369,2024-03-31,470899812000000.0,98983688000000.0,...,262355532000000.0,81770355000000.0,17213333000000.0,71915601000000.0,26029269000000.0,6606009000000.0,6754708000000.0,11866306000000.0,-22010751000000.0,1261662000000.0


### 재무제표 부여가 불가능한 주식 데이터 row 삭제

In [44]:
print("Drop 전 df_merged 행 수:", len(df_merged))
df_merged = df_merged.dropna()
print("Drop 후 df_merged 행 수:", len(df_merged))

Drop 전 df_merged 행 수: 879
Drop 후 df_merged 행 수: 860


### merge한 데이터 저장 및 불러와보기

In [45]:
df_merged

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change,재무제표기준일,자산총계,부채총계,...,비유동자산,유동부채,비유동부채,영업수익,매출총이익,영업이익,당기순이익(손실),영업활동현금흐름,투자활동현금흐름,재무활동현금흐름
0,2021-01-04,81000,84400,80200,83000,38655276,0.024691,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
1,2021-01-05,81600,83900,81600,83900,35335669,0.010843,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
2,2021-01-06,83300,84500,82100,82200,42089013,-0.020262,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
3,2021-01-07,82800,84200,82700,82900,32644642,0.008516,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
4,2021-01-08,83300,90000,83000,88800,59013307,0.071170,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,2024-06-24,79700,80900,79500,80600,15454227,0.007500,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000
856,2024-06-25,80600,81800,80100,80800,19088458,0.002481,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000
857,2024-06-26,80100,81400,79900,81300,17783242,0.006188,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000
858,2024-06-27,81300,81600,80500,81600,11739720,0.003690,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000


In [46]:
stock_path = os.path.join(path, f"{stock_code}_{start_yr}.csv")
df_merged.to_csv(stock_path, index=False)

In [47]:
pd.read_csv(stock_path)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change,재무제표기준일,자산총계,부채총계,...,비유동자산,유동부채,비유동부채,영업수익,매출총이익,영업이익,당기순이익(손실),영업활동현금흐름,투자활동현금흐름,재무활동현금흐름
0,2021-01-04,81000,84400,80200,83000,38655276,0.024691,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
1,2021-01-05,81600,83900,81600,83900,35335669,0.010843,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
2,2021-01-06,83300,84500,82100,82200,42089013,-0.020262,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
3,2021-01-07,82800,84200,82700,82900,32644642,0.008516,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
4,2021-01-08,83300,90000,83000,88800,59013307,0.071170,2020-12-31,378235718000000,102287702000000,...,180020139000000,75604351000000,26683351000000,236806988000000,92318692000000,35993876000000,2.640783e+13,65287009000000,-53628591000000,-8327839000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,2024-06-24,79700,80900,79500,80600,15454227,0.007500,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000
856,2024-06-25,80600,81800,80100,80800,19088458,0.002481,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000
857,2024-06-26,80100,81400,79900,81300,17783242,0.006188,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000
858,2024-06-27,81300,81600,80500,81600,11739720,0.003690,2024-03-31,470899812000000,98983688000000,...,262355532000000,81770355000000,17213333000000,71915601000000,26029269000000,6606009000000,6.754708e+12,11866306000000,-22010751000000,1261662000000
