In [1]:
import re
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [2]:
# Scrape the KOSPI 200 constituent list from Naver Finance.
# Produces:
#   stock_list - list of (code, name) tuples in page order
#   stock_name - company names, same order as stock_list
#   stock_code - stock codes (strings, leading zeros preserved), same order
stock_list = []
BaseUrl = 'http://finance.naver.com/sise/entryJongmok.nhn?&page='

for i in range(1, 21):
    url = BaseUrl + str(i)
    # A timeout keeps one stalled connection from hanging the whole notebook.
    r = requests.get(url, timeout=10)
    # Stop on HTTP errors instead of silently parsing an error page.
    r.raise_for_status()
    soup = bs(r.text, 'lxml')
    items = soup.find_all('td', {'class': 'ctg'})

    for item in items:
        link = item.a
        if link is None:
            # Cell without an anchor: nothing to extract.
            continue
        # e.g. https://finance.naver.com/item/main.nhn?code=006390
        txt = link.get('href') or ''
        # Regular expression: \d matches a digit, + means "one or more".
        # Raw string, so that \d is not treated as an (invalid) string escape.
        k = re.search(r'\d+', txt)
        if k:
            stock_list.append((k.group(), item.text))

# Names of the KOSPI top-200 stocks.
stock_name = [name for _, name in stock_list]

# Matching stock codes.
stock_code = [code for code, _ in stock_list]

In [3]:
# Two-column listing table: one row per scraped stock.
listing_columns = {
    'stock_name': stock_name,
    'stock_code': stock_code,
}
df = pd.DataFrame(data=listing_columns)

In [4]:
# Look up the row for 두산로보틱스 to see its index label and stock code.
is_target_company = df['stock_name'].eq('두산로보틱스')
df.loc[is_target_company]

Unnamed: 0,stock_name,stock_code
65,두산로보틱스,454910


In [5]:
# Remove 두산로보틱스 (stock code 454910) from the listing.
# Filtering on the stock code instead of the positional label 65 keeps this
# cell correct when the ranking shifts, and re-running it is a no-op rather
# than a KeyError.
df = df[df['stock_code'] != '454910']

In [6]:
# Map stock code -> company name for every scraped stock except the excluded
# ones. Building the dict with a filter (rather than `del` afterwards) does
# not raise KeyError when an excluded code is absent from the scraped list.
EXCLUDED_CODES = {'454910'}  # 두산로보틱스
corp_dic = {
    code: name
    for code, name in zip(stock_code, stock_name)
    if code not in EXCLUDED_CODES
}

In [7]:
import warnings
from tqdm import tqdm
# Blanket filter: every warning category is ignored from here on, which also
# hides any deprecation notices emitted by library calls in later cells.
warnings.filterwarnings('ignore')

In [8]:
# Download each company's summary page from Naver Finance and keep the
# '최근 분기 실적' (recent quarterly results) columns of its financial table.
# `res` maps company name -> DataFrame indexed by '주요재무정보' (metric name).
REQUEST_TIMEOUT = 10  # seconds; without it a stalled request blocks the loop

res = {}
for code in tqdm(corp_dic.keys()):

    URL = f"https://finance.naver.com/item/main.nhn?code={code}"

    req = requests.get(URL, timeout=REQUEST_TIMEOUT)
    # Surface HTTP failures here rather than as a confusing parse error below.
    req.raise_for_status()
    html = req.text

    # read_html expects a file-like object; passing the raw markup string is
    # deprecated in recent pandas, so wrap it in StringIO.
    # NOTE(review): [3] assumes the financial summary is the 4th table on the
    # page - confirm if the page layout changes.
    financial_stmt = pd.read_html(StringIO(html))[3]

    # The metric-name column sits under a three-level header; make it the row
    # index and give that index a plain string name.
    financial_stmt = (
        financial_stmt
        .set_index(('주요재무정보', '주요재무정보', '주요재무정보'))
        .rename_axis('주요재무정보')
    )
    # Drop the third (lowest) header level so columns are (section, period).
    financial_stmt.columns = financial_stmt.columns.droplevel(2)

    # Selecting the section leaves one column per reporting period, still
    # indexed by metric name.
    result = financial_stmt['최근 분기 실적']
    res[corp_dic[code]] = result

100%|██████████| 199/199 [00:30<00:00,  6.47it/s]


In [9]:
# Spot-check one company's quarterly table.
samsung_quarterly = res['삼성전자']
samsung_quarterly

Unnamed: 0_level_0,2023.03,2023.06,2023.09,2023.12,2024.03,2024.06(E)
주요재무정보,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
매출액,637454.0,600055.0,674047.0,677799.0,719156.0,736702.0
영업이익,6402.0,6685.0,24335.0,28247.0,66060.0,82055.0
당기순이익,15746.0,17236.0,58442.0,63448.0,67547.0,71768.0
영업이익률,1.0,1.11,3.61,4.17,9.19,11.14
순이익률,2.47,2.87,8.67,9.36,9.39,9.74
ROE(지배주주),13.71,10.66,9.27,4.15,5.53,
부채비율,26.21,24.8,24.89,25.36,26.61,
당좌비율,210.35,209.73,205.3,189.46,189.76,
유보율,38025.67,38184.87,38609.91,39114.28,39581.75,
EPS(원),206.0,228.0,810.0,887.0,975.0,1071.0


In [10]:
# Flatten each company's table into one list: all period values of the first
# metric, then all of the second, and so on, in the order given below.
metric_rows = ('PER(배)', 'ROE(지배주주)', '부채비율', '영업이익률', 'EPS(원)')

dic = {}
for k, quarterly in res.items():
    ls = []
    for metric in metric_rows:
        ls.extend(quarterly.loc[metric])
    dic[k] = ls

In [11]:
# The dict yields one column per company; flip it so each company is a row.
metrics_by_company = pd.DataFrame(dic)
df = metrics_by_company.transpose()

In [12]:
# Column labels: each metric prefix followed by a period number 1..6,
# grouped by metric (per1..per6, roe1..roe6, ...).
t = ['per', 'roe', 'dr', 'opm', 'eps']
c = [prefix + str(period) for prefix in t for period in range(1, 7)]

In [13]:
# Apply the generated labels; pandas raises if their count differs from the
# number of columns in df.
df.columns = pd.Index(c)

In [14]:
# Turn the row labels (company names) into an ordinary first column.
df = df.reset_index(drop=False)

In [15]:
# Convert every metric column (everything after the first column) to float,
# treating the '-' placeholder as missing.
# The loop variable is deliberately not named `c`: that name holds the list of
# column labels, and overwriting it breaks a re-run of `df.columns = c`.
col = df.columns[1:]
for metric_col in col:
    df[metric_col] = df[metric_col].replace('-', np.nan).astype(float)

In [16]:
# Persist the per-company metrics table; the row index is not written.
fundamentals_csv = 'target_fs.csv'
df.to_csv(fundamentals_csv, index=False)

In [17]:
# Download daily price tables for every company from Naver Finance.
# `price_dic` maps company name -> DataFrame of that company's daily rows,
# with a 'corp_name' column added.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'}
REQUEST_TIMEOUT = 10  # seconds; without it a stalled request blocks the loop
# NOTE(review): the page window is relative to the day the notebook is run;
# presumably it was picked to cover the date range filtered in the next
# cell - confirm before re-running on a later date.
PRICE_PAGES = range(6, 32)

price_df = pd.DataFrame()
price_dic = {}
for code in tqdm(corp_dic.keys()):
    # Collect the per-page frames and concatenate once per company;
    # concatenating inside the inner loop re-copies all earlier rows each time.
    page_frames = []
    for page in PRICE_PAGES:
        url = f'https://finance.naver.com/item/sise_day.naver?code={code}&page={page}'

        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Surface HTTP failures here rather than as a parse error below.
        response.raise_for_status()
        html = bs(response.text, 'lxml')
        html_table = html.select('body > table.type2')
        # read_html expects a file-like object; passing the raw markup string
        # is deprecated in recent pandas, so wrap it in StringIO.
        table = pd.read_html(StringIO(str(html_table)))
        # dropna() removes rows that contain any missing value.
        page_frames.append(table[0].dropna())
    temp_df = pd.concat(page_frames)
    temp_df['corp_name'] = corp_dic[code]
    price_dic[corp_dic[code]] = temp_df

100%|██████████| 199/199 [03:24<00:00,  1.03s/it]


In [18]:
# Keep each company's rows dated 2023-04-01 .. 2024-03-31, sorted by date and
# de-duplicated, then stack all companies into one frame.
window_frames = []
for k in price_dic.keys():
    temp = price_dic[k].copy()
    temp['날짜'] = pd.to_datetime(temp['날짜'])
    in_window = (temp['날짜'] >= '2023-04-01') & (temp['날짜'] < '2024-04-01')
    window_frames.append(temp[in_window].sort_values('날짜').drop_duplicates())

# Concatenate once (not inside the loop) and store the fresh 0..n-1 index.
# Previously the reset index was only displayed, so price_df itself kept the
# repeated per-page row labels.
price_df = pd.concat(window_frames, ignore_index=True) if window_frames else pd.DataFrame()
price_df

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,corp_name
0,2023-04-03,63100.0,하락 900,64000.0,64000.0,63000.0,11973133.0,삼성전자
1,2023-04-04,63600.0,상승 500,63400.0,63800.0,62800.0,11120514.0,삼성전자
2,2023-04-05,63900.0,상승 300,63700.0,64000.0,63400.0,9176149.0,삼성전자
3,2023-04-06,62300.0,"하락 1,600",63500.0,63600.0,62300.0,14992747.0,삼성전자
4,2023-04-07,65000.0,"상승 2,700",63800.0,65200.0,63800.0,27476120.0,삼성전자
...,...,...,...,...,...,...,...,...
48358,2024-03-25,475000.0,"하락 8,000",483000.0,487500.0,475000.0,1168.0,영풍
48359,2024-03-26,476500.0,"상승 1,500",475000.0,481500.0,475000.0,1019.0,영풍
48360,2024-03-27,479500.0,"상승 3,000",473000.0,482000.0,473000.0,683.0,영풍
48361,2024-03-28,477000.0,"하락 2,500",483000.0,483000.0,475000.0,597.0,영풍


In [19]:
# Persist the combined daily prices; the row index is not written.
prices_csv = 'price_df.csv'
price_df.to_csv(prices_csv, index=False)