# 뉴스 데이터를 활용한 주가 예측

In [2]:
# 라이브러리 적재
from IPython.display import display

# pandas_datareader는 코스피 주가 확인에 필요.
# 웹 상의 데이터를 DataFrame 객체로 만드는 기능 제공
import pandas_datareader as wb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random as rnd
import mglearn
import seaborn as sns
import datetime

plt.style.use('seaborn')
sns.set(font_scale=2.5)

import missingno as msno

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [7]:
pip install pandas_datareader

Note: you may need to restart the kernel to use updated packages.


In [36]:
pd.set_option('precision', 4)

# 코스피지수 크롤링
start = datetime.datetime(2022, 1, 1)
end = datetime.datetime(2022, 4, 22)
df_null = wb.DataReader("^KS11","yahoo",start,end)     # ^KS11 : 코스피
df = df_null.dropna()                                  # 결측치 제거

# 새로운 칼럼 생성
# (Price : 당일 대비 다음날 주가가 상승했으면 1, 하락했으면 0 표시)
df['Price'] = 0
for i in range(0, 50):
    if df['Close'][i] < df['Close'][i+1]:
        df['Price'][i] = 1
    else:
        df['Price'][i] = 0

# 파일 저장
df.to_csv('kospi_주가데이터.csv')
df

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-04,2995.2500,2973.0801,2991.9700,2989.2400,621200,2989.2400,0
2022-01-05,2986.2000,2936.7300,2984.0500,2953.9700,786900,2953.9700,0
2022-01-06,2952.5400,2915.3799,2925.3999,2920.5300,785500,2920.5300,1
2022-01-07,2959.0300,2933.1001,2933.7800,2954.8899,545800,2954.8899,0
2022-01-10,2951.1201,2910.8999,2947.3701,2926.7200,477000,2926.7200,1
...,...,...,...,...,...,...,...
2022-04-18,2701.1101,2681.3701,2685.0400,2693.2100,1024600,2693.2100,0
2022-04-19,2723.9800,2705.3201,2707.7600,2718.8899,1256100,2718.8899,0
2022-04-20,2724.4600,2702.8401,2718.4900,2718.6899,1735700,2718.6899,0
2022-04-21,2737.5400,2725.0400,2725.7100,2728.2100,1002600,2728.2100,0


In [4]:
price_data = pd.read_csv('kospi_주가데이터.csv')
df_0 = price_data[price_data['Price']==0]['Date']
date_0 = []
for i in range(0,len(df_0)):
    date_0.append(str(df_0.tolist()[i])[:10].replace('-',''))

In [5]:
df_1 = price_data[price_data['Price']==1]['Date']
date_1 = []
for i in range(0,len(df_1)):
    date_1.append(str(df_1.tolist()[i])[:10].replace('-',''))

In [6]:
# 라이브러리 적재
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import parse

In [7]:
# 팍스넷 뉴스를 통한 데이터 크롤링
result_list = []
error_cnt = 0

def paxnet_news_title(dates):
    base_url = 'http://www.paxnet.co.kr/news/much?newsSetId=4667&currentPageNo={}&genDate={}&objId=N4667'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    
    for date in dates:
        for page in range(1, 3):
            url = base_url.format(page, date)
            res = requests.get(url, headers=headers)
            if res.status_code == 200:
                soup = BeautifulSoup(res.text)
                title_list = soup.select('ul.thumb-list li')
                for title in title_list:
                    try:
                        news_title = title.select_one('dl.text > dt').text.strip()
                        result_list.append([news_title])
                    except:
                        error_cnt += 1

In [25]:
paxnet_news_title(date_0)
title_df_0 = pd.DataFrame(result_list, columns=['뉴스제목'])
title_df_0['주가변동'] = 0

In [26]:
result_list = []
error_cnt = 0

paxnet_news_title(date_1)
title_df_1 = pd.DataFrame(result_list, columns=['뉴스제목'])
title_df_1['주가변동'] = 1

In [27]:
title_df = pd.concat([title_df_0, title_df_1])
title_df.to_csv('팍스넷_뉴스타이틀.csv', index=False, encoding='utf-8')
title_df

Unnamed: 0,뉴스제목,주가변동
0,글로벌 큰손들 '검은 연기 내뿜는 기업' 투자 외면,0
1,"[특징주]한국바이오젠, 日 수출규제 속 국내 유일 실리콘 합성기술 보유 부각…공모가 상회",0
2,한국증시도 실망… 변동성 커졌다 [美 기준금리 0.25%P 인하],0
3,"[특징주] 첨생법, 국회 법사위 통과…관련株 강세",0
4,"코스피, 외인·기관 동반 순매도에 2010선 위협… 코스닥 620선",0
...,...,...
2915,[e공시 눈에 띄네]코스피-23일,1
2916,오전장 특징주★(코스피),1
2917,이 종목 매수포인트를 노려라!,1
2918,[뉴욕증시] '우한 폐렴' 공포에 1% 넘게 하락…다우 올해 하락 반전,1


In [30]:
# 네이버 뉴스를 통한 데이터 크롤링
result_list = []
error_cnt = 0

def naver_news_title(dates):
    base_url = 'https://finance.naver.com/news/news_list.naver?mode=LSS3D&section_id=101&section_id2=258&section_id3=401'
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    
    for date in dates:
        url = base_url.format(date)
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text)
            title_list = soup.select('ul.newsList li')
            for title in title_list:
                try:
                    news_title = title.select_one('dd.articleSubject > dd').text.strip()
                    result_list.append([news_title])
                except:
                    error_cnt += 1

In [31]:
# 뉴스 타이틀에 따른 주가변동 확인
naver_news_title(date_0)
title_df_2 = pd.DataFrame(result_list, columns=['뉴스제목'])
title_df_2['주가변동'] = 0

In [32]:
result_list = []
error_cnt = 0

naver_news_title(date_1)
title_df_3 = pd.DataFrame(result_list, columns=['뉴스제목'])
title_df_3['주가변동'] = 1

In [33]:
title_df2 = pd.concat([title_df_2, title_df_3])
title_df2.to_csv('네이버_뉴스타이틀.csv', index=False, encoding='utf-8')
title_df2

Unnamed: 0,뉴스제목,주가변동


In [34]:
all_title = pd.concat([title_df, title_df2])
all_title.to_csv('팍스넷&네이버_뉴스타이틀.csv')