In [3]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from gensim.models import Word2Vec
from scipy.spatial import distance
from gluonnlp.data import SentencepieceTokenizer
from kobert.utils import get_tokenizer
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
from urllib.parse import quote
import requests
import json
import matplotlib.pyplot as plt
%matplotlib inline
import re
import datetime
from datetime import timedelta

# Train and fit model to news data

In [15]:
def train_fit(data, date, standard=0.93):
    """Label every news article with its most similar theme and count hits per theme.

    Each article in ``data['news']`` is converted to the un-normalised sum of
    the Word2Vec vectors of its SentencePiece tokens and compared, by cosine
    similarity, with the pre-computed theme vectors stored in
    ``theme_vectors_xnorm.csv``.  The best-scoring theme is kept per article,
    articles whose best similarity is below ``standard`` are discarded, and the
    remaining articles are counted per theme.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'news'`` column with the article text.
    date : str
        Label printed together with the result; not used in the computation.
    standard : float, default 0.93
        Minimum cosine similarity for an article to count towards a theme.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Theme'`` and ``'news'`` (number of matching articles),
        limited to themes with 1 to 5 matching articles and sorted by that
        count in descending order.
    """
    import ast  # local import: only needed to parse the serialised vectors

    theme_vectors_df = pd.read_csv('theme_vectors_xnorm.csv', index_col=0)
    # The vectors are stored as list literals; literal_eval parses them without
    # executing arbitrary code the way eval() would.
    theme_vectors = [np.array(ast.literal_eval(raw)) for raw in theme_vectors_df['vectors']]
    theme_names = theme_vectors_df['themes'].tolist()

    # Vectorization parameters
    sp = SentencepieceTokenizer(get_tokenizer())
    v_dimension = 300
    word_vectors = Word2Vec.load('word2vec.model').wv
    non_hangul = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣]+")

    def vectorize_without_normal(news):
        """Return the un-normalised sum of the word vectors of ``news``."""
        # Remove letters which are not Hangul, then tokenise the cleaned text
        # (the original cleaned the text but tokenised the raw string).
        news_words = non_hangul.sub(' ', str(news))
        news_vec = np.zeros(v_dimension)
        for word in sp(news_words):
            # `word in KeyedVectors` is supported by both gensim 3.x and 4.x,
            # unlike the `.vocab` attribute that gensim 4 removed.
            if word in word_vectors:
                news_vec = news_vec + word_vectors[word]
        return news_vec

    def apply_model(news):
        """Return ``(theme, cosine similarity)`` of the best matching theme."""
        news_vec = vectorize_without_normal(news)
        if not news_vec.any():
            # No known token: cosine similarity is undefined for a zero vector.
            return None, np.nan
        similarities = np.array(
            [1 - distance.cosine(theme_vec, news_vec) for theme_vec in theme_vectors]
        )
        if similarities.size == 0 or np.isnan(similarities).all():
            return None, np.nan
        best = int(np.nanargmax(similarities))
        return theme_names[best], similarities[best]

    # One (theme, similarity) pair per article, kept in the row order of `data`
    # so that the new columns line up with the articles they belong to.
    best_themes = []
    best_similarities = []
    for news in tqdm(data['news'].tolist()):
        theme, cosine = apply_model(news)
        best_themes.append(theme)
        best_similarities.append(cosine)

    matched = data.assign(Theme=best_themes, Similarity=best_similarities)
    # NaN similarities compare as False and are therefore dropped here.
    df_filtered = matched.loc[matched['Similarity'] >= standard, ['Theme', 'news']]
    df_filtered = df_filtered.reset_index(drop=True)

    # Filter news whose frequency is more than 0 less than 6
    df_final = (
        df_filtered.groupby('Theme', as_index=False)
        .count()
        .sort_values(by='news', ascending=False)
    )
    df_final = df_final[(df_final['news'] >= 1) & (df_final['news'] < 6)].reset_index(drop=True)
    print(date, df_final)
    return df_final

In [None]:
# Load the news dump for 2020-06-29 and run the theme matching on it.
# `date` is passed to train_fit only as a label for the printed result.
data = pd.read_csv("../Data/data_for_use/20200629_news_data.csv",index_col=0)
date = '2020-06-29'
train_fit(data, date)

# Make Portfolio

In [13]:
def make_portfolio(theme, test_date):
    """Print the stocks of ``theme`` whose recent daily price moves were all small.

    The companies listed for ``theme`` in ``theme_company.csv`` are resolved to
    stock codes through the KOSDAQ/KOSPI listings, their daily closing prices
    for the six business days before ``test_date`` are scraped from NAVER
    Finance, and a stock is printed as a recommendation when every day-to-day
    relative change in that window lies within +/-5 %.

    Parameters
    ----------
    theme : str
        Theme name as it appears in the ``'Theme'`` column of the theme list.
    test_date : str
        Reference date in ``'%Y-%m-%d'`` format; prices strictly before this
        date are used.

    Returns
    -------
    None
        The recommendation is printed, nothing is returned.

    Raises
    ------
    IndexError
        If ``theme`` is not present in the theme-company list.
    """
    # Load Kospi and Kosdaq list
    kospi = pd.read_csv("../Data/stock_data/kospi.csv", encoding='cp949')
    kosdaq = pd.read_csv("../Data/stock_data/kosdaq.csv", encoding='cp949')

    def find_code(company):
        """Return the zero-padded 6-digit code of ``company`` or None if unlisted."""
        # KOSDAQ is searched first, then KOSPI, as in the original lookups.
        for listing in (kosdaq, kospi):
            codes = listing.loc[listing['기업명'] == company, '종목코드']
            if not codes.empty:
                return str(codes.iloc[0]).zfill(6)
        return None

    # Find target companies which match with theme from theme-company list and return companies with codes
    def target_com(theme, theme_list):
        """Return ``({code: company}, [company, ...])`` for ``theme``."""
        rows = theme_list[theme_list['Theme'] == theme]
        if rows.empty:
            raise IndexError('theme {!r} not found in theme-company list'.format(theme))
        raw = str(rows['Company'].iloc[0])
        # The company column holds a stringified list; strip brackets, quotes
        # (straight and curly) and blanks before splitting on commas.
        for junk in ('[', ']', "'", ' ', '‘', '’'):
            raw = raw.replace(junk, '')
        company_list = [name for name in raw.split(',') if name]

        # Find company code of target companies
        target_theme = {}
        for company in company_list:
            code = find_code(company)
            if code is None:
                print('Theme list error occurred!')
            else:
                target_theme[code] = company
        return target_theme, company_list

    def target_(target_company):
        """Return ``{code: company}`` for an explicit iterable of company names."""
        target_theme = {}
        for company in target_company:
            code = find_code(company)
            if code is None:
                raise IndexError('company {!r} not found in KOSDAQ/KOSPI list'.format(company))
            target_theme[code] = company
        return target_theme

    def date_convert(date):
        """Return ``(start_date, end_date)`` strings for the price window.

        ``end_date`` is ``date`` itself (the scrapers treat it as exclusive) and
        ``start_date`` lies six business days (Mon-Fri) earlier, which yields
        six closing prices and therefore five day-to-day changes.
        """
        current = datetime.datetime.strptime(date, '%Y-%m-%d').date()
        remaining = 6
        while remaining > 0:
            current -= datetime.timedelta(days=1)
            if current.weekday() < 5:  # Saturday = 5, Sunday = 6 are skipped
                remaining -= 1
        return current.strftime('%Y-%m-%d'), date

    # Date format
    def date_format(d=''):
        """Convert ``d`` to ``datetime.date``; an empty string means today."""
        if d != '':
            return pd.to_datetime(d).date()
        return pd.Timestamp.today().date()

    def fetch_soup(url):
        """Download ``url`` and return it parsed with lxml."""
        with urlopen(url) as response:
            return bs(response.read(), 'lxml')

    # Get the information of stock from NAVER
    def stock_info(stock_cd):
        """Return ``(outstanding shares, floating ratio in %, name)`` of a stock."""
        soup = fetch_soup('http://companyinfo.stock.naver.com/v1/company/c1010001.aspx?cmp_cd=' + stock_cd)

        tmp = soup.find(id='cTB11').find_all('tr')[6].td.text
        tmp = tmp.replace('\r', '').replace('\n', '').replace('\t', '')
        tmp = tmp.split('/')

        outstanding = int(tmp[0].replace(',', '').replace('주', '').replace(' ', ''))
        floating = float(tmp[1].replace(' ', '').replace('%', ''))
        name = soup.find(id='pArea').find('div').find('div').find('tr').find('td').find('span').text

        return outstanding, floating, name

    def resolve_range(start_date, end_date):
        """Return the inclusive ``(start, end)`` dates; ``end_date`` itself is excluded."""
        start = date_format(start_date) if start_date else datetime.date.today()
        if end_date:
            end = date_format(end_date) + datetime.timedelta(days=-1)
        else:
            end = datetime.date.today()
        return start, end

    def scrape_closes(base_url, find_dates, find_prices, stride, start, end, page_n, last_page):
        """Collect ``{date: close}`` from a paged NAVER table.

        Pages list the newest rows first, so scanning stops at the first row
        older than ``start``.  ``stride`` is the number of price cells per row;
        the close is the first of them.
        """
        closes = {}
        while True:
            source = fetch_soup(base_url + '&page=' + str(page_n))
            dates = find_dates(source)
            prices = find_prices(source)

            for n, cell in enumerate(dates):
                text = cell.text.strip()
                # Skip filler rows that do not start with a year.
                if not text.split('.')[0].isdigit():
                    continue
                this_date = date_format(text)
                if this_date < start:
                    return closes
                if this_date <= end:
                    closes[this_date] = float(prices[n * stride].text.replace(',', ''))

            # Navigating page
            if last_page == 0:
                last_link = source.find('td', class_='pgRR')
                if last_link is None:
                    # No "last page" link on this page: treat it as the final one.
                    last_page = page_n
                else:
                    href = last_link.find('a')['href']
                    last_page = int(href.split('&')[1].split('=')[1])

            if page_n >= last_page:
                return closes
            page_n += 1

    # Get the historical price of index from NAVER
    def historical_index_naver(index_cd, start_date='', end_date='', page_n=1, last_page=0):
        """Return ``{date: close}`` of an index for ``start_date <= date < end_date``."""
        start, end = resolve_range(start_date, end_date)
        return scrape_closes(
            'http://finance.naver.com/sise/sise_index_day.nhn?code=' + index_cd,
            lambda page: page.find_all('td', class_='date'),
            lambda page: page.find_all('td', class_='number_1'),
            4, start, end, page_n, last_page,
        )

    # Get the historical price of stock from NAVER
    def historical_stock_naver(stock_cd, start_date='', end_date='', page_n=1, last_page=0):
        """Return ``{date: close}`` of a stock for ``start_date <= date < end_date``."""
        start, end = resolve_range(start_date, end_date)
        return scrape_closes(
            'http://finance.naver.com/item/sise_day.nhn?code=' + stock_cd,
            lambda page: page.find_all('span', class_='tah p10 gray03'),
            lambda page: page.find_all('td', class_='num'),
            6, start, end, page_n, last_page,
        )

    # Set threshold of variance to select stable stock
    def threshold(var):
        """Return True when ``var`` is not outside the +/-5 % band."""
        thres = 0.05
        return not (var < -1 * thres or var > thres)

    def stock_get_info(target_theme):
        """Return three dicts keyed by code: outstanding shares, floating ratio, name."""
        stock_outstanding = dict()
        stock_floating = dict()
        stock_name = dict()
        for stock_cd in target_theme.keys():
            outstanding, floating, name = stock_info(stock_cd)
            stock_outstanding[stock_cd] = outstanding
            stock_floating[stock_cd] = floating
            stock_name[stock_cd] = name
        return stock_outstanding, stock_floating, stock_name

    start_date, end_date = date_convert(test_date)
    theme_list = pd.read_csv("../Data/data_for_use/theme_company.csv", index_col=0)
    target_theme, _ = target_com(theme, theme_list)

    # Each stock gets its own price dict; nothing is shared between fetches.
    theme_historical_prices = {
        stock_cd: historical_stock_naver(stock_cd, start_date, end_date)
        for stock_cd in target_theme
    }

    # Rows: dates in chronological order, columns: stock codes in sorted order.
    prices = pd.DataFrame(theme_historical_prices).sort_index(axis=1).sort_index(axis=0)
    # Carry the last known close forward over gaps, then back-fill leading gaps.
    prices = prices.ffill().bfill()

    # Benchmark column, aligned on the date index rather than by position.
    # It is not used by the screening below.
    prices['kospi200'] = pd.Series(historical_index_naver('KPI200', start_date, end_date), dtype=float)

    # Day-to-day relative change, measured against the newer close as in the
    # original formula; the oldest date has no predecessor and is dropped.
    returns = ((prices - prices.shift(1)) / prices).iloc[1:]

    print('\n')
    print('오늘의 추천테마는 : {}'.format(theme))
    print('해당 테마의 추천 종목은 :')
    for stock_cd in sorted(target_theme):
        if stock_cd not in returns.columns:
            continue
        moves = returns[stock_cd]
        # Without a complete price history the stability cannot be judged.
        if moves.empty or moves.isnull().any():
            continue
        if all(threshold(move) for move in moves):
            print('{} : {}'.format(stock_cd, target_theme[stock_cd]))

In [11]:
# Print the recommended stocks for one theme, using prices before 2020-06-29.
make_portfolio('해저터널', '2020-06-29')



오늘의 추천테마는 : 해저터널
해당 테마의 추천 종목은 :
028100 : 동아지질
060370 : KT서브마린
