# Environment setting

In [2]:
import codecs
import json

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

import pandas as pd
import numpy as np

import warnings
import time


import jieba
from gensim.models import word2vec
from gensim.models import FastText
from glove import Glove, Corpus

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas/gensim deprecation and
# copy-vs-view warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Bare `%matplotlib` (no argument) activates matplotlib's default GUI backend;
# the recorded output of this cell reports Qt5Agg.
%matplotlib

# Read the stop-word file as UTF-8 and strip surrounding whitespace from each
# line. The `with` block guarantees the file handle is closed once the list has
# been built (the handle was previously left open for the garbage collector).
with codecs.open(r'./stopwords.txt', 'r', 'utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file.readlines()]

Using matplotlib backend: Qt5Agg


In [3]:
# Load the two training tables: the stock quotes (with the `y` label column)
# and the news articles that are merged per date further down.
stock = pd.read_csv(r'../../Data/TRAINSET_STOCK.csv')
text = pd.read_csv(r'../../Data/TRAINSET_NEWS.csv')

# Prepare target vector

In [4]:
# Preview the first five rows of the stock table.
stock.head(n=5)

Unnamed: 0,ts_code,trade_date,name,open,low,high,close,change,pct_change,vol,amount,pe,pb,y
0,801010,20140401,农林牧渔,1668.75,1668.54,1689.12,1689.07,22.13,1.33,34914.0,291113.0,41.51,2.77,1
1,801010,20140402,农林牧渔,1688.72,1684.53,1693.41,1692.24,3.17,0.19,36300.0,289020.0,41.63,2.79,1
2,801010,20140403,农林牧渔,1693.05,1679.85,1697.73,1685.71,-6.53,-0.39,31403.0,259464.0,41.38,2.78,0
3,801010,20140404,农林牧渔,1681.92,1680.34,1698.44,1698.25,12.54,0.74,28648.0,240940.0,41.76,2.8,1
4,801010,20140408,农林牧渔,1693.24,1692.22,1706.84,1706.84,8.59,0.51,35012.0,312423.0,42.0,2.79,1


In [5]:
# Build the label matrix: one row per trade_date, one column per ts_code, each
# cell holding the number of stock rows with y == 1 for that (date, code) pair.
#
# The count is taken over the full frame instead of filtering to y == 1 first:
# with the filter, a trade_date (or ts_code) that has no positive label at all
# never reaches the groupby and silently disappears from the matrix, which
# shifts every later row relative to the calendar. Here such dates/codes are
# kept and show up as explicit zeros.
target_df = (
    stock['y'].eq(1)
    .groupby([stock['trade_date'], stock['ts_code']])
    .sum()
    .unstack(fill_value=0)  # (date, code) pairs absent from the data count as 0
    .astype(int)
)

In [6]:
# Keep the label matrix as a plain NumPy array, report its (rows, columns)
# shape, and display the array itself as the cell result.
target = target_df.values
print(target_df.shape)
target

(1064, 34)


array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

# Load word embedding

## Load word2vec

In [27]:
# Restore the saved word2vec model from the local model directory.
w2vmodel = word2vec.Word2Vec.load(
    "./Word_Embedding_Model/w2v.model"
)

## Load GloVe

In [29]:
# Restore the saved GloVe model and the Corpus object stored alongside it.
# The two loads are independent of each other.
glomodel = Glove.load('./Word_Embedding_Model/glove.model')
corpus_model = Corpus.load('./Word_Embedding_Model/glov_corpus.model')

## Load Fasttext

In [30]:
# Restore the saved FastText model from the local model directory.
fftmodel = FastText.load(
    r'./Word_Embedding_Model/fasttext.model'
)

## Combine vector

In [32]:
# Shape of the GloVe embedding matrix.
# The loaded model is bound to `glomodel`; the name `glove` is never defined
# (only `Glove` and `Corpus` are imported from the package), so referring to it
# raises NameError on a fresh kernel.
glomodel.word_vectors.shape

(126565, 100)

In [34]:
# Number of entries in the FastText model's vocabulary.
# The model is bound to `fftmodel` when it is loaded; `fttmodel` is not defined
# anywhere and raises NameError on a fresh kernel.
len(fftmodel.wv.vocab)

126565

# Merge news title from the same date

In [7]:
# Collapse all news titles published on the same date into a single
# ';'-separated string, giving one row per date with columns `date`, `title`.
#
# `.assign` builds a new frame instead of writing a column into a slice of
# `text`, which avoids chained assignment (pandas' SettingWithCopyWarning) and
# keeps the cell idempotent on re-run.
# NOTE(review): astype(str) turns missing titles into the literal text 'nan',
# which then ends up in the joined string — confirm that is acceptable.
join_title = (
    text[['date', 'title']]
    .assign(title=lambda frame: frame['title'].astype(str))
    .groupby(['date'])['title']
    .apply(';'.join)
    .reset_index()
)
join_title.head()

Unnamed: 0,date,title
0,20140414,习近平在空军机关调研时强调 加快建设一支空天一体攻防兼备的强大人民空军 为实现中国梦强军梦提...
1,20140415,医生贾永青：传递爱和感动;国际联播快讯;搜寻MH370航班：“蓝鳍金枪鱼”首次下水暂无发现;...
2,20140416,国际联播快讯;关注乌克兰局势：乌军队在东部地区开展强力行动;载有四百多人的韩国客轮进水下沉：...
3,20140417,【凡人善举】广西：市民见义勇为 巧施妙计擒劫匪;国际联播快讯;关注乌克兰局势;国内联播快讯;...
4,20140418,国际联播快讯;俄罗斯总统与俄民众“直接对话” 普京：乌境内绝无俄罗斯军队;关注乌克兰局势：四...


# Merge news content from the same date

In [8]:
# Collapse all news bodies published on the same date into a single
# ';'-separated string, giving one row per date with columns `date`, `content`.
#
# `.assign` builds a new frame instead of writing a column into a slice of
# `text`, which avoids chained assignment (pandas' SettingWithCopyWarning) and
# keeps the cell idempotent on re-run.
# NOTE(review): astype(str) turns missing content into the literal text 'nan',
# which then ends up in the joined string — confirm that is acceptable.
join_content = (
    text[['date', 'content']]
    .assign(content=lambda frame: frame['content'].astype(str))
    .groupby(['date'])['content']
    .apply(';'.join)
    .reset_index()
)
join_content.head()

Unnamed: 0,date,content
0,20140414,中共中央总书记、国家主席、中央军委主席习近平14日专程到空军机关就空军建设和军事斗争准备进行...
1,20140415,这几天我台走基层节目连续报道了河北定州人民医院32岁女医生贾永青身患癌症却隐瞒病情、仍然带病...
2,20140416,约旦驻利比亚大使遭绑架利比亚外交部15号证实，约旦驻利比亚大使法瓦兹·埃坦当天上午在利比亚首...
3,20140417,前两天，在广西北海，一位市民在目睹了一起抢夺案后，没有选择离开，而是机智地与民警一起将歹徒抓...
4,20140418,伊朗举行建军节阅兵式18号，伊朗在首都德黑兰南郊的霍梅尼陵举行一年一度的建军节阅兵式。伊朗总...


# Concatenate title and content

In [9]:
# Put the per-date titles and per-date contents side by side: one row per
# date present in both frames, with columns `date`, `title`, `content`.
joined_text = pd.merge(join_title, join_content, on='date')
joined_text.head()

Unnamed: 0,date,title,content
0,20140414,习近平在空军机关调研时强调 加快建设一支空天一体攻防兼备的强大人民空军 为实现中国梦强军梦提...,中共中央总书记、国家主席、中央军委主席习近平14日专程到空军机关就空军建设和军事斗争准备进行...
1,20140415,医生贾永青：传递爱和感动;国际联播快讯;搜寻MH370航班：“蓝鳍金枪鱼”首次下水暂无发现;...,这几天我台走基层节目连续报道了河北定州人民医院32岁女医生贾永青身患癌症却隐瞒病情、仍然带病...
2,20140416,国际联播快讯;关注乌克兰局势：乌军队在东部地区开展强力行动;载有四百多人的韩国客轮进水下沉：...,约旦驻利比亚大使遭绑架利比亚外交部15号证实，约旦驻利比亚大使法瓦兹·埃坦当天上午在利比亚首...
3,20140417,【凡人善举】广西：市民见义勇为 巧施妙计擒劫匪;国际联播快讯;关注乌克兰局势;国内联播快讯;...,前两天，在广西北海，一位市民在目睹了一起抢夺案后，没有选择离开，而是机智地与民警一起将歹徒抓...
4,20140418,国际联播快讯;俄罗斯总统与俄民众“直接对话” 普京：乌境内绝无俄罗斯军队;关注乌克兰局势：四...,伊朗举行建军节阅兵式18号，伊朗在首都德黑兰南郊的霍梅尼陵举行一年一度的建军节阅兵式。伊朗总...


# Rolling text

In [14]:
def func1(x):
    """Join the items of ``x`` in reverse order with ';' as the separator.

    The incoming value is echoed to stdout first. ``x`` itself is not
    modified: the reversal happens on a list copy.

    Parameters
    ----------
    x : iterable of str
        Items to join.

    Returns
    -------
    str
        The items of ``x``, last to first, separated by ';'.
    """
    print(x)
    return ';'.join(list(x)[::-1])

In [63]:
# Create a 5-row rolling window over the merged titles. No aggregation is
# applied here, so the cell only displays the Rolling object itself.
joined_text['title'].rolling(5)

Rolling [window=5,center=False,axis=0]