# Environment setting

In [13]:
import codecs
import json

from IPython.display import display
# import matplotlib.pyplot as plt
# import seaborn as sns
# import matplotlib as mpl

import pandas as pd
import numpy as np

from tqdm import tqdm
import warnings
import time

import ast
import jieba
from gensim.models import word2vec
from gensim.models import FastText
from glove import Glove, Corpus

warnings.filterwarnings("ignore")
# %matplotlib

stopwords = [line.strip() for line in codecs.open(
    r'./stopwords.txt', 'r', 'utf-8').readlines()]

In [14]:
text = pd.read_csv(r'../../Data/TRAINSET_NEWS.csv')
stock = pd.read_csv(r'../../Data/TRAINSET_STOCK.csv')

# Prepare target vector

In [15]:
stock.head()

Unnamed: 0,ts_code,trade_date,name,open,low,high,close,change,pct_change,vol,amount,pe,pb,y
0,801010,20140401,农林牧渔,1668.75,1668.54,1689.12,1689.07,22.13,1.33,34914.0,291113.0,41.51,2.77,1
1,801010,20140402,农林牧渔,1688.72,1684.53,1693.41,1692.24,3.17,0.19,36300.0,289020.0,41.63,2.79,1
2,801010,20140403,农林牧渔,1693.05,1679.85,1697.73,1685.71,-6.53,-0.39,31403.0,259464.0,41.38,2.78,0
3,801010,20140404,农林牧渔,1681.92,1680.34,1698.44,1698.25,12.54,0.74,28648.0,240940.0,41.76,2.8,1
4,801010,20140408,农林牧渔,1693.24,1692.22,1706.84,1706.84,8.59,0.51,35012.0,312423.0,42.0,2.79,1


In [78]:
target_df = (stock.loc[(stock['y'] == 1)]).groupby(['trade_date', 'ts_code'])[
    'y'].count().unstack().fillna(0).astype(int).reset_index()
target_df['trade_date'] = pd.to_datetime(
    target_df['trade_date'], format='%Y%m%d', errors='ignore')
target_df.head()

ts_code,trade_date,801010,801020,801030,801040,801050,801080,801110,801120,801130,...,801730,801740,801750,801760,801770,801780,801790,801880,801890,802600
0,2014-04-01,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
1,2014-04-02,1,1,0,1,1,0,0,1,0,...,0,0,0,0,0,1,1,1,0,0
2,2014-04-03,0,0,1,0,1,1,1,0,0,...,1,0,1,1,1,0,1,0,0,1
3,2014-04-04,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,2014-04-08,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


# Load word embedding

## Load word2vec

In [17]:
w2vmodel = word2vec.Word2Vec.load("./Word_Embedding_Model/w2v.model")

## Load GloVe

In [18]:
corpus_model = Corpus.load('./Word_Embedding_Model/glov_corpus.model')
glove = Glove.load('./Word_Embedding_Model/glove.model')

## Load Fasttext

In [19]:
fttmodel = FastText.load(r'./Word_Embedding_Model/fasttext.model')

## Concatenate vector

In [20]:
glove.word_vectors.shape, len(fttmodel.wv.vocab), len(w2vmodel.wv.vocab)

((126565, 100), 126565, 126565)

In [21]:
# word_list = list(w2vmodel.wv.vocab.keys())
vector_size = 100
word_index = glove.dictionary
nb_words = len(word_index)

count = 0.5
embedding_word2vec_matrix = np.zeros((nb_words + 1, vector_size))
for word, i in tqdm(word_index.items()):
    embedding_vector = w2vmodel[word] if word in w2vmodel else None
    if embedding_vector is not None:
        count += 1
        embedding_word2vec_matrix[i] = embedding_vector
    else:
        unk_vec = np.random.random(victor_size) * 0.5
        unk_vec = unk_vec - unk_vec.mean()
        embedding_word2vec_matrix[i] = unk_vec


glove_count = 0
embedding_glove_matrix = np.zeros((nb_words + 1, vector_size))
for word, i in tqdm(word_index.items()):
    embedding_glove_vector = glove.word_vectors[glove.dictionary[word]
                                                ] if word in glove.dictionary else None
    if embedding_glove_vector is not None:
        glove_count += 1
        embedding_glove_matrix[i] = embedding_glove_vector
    else:
        unk_vec = np.random.random(victor_size) * 0.5
        unk_vec = unk_vec - unk_vec.mean()
        embedding_glove_matrix[i] = unk_vec


ftt_count = 0
embedding_ftt_matrix = np.zeros((nb_words + 1, vector_size))
for word, i in tqdm(word_index.items()):
    embedding_ftt_vector = fttmodel[word] if word in fttmodel else None
    if embedding_ftt_vector is not None:
        ftt_count += 1
        embedding_ftt_matrix[i] = embedding_ftt_vector
    else:
        unk_vec = np.random.random(victor_size) * 0.5
        unk_vec = unk_vec - unk_vec.mean()
        embedding_ftt_matrix[i] = unk_vec

embedding_matrix = np.concatenate(
    (embedding_word2vec_matrix, embedding_glove_matrix, embedding_ftt_matrix), axis=1)
print(embedding_matrix.shape, count * 1.0 /
      embedding_matrix.shape[0], glove_count*1.0/embedding_matrix.shape[0], ftt_count*1.0/embedding_matrix.shape[0])

100%|██████████| 126565/126565 [00:02<00:00, 50726.37it/s]
100%|██████████| 126565/126565 [00:00<00:00, 276137.13it/s]
100%|██████████| 126565/126565 [00:02<00:00, 46209.23it/s]


(126566, 300) 0.9999960494919646 0.9999920989839294 0.9999920989839294


# Load word segment

In [69]:
seg_text = pd.read_csv(r'./Word_Embedding_Model/seg_words.csv')
seg_text['date'] = pd.to_datetime(
    seg_text['date'], format='%Y%m%d', errors='ignore')
seg_text.head()

Unnamed: 0,date,title,content
0,2014-04-14,"['习近平', '空军', '机关', '调研', '时', '强调', '加快', '建设...","['中共中央', '总书记', '国家', '主席', '中央军委', '主席', '习近平..."
1,2014-04-14,"['利比亚', '临时政府', '总理', '辞职']","['本月', '8', '号', '刚刚', '正式', '任命', '利比亚', '临时政..."
2,2014-04-14,"['关注', '乌克兰', '局势']","['代行', '乌克兰', '总统', '职责', '乌克兰', '议长', '图尔', '..."
3,2014-04-14,"['国内', '联播', '快讯']","['低碳', '中国', '行', '正式', '启动', '国家', '发展', '改革'..."
4,2014-04-14,"['刘汉', '36', '人涉', '黑案', '继续', '开庭审理']","['刘汉', '刘维', '36', '人', '涉嫌', '犯', '组织', '领导',..."


In [23]:
def merge_func(x):
    x = ast.literal_eval(x)
    r_list = []
    for i in x:
        r_list += i
    return r_list

## Merge news title from the same date

In [70]:
join_title = seg_text[['date', 'title']]
join_title = join_title.groupby(
    ['date'])['title'].apply(','.join).reset_index()
join_title['title'] = join_title['title'].apply(merge_func)

## Merge news content from the same date

In [71]:
join_content = seg_text[['date', 'content']]
join_content = join_content.groupby(
    ['date'])['content'].apply(','.join).reset_index()
join_content['content'] = join_content['content'].apply(merge_func)

# Contact title and content

In [72]:
joined_text = join_title.merge(join_content, on='date')
joined_text.head()

Unnamed: 0,date,title,content
0,2014-04-14,"[习近平, 空军, 机关, 调研, 时, 强调, 加快, 建设, 一支, 空天, 一体, 攻...","[中共中央, 总书记, 国家, 主席, 中央军委, 主席, 习近平, 14, 日, 专程到,..."
1,2014-04-15,"[医生, 贾永青, 传递, 爱, 感动, 国际, 联播, 快讯, 搜寻, MH370, 航班...","[几天, 我台, 走, 基层, 节目, 连续, 报道, 河北, 定州, 人民, 医院, 32..."
2,2014-04-16,"[国际, 联播, 快讯, 关注, 乌克兰, 局势, 乌, 军队, 东部, 地区, 开展, 强...","[约旦, 驻, 利比亚, 大使, 遭绑架, 利比亚, 外交部, 15, 号, 证实, 约旦,..."
3,2014-04-17,"[凡人, 善举, 广西, 市民, 见义勇为, 巧施, 妙计, 擒, 劫匪, 国际, 联播, ...","[前两天, 广西北海, 一位, 市民, 目睹, 一起, 抢夺案, 后, 没有, 选择, 离开..."
4,2014-04-18,"[国际, 联播, 快讯, 俄罗斯, 总统, 俄, 民众, 直接对话, 普京, 乌, 境内, ...","[伊朗, 举行, 建军节, 阅兵式, 18, 号, 伊朗, 首都, 德黑兰, 南郊, 霍梅尼..."


# Padding size

In [27]:
title_size = int(np.percentile(joined_text['title'].str.len(), 95))
content_size = int(np.percentile(joined_text['content'].str.len(), 95))

# Replace word embedding

In [28]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# News 2 Sequence

In [29]:
# def create_emb_layer(weights_matrix, non_trainable=False):
#     num_embeddings, embedding_dim = weights_matrix.size()
#     emb_layer = nn.Embedding(num_embeddings, embedding_dim)
#     emb_layer.load_state_dict({'weight': weights_matrix})
#     if non_trainable:
#         emb_layer.weight.requires_grad = False

#     return emb_layer, num_embeddings, embedding_dim


# class GRU(nn.Module):
#     def __init__(self, weights_matrix, hidden_size, num_layers):
#         super(self).__init__()
#         self.embedding, num_embeddings, embedding_dim = create_emb_layer(
#             weights_matrix, True)
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.gru = nn.GRU(embedding_dim, hidden_size,
#                           num_layers, batch_first=True)

#     def forward(self, inp, hidden):
#         return self.gru(self.embedding(inp), hidden)

#     def init_hidden(self, batch_size):
#         return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))

In [34]:
token = Tokenizer()
token.word_index = word_index

In [132]:
title_sequences = token.texts_to_sequences(join_title['title'].values)
title_vec = pad_sequences(title_sequences, maxlen=title_size,
                          padding='post', truncating='post', value=nb_words)

content_sequences = token.texts_to_sequences(join_content['content'].values)
content_vec = pad_sequences(content_sequences, maxlen=content_size,
                            padding='post', truncating='post', value=nb_words)
title_vec.shape, content_vec.shape

((1814, 160), (1814, 3305))

# Rolling

In [156]:
train_roll_window = 10
target_roll_window = 5

In [76]:
target_min_date = target_df['trade_date'].min()
target_max_date = target_df['trade_date'].max()
target_min_date, target_max_date, target_df.shape

(Timestamp('2014-04-01 00:00:00'),
 Timestamp('2019-04-01 00:00:00'),
 (1064, 35))

In [74]:
text_min_date = joined_text['date'].min()
text_max_date = joined_text['date'].max()
text_min_date, text_max_date

(Timestamp('2014-04-14 00:00:00'), Timestamp('2019-04-01 00:00:00'))

In [114]:
begin_date = text_min_date + pd.DateOffset(10)
subdate = int((text_max_date-begin_date).days)
begin_date, subdate

(Timestamp('2014-04-24 00:00:00'), 1803)

## Target rolling

In [185]:
tmp_target = target_df.iloc[::-1]
limit_target_index = (tmp_target['trade_date'] >= '2014-04-25')
tmp_target = tmp_target[limit_target_index]

In [205]:
tmp_target.shape

(1049, 35)

In [198]:
rolling_date = []
target_matrix = np.zeros(
    (tmp_target.shape[0]-target_roll_window+1, 34*target_roll_window))
for i in tqdm(range(0, tmp_target.shape[0]-4)):
    tmp_i = tmp_target.iloc[i:i+5]
    rolling_date.append(list(tmp_i['trade_date']))
    target_matrix[i] = (tmp_i.drop(
        columns=['trade_date'], axis=1).values).reshape(1, -1)[0]

100%|██████████| 1045/1045 [00:02<00:00, 485.42it/s]


In [163]:
pre_begin_date = begin_date + pd.DateOffset(1)
pre_begin_date_index = target_df.loc[target_df['trade_date']
                                     == pre_begin_date].index

## Data rolling

In [212]:
train_title_matrix = np.zeros(
    (tmp_target.shape[0], title_size*train_roll_window*pre_roll_window))
train_content_matrix = np.zeros(
    (tmp_target.shape[0], content_size*train_roll_window*pre_roll_window))
train_title_matrix.shape, train_content_matrix.shape

((1049, 8000), (1049, 165250))

In [217]:
tmp_title_row.shape

(50, 160)

In [226]:
for i in tqdm(range(0, len(rolling_date))):
    tmp_title_row = np.zeros((pre_roll_window, title_size*train_roll_window))
    tmp_content_row = np.zeros(
        (pre_roll_windowe_roll_window, content_size*train_roll_window))

    for j in range(0, len(rolling_date[i])):
        index_range = (join_title['date'] >= (
            rolling_date[i][j] - pd.DateOffset(train_roll_window))) & (join_title['date'] < rolling_date[i][j])
        tmp_title_row[j] = title_vec[index_range].reshape(1, -1)[0]
        tmp_content_row[j] = content_vec[index_range].reshape(
            1, -1)[0]
    train_title_matrix[i] = tmp_title_row.reshape(1, -1)[0]
    train_content_matrix[i] = tmp_content_row.reshape(1, -1)[0]

100%|██████████| 1045/1045 [00:16<00:00, 42.78it/s]


In [227]:
train_title_matrix.shape, train_content_matrix.shape

((1049, 8000), (1049, 165250))