# Environment setting

In [1]:
import codecs
import json

from IPython.display import display
# import matplotlib.pyplot as plt
# import seaborn as sns
# import matplotlib as mpl

import pandas as pd
import numpy as np

from tqdm import tqdm
import warnings
import time

import ast
import jieba
from gensim.models import word2vec
from gensim.models import FastText
from glove import Glove, Corpus

warnings.filterwarnings("ignore")
# %matplotlib

stopwords = [line.strip() for line in codecs.open(
    r'./stopwords.txt', 'r', 'utf-8').readlines()]

In [2]:
text = pd.read_csv(r'../../Data/TRAINSET_NEWS.csv')
stock = pd.read_csv(r'../../Data/TRAINSET_STOCK.csv')

# Prepare target vector

In [3]:
stock.head()

Unnamed: 0,ts_code,trade_date,name,open,low,high,close,change,pct_change,vol,amount,pe,pb,y
0,801010,20140401,农林牧渔,1668.75,1668.54,1689.12,1689.07,22.13,1.33,34914.0,291113.0,41.51,2.77,1
1,801010,20140402,农林牧渔,1688.72,1684.53,1693.41,1692.24,3.17,0.19,36300.0,289020.0,41.63,2.79,1
2,801010,20140403,农林牧渔,1693.05,1679.85,1697.73,1685.71,-6.53,-0.39,31403.0,259464.0,41.38,2.78,0
3,801010,20140404,农林牧渔,1681.92,1680.34,1698.44,1698.25,12.54,0.74,28648.0,240940.0,41.76,2.8,1
4,801010,20140408,农林牧渔,1693.24,1692.22,1706.84,1706.84,8.59,0.51,35012.0,312423.0,42.0,2.79,1


In [4]:
target_df = (stock.loc[(stock['y'] == 1)]).groupby(['trade_date', 'ts_code'])[
    'y'].count().unstack().fillna(0).astype(int).reset_index()
target_df['trade_date'] = pd.to_datetime(
    target_df['trade_date'], format='%Y%m%d', errors='ignore')
target_df.head()

ts_code,trade_date,801010,801020,801030,801040,801050,801080,801110,801120,801130,...,801730,801740,801750,801760,801770,801780,801790,801880,801890,802600
0,2014-04-01,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
1,2014-04-02,1,1,0,1,1,0,0,1,0,...,0,0,0,0,0,1,1,1,0,0
2,2014-04-03,0,0,1,0,1,1,1,0,0,...,1,0,1,1,1,0,1,0,0,1
3,2014-04-04,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,2014-04-08,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


# Load word embedding

## Load word2vec

In [5]:
w2vmodel = word2vec.Word2Vec.load("./Word_Embedding_Model/w2v.model")

## Load GloVe

In [6]:
corpus_model = Corpus.load('./Word_Embedding_Model/glov_corpus.model')
glove = Glove.load('./Word_Embedding_Model/glove.model')

## Load Fasttext

In [7]:
fttmodel = FastText.load(r'./Word_Embedding_Model/fasttext.model')

## Concatenate vector

In [8]:
glove.word_vectors.shape, len(fttmodel.wv.vocab), len(w2vmodel.wv.vocab)

((126565, 100), 126565, 126565)

In [9]:
# word_list = list(w2vmodel.wv.vocab.keys())
vector_size = 100
word_index = glove.dictionary
nb_words = len(word_index)

count = 0.5
embedding_word2vec_matrix = np.zeros((nb_words + 1, vector_size))
for word, i in tqdm(word_index.items()):
    embedding_vector = w2vmodel[word] if word in w2vmodel else None
    if embedding_vector is not None:
        count += 1
        embedding_word2vec_matrix[i] = embedding_vector
    else:
        unk_vec = np.random.random(victor_size) * 0.5
        unk_vec = unk_vec - unk_vec.mean()
        embedding_word2vec_matrix[i] = unk_vec


glove_count = 0
embedding_glove_matrix = np.zeros((nb_words + 1, vector_size))
for word, i in tqdm(word_index.items()):
    embedding_glove_vector = glove.word_vectors[glove.dictionary[word]
                                                ] if word in glove.dictionary else None
    if embedding_glove_vector is not None:
        glove_count += 1
        embedding_glove_matrix[i] = embedding_glove_vector
    else:
        unk_vec = np.random.random(victor_size) * 0.5
        unk_vec = unk_vec - unk_vec.mean()
        embedding_glove_matrix[i] = unk_vec


ftt_count = 0
embedding_ftt_matrix = np.zeros((nb_words + 1, vector_size))
for word, i in tqdm(word_index.items()):
    embedding_ftt_vector = fttmodel[word] if word in fttmodel else None
    if embedding_ftt_vector is not None:
        ftt_count += 1
        embedding_ftt_matrix[i] = embedding_ftt_vector
    else:
        unk_vec = np.random.random(victor_size) * 0.5
        unk_vec = unk_vec - unk_vec.mean()
        embedding_ftt_matrix[i] = unk_vec

embedding_matrix = np.concatenate(
    (embedding_word2vec_matrix, embedding_glove_matrix, embedding_ftt_matrix), axis=1)
print(embedding_matrix.shape, count * 1.0 /
      embedding_matrix.shape[0], glove_count*1.0/embedding_matrix.shape[0], ftt_count*1.0/embedding_matrix.shape[0])

100%|██████████| 126565/126565 [00:02<00:00, 50339.61it/s]
100%|██████████| 126565/126565 [00:00<00:00, 281995.69it/s]
100%|██████████| 126565/126565 [00:02<00:00, 48256.63it/s]


(126566, 300) 0.9999960494919646 0.9999920989839294 0.9999920989839294


# Load word segment

In [10]:
seg_text = pd.read_csv(r'./Word_Embedding_Model/seg_words.csv')
seg_text['date'] = pd.to_datetime(
    seg_text['date'], format='%Y%m%d', errors='ignore')
seg_text.head()

Unnamed: 0,date,title,content
0,2014-04-14,"['习近平', '空军', '机关', '调研', '时', '强调', '加快', '建设...","['中共中央', '总书记', '国家', '主席', '中央军委', '主席', '习近平..."
1,2014-04-14,"['利比亚', '临时政府', '总理', '辞职']","['本月', '8', '号', '刚刚', '正式', '任命', '利比亚', '临时政..."
2,2014-04-14,"['关注', '乌克兰', '局势']","['代行', '乌克兰', '总统', '职责', '乌克兰', '议长', '图尔', '..."
3,2014-04-14,"['国内', '联播', '快讯']","['低碳', '中国', '行', '正式', '启动', '国家', '发展', '改革'..."
4,2014-04-14,"['刘汉', '36', '人涉', '黑案', '继续', '开庭审理']","['刘汉', '刘维', '36', '人', '涉嫌', '犯', '组织', '领导',..."


In [11]:
def merge_func(x):
    x = ast.literal_eval(x)
    r_list = []
    for i in x:
        r_list += i
    return r_list

## Merge news title from the same date

In [12]:
join_title = seg_text[['date', 'title']]
join_title = join_title.groupby(
    ['date'])['title'].apply(','.join).reset_index()
join_title['title'] = join_title['title'].apply(merge_func)

## Merge news content from the same date

In [13]:
join_content = seg_text[['date', 'content']]
join_content = join_content.groupby(
    ['date'])['content'].apply(','.join).reset_index()
join_content['content'] = join_content['content'].apply(merge_func)

# Contact title and content

In [14]:
joined_text = join_title.merge(join_content, on='date')
joined_text.head()

Unnamed: 0,date,title,content
0,2014-04-14,"[习近平, 空军, 机关, 调研, 时, 强调, 加快, 建设, 一支, 空天, 一体, 攻...","[中共中央, 总书记, 国家, 主席, 中央军委, 主席, 习近平, 14, 日, 专程到,..."
1,2014-04-15,"[医生, 贾永青, 传递, 爱, 感动, 国际, 联播, 快讯, 搜寻, MH370, 航班...","[几天, 我台, 走, 基层, 节目, 连续, 报道, 河北, 定州, 人民, 医院, 32..."
2,2014-04-16,"[国际, 联播, 快讯, 关注, 乌克兰, 局势, 乌, 军队, 东部, 地区, 开展, 强...","[约旦, 驻, 利比亚, 大使, 遭绑架, 利比亚, 外交部, 15, 号, 证实, 约旦,..."
3,2014-04-17,"[凡人, 善举, 广西, 市民, 见义勇为, 巧施, 妙计, 擒, 劫匪, 国际, 联播, ...","[前两天, 广西北海, 一位, 市民, 目睹, 一起, 抢夺案, 后, 没有, 选择, 离开..."
4,2014-04-18,"[国际, 联播, 快讯, 俄罗斯, 总统, 俄, 民众, 直接对话, 普京, 乌, 境内, ...","[伊朗, 举行, 建军节, 阅兵式, 18, 号, 伊朗, 首都, 德黑兰, 南郊, 霍梅尼..."


# Padding size

In [15]:
title_size = int(np.percentile(joined_text['title'].str.len(), 95))
content_size = int(np.percentile(joined_text['content'].str.len(), 95))

# Replace word embedding

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# News 2 Sequence

In [17]:
token = Tokenizer()
token.word_index = word_index

In [18]:
title_sequences = token.texts_to_sequences(join_title['title'].values)
title_vec = pad_sequences(title_sequences, maxlen=title_size,
                          padding='post', truncating='post', value=nb_words)

content_sequences = token.texts_to_sequences(join_content['content'].values)
content_vec = pad_sequences(content_sequences, maxlen=content_size,
                            padding='post', truncating='post', value=nb_words)
title_vec.shape, content_vec.shape

((1814, 160), (1814, 3305))

# Rolling

In [19]:
train_roll_window = 10
target_roll_window = 5

## Target rolling

In [20]:
tmp_target = target_df.iloc[::-1]
limit_target_index = (tmp_target['trade_date'] >= '2014-04-25')
tmp_target = tmp_target[limit_target_index]

In [21]:
tmp_target.shape

(1049, 35)

In [22]:
rolling_date = []
target_matrix = np.zeros(
    (tmp_target.shape[0]-target_roll_window+1, 34*target_roll_window))
for i in tqdm(range(0, tmp_target.shape[0]-4)):
    tmp_i = tmp_target.iloc[i:i+5]
    rolling_date.append(list(tmp_i['trade_date']))
    target_matrix[i] = (tmp_i.drop(
        columns=['trade_date'], axis=1).values).reshape(1, -1)[0]

100%|██████████| 1045/1045 [00:02<00:00, 369.63it/s]


In [23]:
target_matrix.shape

(1045, 170)

## Data rolling

In [24]:
train_title_matrix = np.zeros(
    (target_matrix.shape[0], title_size*train_roll_window*target_roll_window))
train_content_matrix = np.zeros(
    (target_matrix.shape[0], content_size*train_roll_window*target_roll_window))
train_title_matrix.shape, train_content_matrix.shape

((1045, 8000), (1045, 165250))

In [25]:
for i in tqdm(range(0, len(rolling_date))):
    tmp_title_row = np.zeros(
        (target_roll_window, title_size*train_roll_window))
    tmp_content_row = np.zeros(
        (target_roll_window, content_size*train_roll_window))

    for j in range(0, len(rolling_date[i])):
        index_range = (join_title['date'] >= (
            rolling_date[i][j] - pd.DateOffset(train_roll_window))) & (join_title['date'] < rolling_date[i][j])
        tmp_title_row[j] = title_vec[index_range].reshape(1, -1)[0]
        tmp_content_row[j] = content_vec[index_range].reshape(
            1, -1)[0]
    train_title_matrix[i] = tmp_title_row.reshape(1, -1)[0]
    train_content_matrix[i] = tmp_content_row.reshape(1, -1)[0]

100%|██████████| 1045/1045 [00:13<00:00, 78.50it/s]


In [26]:
train_title_matrix.shape, train_content_matrix.shape

((1045, 8000), (1045, 165250))

# Nerual network

In [27]:
import keras
from keras.utils import np_utils
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *

## BiLSTM

In [28]:
class nn:
    def __init__(self, data, labels, wordvocab, embeddingDim, padSize, rollingWin, prerollingWin, preEmbedding):
        self.dataset = data
        self.labels = labels
        self.wordvocab = wordvocab
        self.embeddingDim = embeddingDim
        self.padSize = padSize
        self.prerollingWin = prerollingWin
        self.rollingWin = rollingWin
        self.preEmbedding = preEmbedding
        self.model = None

    def build_model(self):
        vocabSize = len(self.wordvocab)
        embeddingDim = self.embeddingDim  # the vector size a word need to be converted
        maxlen = self.padSize*self.rollingWin * \
            self.prerollingWin  # the size of a sentence vector
        outputDims = 34*self.prerollingWin
        hiddenDims = 500

        train_X = self.dataset
        train_Y = self.labels

        print(train_X.shape)
        print(train_Y.shape)
        max_features = vocabSize + 1
        word_input = Input(shape=(maxlen,), dtype='float32', name='word_input')
        mask = Masking(mask_value=0.)(word_input)
        word_emb = Embedding(max_features, embeddingDim, weights=[self.preEmbedding],
                             input_length=maxlen, name='word_emb')(mask)
        bilstm1 = Bidirectional(
            LSTM(hiddenDims, return_sequences=True))(word_emb)
        bilstm2 = Bidirectional(
            LSTM(hiddenDims, return_sequences=True))(bilstm1)
        bilstm_d = Dropout(0.5)(bilstm2)
        output = Dense(outputDims, activation='softmax')(bilstm_d)
        model = Model(inputs=[word_input], outputs=output)
        #sgd = optimizers.SGD(lr=0.1, decay=1e-3)
        model.summary()
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'], )
        self.model = model

    def train(self):
        batchSize = 32
        result = self.model.fit(self.dataset, self.labels,
                                batch_size=batchSize, epochs=10)

    def save_model(self):
        self.model.save('model.h5')

In [29]:
lstm = nn(train_title_matrix, target_matrix, word_index, embeddingDim=300,
          padSize=title_size, rollingWin=train_roll_window, prerollingWin=pre_roll_window, preEmbedding=embedding_matrix)

NameError: name 'pre_roll_window' is not defined

In [None]:
lstm.build_model()

## BiGRU

In [None]:
def bi_gru_model(sent_length, embeddings_weight, output_num):
    print("get_text_gru3")
    content = Input(shape=(sent_length,), dtype='int32')
    embedding = Embedding(
        name="word_embedding",
        input_dim=embeddings_weight.shape[0],
        weights=[embeddings_weight],
        output_dim=embeddings_weight.shape[1],
        trainable=False)

    x = SpatialDropout1D(0.2)(embedding(content))

#     x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)
#     x = Bidirectional(CuDNNGRU(200, return_sequences=True))(x)

    x = Bidirectional(GRU(200, return_sequences=True))(x)
    x = Bidirectional(GRU(200, return_sequences=True))(x)

    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    conc = concatenate([avg_pool, max_pool])

    x = Dropout(0.2)(Activation(activation="relu")(
        BatchNormalization()(Dense(1000)(conc))))
    x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
    output = Dense(output_num, activation="softmax")(x)

    model = Model(inputs=content, outputs=output)
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
gru_model = bi_gru_model(sent_length=(title_size*train_roll_window*target_roll_window),
                         embeddings_weight=embedding_matrix, output_num=34*target_roll_window)

In [None]:
gru_model.fit(train_title_matrix, target_matrix)