# Environment setting

In [1]:
import codecs
import json

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

import pandas as pd
import numpy as np

from tqdm import tqdm
import warnings
import time


import jieba
from gensim.models import word2vec
from gensim.models import FastText
from glove import Glove, Corpus

# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# Called without an argument, this magic enables matplotlib's default GUI
# backend (the recorded output of this cell reports Qt5Agg).
%matplotlib

# Load the stop-word list: one entry per line of ./stopwords.txt (UTF-8),
# with surrounding whitespace and the line terminator stripped.
# The context manager guarantees the file handle is closed once the list is
# built, instead of leaving it open until garbage collection.
with codecs.open(r'./stopwords.txt', 'r', 'utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file.readlines()]

Using matplotlib backend: Qt5Agg


In [2]:
# Read the two training CSVs: `text` from TRAINSET_NEWS.csv and `stock`
# from TRAINSET_STOCK.csv.
text = pd.read_csv(filepath_or_buffer=r'../../Data/TRAINSET_NEWS.csv')
stock = pd.read_csv(filepath_or_buffer=r'../../Data/TRAINSET_STOCK.csv')

# Prepare target vector

In [3]:
# Preview the first five rows of the stock table.
stock.head(n=5)

Unnamed: 0,ts_code,trade_date,name,open,low,high,close,change,pct_change,vol,amount,pe,pb,y
0,801010,20140401,农林牧渔,1668.75,1668.54,1689.12,1689.07,22.13,1.33,34914.0,291113.0,41.51,2.77,1
1,801010,20140402,农林牧渔,1688.72,1684.53,1693.41,1692.24,3.17,0.19,36300.0,289020.0,41.63,2.79,1
2,801010,20140403,农林牧渔,1693.05,1679.85,1697.73,1685.71,-6.53,-0.39,31403.0,259464.0,41.38,2.78,0
3,801010,20140404,农林牧渔,1681.92,1680.34,1698.44,1698.25,12.54,0.74,28648.0,240940.0,41.76,2.8,1
4,801010,20140408,农林牧渔,1693.24,1692.22,1706.84,1706.84,8.59,0.51,35012.0,312423.0,42.0,2.79,1


In [4]:
# Build the target matrix: rows are trade_date, columns are ts_code, and each
# cell holds the number of rows with y == 1 for that (date, code) pair
# (0 where the pair has none).
# Caveat: because rows are filtered to y == 1 *before* pivoting, a trade_date
# or ts_code that never has y == 1 does not appear in target_df at all.
target_df = (
    stock[stock['y'] == 1]
    .groupby(['trade_date', 'ts_code'])['y']
    .count()
    .unstack()
    .fillna(0)
    .astype(int)
)

# Load word embedding

## Load word2vec

In [5]:
# Load the saved gensim Word2Vec model from the local model directory.
w2vmodel = word2vec.Word2Vec.load("./Word_Embedding_Model/w2v.model")

## Load GloVe

In [6]:
# Load the saved GloVe artifacts: the Corpus object and the Glove model.
# The Glove model is indexed later through `glove.dictionary` (word -> row)
# and `glove.word_vectors` (embedding matrix).
corpus_model = Corpus.load('./Word_Embedding_Model/glov_corpus.model')
glove = Glove.load('./Word_Embedding_Model/glove.model')

## Load FastText

In [7]:
# Load the saved gensim FastText model from the local model directory.
fttmodel = FastText.load(r'./Word_Embedding_Model/fasttext.model')

## Concatenate vector

In [8]:
# Sanity check before concatenating: show the GloVe matrix shape next to the
# FastText and word2vec vocabulary sizes, as a single tuple.
(
    glove.word_vectors.shape,
    len(fttmodel.wv.vocab),
    len(w2vmodel.wv.vocab),
)

((126565, 100), 126565, 126565)

In [11]:
# Build two lookup tables over the word2vec vocabulary:
#   word_index:     word -> integer id (its position in word_list)
#   word_embedding: integer id -> the word2vec, GloVe and FastText vectors of
#                   that word concatenated, in that order, into one 1-D array
# A word missing from the GloVe dictionary raises KeyError here.
word_list = list(w2vmodel.wv.vocab)
word_embedding = {}
word_index = {}

# Wrap the list itself (not the enumerate iterator) so tqdm knows the total
# and can show a percentage / ETA instead of a bare iteration counter.
for i, w in enumerate(tqdm(word_list)):
    word_index[w] = i
    # Look vectors up through `.wv`: indexing the model object directly is a
    # deprecated alias for the same lookup, and the deprecation warning is
    # hidden by the blanket warnings filter set at the top of the notebook.
    word_embedding[i] = np.concatenate((
        w2vmodel.wv[w],
        glove.word_vectors[glove.dictionary[w]],
        fttmodel.wv[w],
    ))

126565it [00:04, 29735.72it/s]


# Load segmented words

In [None]:
# Read seg_words.csv from the model directory and preview its first five rows.
seg_text = pd.read_csv(filepath_or_buffer=r'./Word_Embedding_Model/seg_words.csv')
seg_text.head(n=5)

## Merge news title from the same date

In [None]:
# join_title = text[['date', 'title']]
# join_title['title'] = join_title['title'].astype(str)
# join_title = join_title.groupby(
#     ['date'])['title'].apply(';'.join).reset_index()
# join_title.head()

## Merge news content from the same date

In [None]:
# join_content = text[['date', 'content']]
# join_content['content'] = join_content['content'].astype(str)
# join_content = join_content.groupby(
#     ['date'])['content'].apply(';'.join).reset_index()
# join_content.head()

# Concatenate title and content

In [None]:
# joined_text = join_title.merge(join_content, on='date')
# joined_text.head()

# Rolling text

In [None]:
# def func1(x):
#     print(x)
#     x = list(x)
#     x.reverse()
#     return ';'.join(x)

In [None]:
# joined_text['title'].rolling(window=5)