In [4]:
import numpy as np
import pandas as pd
import re

import tensorflow as tf
from tensorflow import keras
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Reshape, Conv2D, Activation, MaxPooling2D, Flatten
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [5]:
df = pd.read_csv("sample_data_label.csv")


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,docid,stock_code,author,headline,content,clean_content,translated_content,label
0,0,202000000000000.0,600340.sh,狼子會投,36元成本，七萬股，目標價50元,"<html>\n <meta http-equiv=""Content-Type"" conte...",36元成本，七萬股，目標價50元36元成本，七萬股，目標價50元36元成本，七萬股，目標價50元,"36 yuan cost, 70,000 shares, target price 50 y...",2
1,1,202000000000000.0,600340.sh,整齊皓月,華夏幸福36幹進來的，幾乎滿倉了，後市如何？下周要撤不？,"<html>\n <meta http-equiv=""Content-Type"" conte...",華夏幸福36幹進來的，幾乎滿倉了，後市如何？下周要撤不？華夏幸福36幹進來的，幾乎滿倉了，後...,"Huaxia Fortune 36 came in, almost full of ware...",2
2,2,202000000000000.0,600340.sh,思理紅嫩,華夏幸福36幹進來的，幾乎滿倉了，後市如何，大神指點下？,"<html>\n <meta http-equiv=""Content-Type"" conte...",華夏幸福36幹進來的，幾乎滿倉了，後市如何，大神指點下？華夏幸福36幹進來的，幾乎滿倉了，後...,"Huaxia Xingfu 36 came in, almost full of wareh...",2
3,3,202000000000000.0,600211.sh,再見理想123,從集中持股到資產配置，可這樣走,"<html>\n <meta http-equiv=""Content-Type"" conte...",從集中持股到資產配置，可這樣走從集中持股到資產配置，可這樣走周四大幅下殺是否另有玄機，周五又...,From centralized shareholding to asset allocat...,1
4,4,202000000000000.0,002718.sz,genglp,上市公司當下賣房屬於,"<html>\n <meta http-equiv=""Content-Type"" conte...",上市公司當下賣房屬於上市公司當下賣房屬於一是基於但由於業績下滑,The current house selling of listed companies ...,1


In [7]:
stop_words = set(stopwords.words('english'))
stop_words.add(('', ' ', '  ', '  ', '    ','   ', '    ', '   ',' s'))

In [8]:
def clean_doc(doc, vocab=None):
    tokens = word_tokenize(doc)
    # keeping only alphabets    
    tokens = [re.sub('[^a-zA-Z]', ' ', word) for word in tokens] 
    # converting to lowercase
    tokens = [word.lower() for word in tokens]
    # removing stopwords
    tokens = [w for w in tokens if not w in stop_words]
    # removing single characters if any
    tokens = [word for word in tokens if len(word) > 1]
    if vocab:
        tokens = [w for w in tokens if w in vocab]
        tokens = ' '.join(tokens)        
    return tokens

def add_doc_to_vocab(text, vocab):
    tokens = clean_doc(text)
    vocab.update(tokens)

def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
    
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [9]:
X = df['translated_content']
y = df['label']-1
y = np_utils.to_categorical(y)

In [10]:
y

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [11]:
# splitting into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

In [12]:
# creating a vocabulary of words
vocab = Counter()
len_train = len(X_train)
for i in range(len_train):
    text = X_train.iloc[i]
    add_doc_to_vocab(text , vocab)

In [13]:
print(len(vocab))
# print the 20 most common words
print(vocab.most_common(20))

4245
[('  ', 846), ('market', 340), ('stock', 298), ('    ', 270), ('   ', 208), ('yuan', 201), ('performance', 180), (' s', 179), ('stocks', 178), ('company', 166), ('     ', 164), ('shares', 152), ('companies', 151), ('      ', 144), ('year', 128), ('listed', 117), ('time', 114), ('price', 111), ('first', 104), ('electric', 102)]


In [14]:
# removing tokens which occur less than 3 times.
min_occurance = 2
tokens = [k for k,c in vocab.items() if (c >= min_occurance & len(k) > 1)]

In [15]:
# saving the vocabulary for futute use
save_list(tokens, 'vocab.txt')

In [16]:
# loading the saved vocabulary
vocab = load_doc('vocab.txt')
vocab = vocab.split()
vocab = set(vocab)


In [17]:
train_doc = []
for i in range(len_train):
    text = X_train.iloc[i]
    doc = clean_doc(text, vocab)
    train_doc.append(doc)

test_doc = []
len_test = len(X_test)
for i in range(len_test):
    text = X_test.iloc[i]
    doc = clean_doc(text, vocab)
    test_doc.append(doc)

In [18]:
# storing indexes where no tokens are present
index_train = []
for i in range(len(train_doc)):
    if len(train_doc[i]) == 0 :
        index_train.append(i)
    
index_test = []
for i in range(len(test_doc)):
    if len(test_doc[i]) == 0 :
        index_test.append(i)
    

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_doc)

X_train = tokenizer.texts_to_matrix(train_doc, mode='binary')
X_test = tokenizer.texts_to_matrix(test_doc, mode='binary')
n_words = X_test.shape[1]

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("Y_train.shape", y_train.shape)
print("Y_test.shape", y_train.shape)

X_train shape:  (900, 1272)
X_test shape:  (100, 1272)
Y_train.shape (900, 3)
Y_test.shape (900, 3)


In [39]:
# LSTM Model
model = Sequential()
model.add(Bidirectional(LSTM(120, activation='relu'), input_shape=(None, n_words)))
model.add(Dropout(0.2))
model.add(Dense(units=81, input_dim=100, activation='relu'))

model.add(Reshape((9,9,1)))
model.add(Conv2D(input_shape=(4,4),filters = 100,kernel_size = (3,4),padding='valid'),)
model.add(Activation('softmax'))
model.add(Dropout(0.5))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())

model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fitting the LSTM model
model.fit(X_train.reshape((-1, 1, n_words)), y_train, epochs=18, batch_size=108)

# finding test loss and test accuracy
loss_rnn, acc_rnn = model.evaluate(X_test.reshape((-1, 1, n_words)), y_test, verbose=0)
print(loss_rnn, acc_rnn)


Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18
1.1476779413223266 0.56


In [41]:
prediction = model.predict(X_test.reshape((-1, 1, n_words)))

In [57]:
predict_label = []
for i in range(len(prediction)):
    predict_label.append(prediction[i].argmax())
predict_label

[2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1]