# Exploratory Data Analysis
In this notebook, we use the functions we've written in various notebooks to explore how our models' analysis on news sentiment actually compares with changes in stock prices.

In [1]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import compare_sentiment as comp_sent
import pandas as pd

## Google

In [2]:
#Get Data
google = comp_sent.collapse_articles(data_source = "./data/google-data.pkl", stockName = "GOOG", 
                      time_before = '2018-09-27', time_after = '2019-05-17')
apple = comp_sent.collapse_articles(data_source = "./data/apple-data.pkl", stockName = "AAPL", 
                      time_before = '2018-09-27', time_after = '2019-05-17')
tesla = comp_sent.collapse_articles(data_source = "./data/tesla-data.pkl", stockName = "TSLA", 
                      time_before = '2018-09-27', time_after = '2019-05-17')
micro = comp_sent.collapse_articles(data_source = "./data/Microsoft-data.pkl", stockName = "MSFT", 
                      time_before = '2018-09-27', time_after = '2019-05-17')

In [3]:
all_company = pd.concat([google, apple, tesla, micro])

In [4]:
all_company['liststring'] = all_company['liststring'].apply(lambda x : x.replace("," , " "))

In [5]:
all_company['increase'] = (all_company['delta'] >= 0).astype("int")

In [6]:
all_company.head()

Unnamed: 0_level_0,liststring,delta,increase
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-09-27,MANCHESTER England It was past midnight when J...,5.14,1
2018-09-28,You may have noticed a bold advertisement in T...,8.02,1
2018-10-01,A new sitcom airs on CBS while a new documenta...,-8.93,0
2018-10-02,Student Athlete a documentary with LeBron Jame...,14.04,1
2018-10-03,Apple opened a routine product launch event la...,-9.67,0


In [7]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(all_company['liststring'], all_company['increase'], test_size=0.33, random_state=42)

In [8]:
#Tokenize the words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_company['liststring'])

# max_length = max([len(s.split()) for s in all_company['liststring']])
max_length = 5000
print(max_length)

5000


In [9]:
vocab_size = len(tokenizer.word_index) + 1

X_train_tokens =  tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

#Architecture 1
EMBEDDING_DIM = 100

print('Build model...')

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=128,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Build model...
Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 5000, 100)         7008400   
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               87936     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 7,096,465
Trainable params: 7,096,465
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train on 363 samples, validate on 180 samples
Epoch 1/25
 - 32s - loss: 0.6907 - acc: 0.5455 - val_loss: 0.6929 - val_acc: 0.5056
Epoch 2/25
 - 32s - loss: 0.6862 - acc: 0.5730 - val_loss: 0.6956 - val_acc: 0.5056
Epoch 3/25
 - 30s - loss: 0.6782 - acc: 0.5730 - val_loss: 0.6984 - val_acc: 0.5056
Epoch 4/25
 - 29s - loss: 0.6721 - acc: 0.5730 - val_loss: 0.7016 - val_acc: 0.5056
Epoch 5/25
 - 33s - loss: 0.6630 - acc: 0.5730 - val_loss: 0.7050 - val_acc: 0.5056
Epoch 6/25
 - 37s - loss: 0.6506 - acc: 0.5813 - val_loss: 0.7076 - val_acc: 0.5111
Epoch 7/25
 - 35s - loss: 0.6351 - acc: 0.6006 - val_loss: 0.7140 - val_acc: 0.5111
Epoch 8/25
 - 37s - loss: 0.6103 - acc: 0.6529 - val_loss: 0.7216 - val_acc: 0.5167
Epoch 9/25
 - 34s - loss: 0.5790 - acc: 0.6997 - val_loss: 0.7429 - val_acc: 0.4833
Epoch 10/25
 - 33s - loss: 0.5426 - acc: 0.6942 - val_loss: 0.7551 - val_acc: 0.5056
Epoch 11/25
 - 34s - loss: 0.5151 - acc: 0.7493 - val_loss: 0.7200 - val_acc: 0.5056
Epoch 12/25
 - 32s - loss: 0

<keras.callbacks.History at 0x12eb1ad30>

In [16]:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten

#Architecture 2
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length, trainable = False))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 5000, 100)         7008400   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 4996, 128)         64128     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2498, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 319744)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 319745    
Total params: 7,392,273
Trainable params: 383,873
Non-trainable params: 7,008,400
_________________________________________________________________
None
Train on 363 samples, validate on 180 samples
Epoch 1/25
 - 5s - loss: 1.5137 - acc: 0.5234 - val_loss: 0.9399 - val_acc:

<keras.callbacks.History at 0x135015828>