In [0]:
# import libraries
from google.colab import drive
drive.mount('/content/gdrive')
import nltk, re, time
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using TensorFlow backend.


In [0]:
# load data
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/two_sigma_dataset/news_sample.csv')

In [0]:
# check out columns
data.columns

Index(['time', 'sourceTimestamp', 'firstCreated', 'sourceId', 'headline',
       'urgency', 'takeSequence', 'provider', 'subjects', 'audiences',
       'bodySize', 'companyCount', 'headlineTag', 'marketCommentary',
       'sentenceCount', 'wordCount', 'assetCodes', 'assetName',
       'firstMentionSentence', 'relevance', 'sentimentClass',
       'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
       'sentimentWordCount', 'noveltyCount12H', 'noveltyCount24H',
       'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H',
       'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
       'volumeCounts7D'],
      dtype='object')

In [0]:
# select the relevant columns
df = data[['headline','sentimentClass']]

In [0]:
# build a function that cleanse the data
def clean_text(headline):
  """cleaning data"""
  headline = headline.lower().split()
  stopword = set(stopwords.words("english"))
  headline = [word for word in headline if not word in stopword]
  headline = " ".join(headline)
  headline = re.sub('[^a-zA-z0-9\s]','',headline)
  return headline

In [0]:
# before cleaning
df['headline'].head(3)

0    China's Daqing pumps 43.41 mln tonnes of oil i...
1            FEATURE-In kidnapping, finesse works best
2           PRESS DIGEST - Wall Street Journal - Jan 1
Name: headline, dtype: object

In [0]:
# after cleaning
df['headline'].apply(clean_text).head(3)

0    chinas daqing pumps 4341 mln tonnes oil 06
1       featurein kidnapping finesse works best
2      press digest  wall street journal  jan 1
Name: headline, dtype: object

In [0]:
# apply the cleaning data function to the 'headline' columns of df
df['headline'] = df['headline'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
# build a function that tokenize the headlines
def tokenize(feature):
  max_features = 50
  tokenizer = Tokenizer(num_words=max_features, split=' ')
  tokenizer.fit_on_texts(feature.values)
  X = tokenizer.texts_to_sequences(feature.values)
  X = pad_sequences(X)
  return X

In [0]:
# build a function that categorise the target variable
def categorize(target):
  if target == 0:
    target = 'Neutral'
  elif target == 1:
    target = 'Positive'
  else:
    target = 'Negative'   
  return target

In [0]:

X = df['headline']
y = df['sentimentClass']

# tokenize the headlines which will be vectorised so that it can be fed in the neural network
X = tokenize(X)

# categorise the target and then do one-hot-encoding so that it can be fed in the neural network
y = y.apply(categorize)
y = pd.get_dummies(y).values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [0]:
# define the hyperparameters
embed_dim = 128
lstm_out  = 196
max_features = 200

#network architecture
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics= ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 128)            25600     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 7, 128)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 591       
Total params: 280,991
Trainable params: 280,991
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model.fit(X_train, y_train, epochs = 10, batch_size = 32, verbose = 2)

Epoch 1/100
 - 3s - loss: 1.1009 - acc: 0.2857
Epoch 2/100
 - 0s - loss: 1.0927 - acc: 0.4286
Epoch 3/100
 - 0s - loss: 1.0915 - acc: 0.4429
Epoch 4/100
 - 0s - loss: 1.0791 - acc: 0.4429
Epoch 5/100
 - 0s - loss: 1.0763 - acc: 0.4286
Epoch 6/100
 - 0s - loss: 1.0676 - acc: 0.4714
Epoch 7/100
 - 0s - loss: 1.0555 - acc: 0.4571
Epoch 8/100
 - 0s - loss: 1.0512 - acc: 0.4857
Epoch 9/100
 - 0s - loss: 1.0428 - acc: 0.5000
Epoch 10/100
 - 0s - loss: 1.0345 - acc: 0.5286
Epoch 11/100
 - 0s - loss: 1.0109 - acc: 0.5429
Epoch 12/100
 - 0s - loss: 1.0119 - acc: 0.5000
Epoch 13/100
 - 0s - loss: 0.9865 - acc: 0.5000
Epoch 14/100
 - 0s - loss: 0.9751 - acc: 0.5429
Epoch 15/100
 - 0s - loss: 0.9333 - acc: 0.5143
Epoch 16/100
 - 0s - loss: 0.9286 - acc: 0.5714
Epoch 17/100
 - 0s - loss: 0.9306 - acc: 0.5429
Epoch 18/100
 - 0s - loss: 0.9123 - acc: 0.5429
Epoch 19/100
 - 0s - loss: 0.8906 - acc: 0.5429
Epoch 20/100
 - 0s - loss: 0.8730 - acc: 0.6429
Epoch 21/100
 - 0s - loss: 0.8384 - acc: 0.5714
E

<keras.callbacks.History at 0x7f9a9f9e1da0>

In [0]:
score, acc = model.evaluate(X_test, y_test, verbose =2, batch_size=32)
print('Score %.2f' % (score))
print("Validation Accuracy: %.2f" %(acc))

Score 1.43
Validation Accuracy: 0.47


In [0]:
# save the model
model.save('/content/gdrive/My Drive/Colab Notebooks/two_sigma_dataset/ordinary_lstm.h5')

In [0]:
! pip install vaderSentiment unidecode textblob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from unidecode import unidecode
from textblob import TextBlob



After building a LSTM RNN, we would also like to know the accuracy of Vader and Textblob

In [0]:
# categorize the sentiment class
df['sentimentClass'] = df['sentimentClass'].apply(categorize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
df['vader_sentiment'] = df['sentimentClass'].copy()
df['textblob_sentiment'] = df['sentimentClass'].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [0]:
def sentiment_analytics(headline):
    analyzer = SentimentIntensityAnalyzer()
    vader_sentiment = analyzer.polarity_scores(headline)['compound']
    textblob_sentiment = TextBlob(headline).sentiment[0]
    return {'textblob':textblob_sentiment, 'vader':vader_sentiment}
  
def thershold(x):
  if x == 0:
    x = 'Neutral'
  elif x > 0:
    x = 'Positive'
  else:
    x = 'Negative'
  return x 

In [0]:
df['vader_sentiment'] = df['vader_sentiment'].apply(lambda x: sentiment_analytics(x)['vader'])
df['vader_sentiment'] = df['vader_sentiment'].apply(lambda x: thershold(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [0]:
df['textblob_sentiment'] = df['textblob_sentiment'].apply(lambda x: sentiment_analytics(x)['textblob'])
df['textblob_sentiment'] = df['textblob_sentiment'].apply(lambda x: thershold(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [0]:
df['vader_acc'] = df['vader_sentiment'].copy()
df['textblob_acc'] = df['textblob_sentiment'].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [0]:
for i in range(len(df)):
  if df['textblob_acc'][i] == df['sentimentClass'][i]:
    df['textblob_acc'][i] = 'Match'
  else:
    df['textblob_acc'][i] = 'Mismatch'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [0]:
df['textblob_acc'].value_counts()

Match    100
Name: textblob_acc, dtype: int64

In [0]:
for i in range(len(df)):
  if df['vader_acc'][i] == df['sentimentClass'][i]:
    df['vader_acc'][i] = 'Match'
  else:
    df['vader_acc'][i] = 'Mismatch'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [0]:
df[df['vader_acc'] == 'Match'].shape[0] / len(df) * 100

100.0