In [133]:
#Imports
from __future__ import print_function

import os
import os.path
import sys
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [134]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer

In [135]:
import sklearn
from sklearn.cross_validation import train_test_split

In [136]:
%matplotlib inline
sns.set_style('white')
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

In [235]:
#GLOVE_DIR = os.path.expanduser('~\\data\\')
MAX_SEQUENCE_LENGTH = 20
MAX_NB_WORDS = 1500
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [138]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {} # word -> coefs
# We'll use the 100-dimensional version
with open('glove.6B.100d.txt',encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400001 word vectors.


In [139]:
import pandas as pd
df = pd.read_csv('dataset.csv')

In [140]:
df.head()

Unnamed: 0,item_id,sentiment,SentimentSource,SentimentText
0,2,0.43,Morrisons,Morrisons book second consecutive quarter of s...
1,3,-0.344,IMI,IMI posts drop in first-quarter organic revenu...
2,4,0.34,Glencore,Glencore to refinance its short-term debt earl...
3,5,0.259,Ryanair,EasyJet attracts more passengers in June but s...
4,6,-0.231,Barclays,Barclays 'bad bank' chief to step down


In [141]:
x= df['SentimentText']

In [142]:
from collections import defaultdict

In [143]:
a=' '.join(x)

In [144]:
pos_index = defaultdict(list)
for pos, term in enumerate(a.split()):
    pos_index[term].append(pos)

In [145]:
type(pos_index)

collections.defaultdict

In [146]:
df['new_senti']=df['sentiment'].apply(lambda row: 0.0 if row <= 0 else 1.0)

In [147]:
#y=df['sentiment']
y=df['new_senti']

In [149]:
import random
random.seed(2)

In [150]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20, random_state=2)

In [152]:
len(x_train)

913

In [153]:
maxlen =20

In [154]:
tokenizer = Tokenizer(num_words=maxlen*20)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
data = pad_sequences(sequences, maxlen)
data.shape

(913, 20)

In [155]:
tokenizer = Tokenizer(num_words=maxlen*20)
tokenizer.fit_on_texts(x_test)
sequences = tokenizer.texts_to_sequences(x_test)
data1 = pad_sequences(sequences, maxlen)
data1.shape

(229, 20)

In [156]:
tokenizer1 = Tokenizer(num_words=maxlen*20)
tokenizer1.fit_on_texts(x)
sequences = tokenizer1.texts_to_sequences(x)
full_data = pad_sequences(sequences, maxlen)
full_data.shape

(1142, 20)

In [157]:
word_index=tokenizer.word_index

In [158]:
len(word_index)

1088

In [159]:

print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index)+1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Preparing embedding matrix.


In [160]:
len(embedding_matrix)

1089

In [207]:
# train a regular MLP
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='float32')
embedded_sequences = embedding_layer(sequence_input)
x = Flatten()(embedded_sequences)
x = Dense(256, activation='linear')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='linear')(x)
preds = Dense(1, activation='tanh')(x)

model_mlp = Model(sequence_input, preds)
model_mlp.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['accuracy'])

In [208]:
model_mlp.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_26 (InputLayer)        (None, 20)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 20, 100)           108900    
_________________________________________________________________
flatten_23 (Flatten)         (None, 2000)              0         
_________________________________________________________________
dense_65 (Dense)             (None, 256)               512256    
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_66 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_67 (Dense)             (None, 1)                 257       
Total para

In [209]:
import tensorflow as tf

In [210]:
with tf.device('/cpu:0'):
    print('Training model.')
    history = model_mlp.fit(data, y_train, validation_data=(data1, y_test),
          epochs=10, batch_size=64)

Training model.
Train on 913 samples, validate on 229 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [211]:
# Evaluation on the test set
scores = model_mlp.evaluate(data1, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 59.83%


In [36]:
pool_size = (3)

In [212]:

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='linear')(embedded_sequences)
#x = MaxPooling1D(pool_size=pool_size)(x)
x = Conv1D(128, 5, activation='linear')(x)
#x=LSTM(128, dropout=0.3, recurrent_dropout=0.15)
#x = MaxPooling1D(pool_size=pool_size)(x)
x = Conv1D(128, 5, activation='linear')(x)
#x = MaxPooling1D(pool_size=pool_size)(x)  # modified from example since our seq len is 300 
x = Flatten()(x)
x = Dense(128, activation='linear')(x)
preds = Dense(1, activation='tanh')(x)



In [213]:

model = Model(sequence_input, preds)
model.compile(loss='cosine_proximity',
              optimizer='adam',
              metrics=['acc'])

In [214]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, 20)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 20, 100)           108900    
_________________________________________________________________
conv1d_49 (Conv1D)           (None, 16, 128)           64128     
_________________________________________________________________
conv1d_50 (Conv1D)           (None, 12, 128)           82048     
_________________________________________________________________
conv1d_51 (Conv1D)           (None, 8, 128)            82048     
_________________________________________________________________
flatten_24 (Flatten)         (None, 1024)              0         
_________________________________________________________________
dense_68 (Dense)             (None, 128)               131200    
__________

In [241]:
dataa=data.reshape(-1,20,1)

In [249]:
dataa1=data1.reshape(-1,20,1)

In [250]:
dataa1.shape

(229, 20, 1)

In [215]:
print('Training model.')
history = model.fit(data, y_train, validation_data=(data1, y_test),
          epochs=20, batch_size=64)

Training model.
Train on 913 samples, validate on 229 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [216]:
# Evaluation on the test set
scores = model.evaluate(data1, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 59.83%


In [220]:
predict_result = model.predict(data1)
predict_result1 = model_mlp.predict(data1)

In [221]:
predict_result1

array([[ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 0.99999958],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 0.99999994],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 0.99999905],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 0.99999642],
       [ 1.        ],
       [ 1.        ],
       [ 0.99999964],
       [ 1.        ],
       [ 0.99999952],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [ 1

In [222]:
from sklearn.metrics.pairwise import cosine_similarity

In [223]:
cx= predict_result.reshape(1,-1)
cy= y_test.reshape(1,-1)

In [224]:
cosine_similarity(cx,cy)

array([[ 0.77342538]])

In [228]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')


In [246]:
hidden_units = 100
nb_classes = 1
word_dim = 100

In [258]:
model = Sequential()
#model.add(embedding_layer(Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')))
model.add(LSTM(128,input_shape=(20,1)))
#model.add(LSTM(128,output_dim=hidden_units, return_sequences =True))
#model.add(LSTM(output_dim=hidden_units, return_sequences =False))
model.add(Dropout(0.2))
model.add(Dense(nb_classes))
model.add(Activation('tanh'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

print("Train...")
#early_stopping = EarlyStopping(monitor='val_loss', patience=2) 

Train...


In [259]:
print('Training model.')
history = model.fit(dataa, y_train, validation_data=(dataa1, y_test),
          epochs=20, batch_size=64)

Training model.
Train on 913 samples, validate on 229 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
