In [1]:
%matplotlib inline

import os
from keras.utils import get_file, to_categorical
import gensim
import subprocess
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize
figsize(10, 10)

from sklearn.manifold import TSNE
import json
from collections import Counter
from itertools import chain

Using TensorFlow backend.


In [2]:
# Download the pre-trained GoogleNews word2vec archive (cached by Keras'
# get_file) and decompress it once into ./generated/.
import gzip
import shutil

MODEL = 'GoogleNews-vectors-negative300.bin'
path = get_file(MODEL + '.gz', 'https://deeplearning4jblob.blob.core.windows.net/resources/wordvectors/%s.gz' % MODEL)
os.makedirs('generated', exist_ok=True)

unzipped = os.path.join('generated', MODEL)
if not os.path.isfile(unzipped):
    # Decompress in pure Python (no dependency on an external `zcat` binary)
    # into a temporary sibling file, and only rename it into place once the
    # whole archive has been written. An interrupted or failed run therefore
    # never leaves a truncated file that the isfile() guard would later
    # mistake for a complete model.
    partial = unzipped + '.part'
    try:
        with gzip.open(path, 'rb') as gz_in, open(partial, 'wb') as fout:
            shutil.copyfileobj(gz_in, fout)
        os.replace(partial, unzipped)
    except BaseException:
        # Clean up the half-written file, then let the error propagate.
        if os.path.exists(partial):
            os.remove(partial)
        raise

Downloading data from https://deeplearning4jblob.blob.core.windows.net/resources/wordvectors/GoogleNews-vectors-negative300.bin.gz


In [4]:
# Colab-only step: opens the browser upload widget so the labelled dataset
# can be copied into the runtime's working directory.
# NOTE(review): this cell requires the google.colab package and manual
# interaction, so the notebook cannot be re-run unattended.
from google.colab import files
files.upload()

Saving Sentences_50Agree.txt to Sentences_50Agree.txt




In [5]:
# List the working directory to confirm the upload landed next to ./generated.
!dir

generated  sample_data	Sentences_50Agree.txt


In [0]:
# Move the uploaded dataset into ./generated, where INPUT_FILE expects it.
!mv Sentences_50Agree.txt generated

In [7]:
# Verify that both the word2vec binary and the dataset are in ./generated.
!dir generated

GoogleNews-vectors-negative300.bin  Sentences_50Agree.txt


In [8]:
from gensim.models import KeyedVectors

# Load the decompressed binary word2vec file into a gensim KeyedVectors
# object; `word2vec[word]` is used later to look up each word's vector.
# NOTE(review): this loads the entire vector file into memory; consider the
# `limit=` argument if RAM is tight — confirm against the gensim version in use.
word2vec = KeyedVectors.load_word2vec_format(unzipped, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Path of the labelled sentences file (moved into ./generated above).
INPUT_FILE = "generated/Sentences_50Agree.txt"
# Opened in binary mode; lines are decoded explicitly where they are parsed.
# NOTE(review): this handle is consumed by iterating over it in a later cell
# and is never closed in the visible cells; it can only be iterated once.
fin = open(INPUT_FILE, 'rb')

In [0]:
# Class ids used throughout the notebook; the id is also the column index of
# the one-hot target produced from these labels.
_LABEL_IDS = {'negative': 0, 'neutral': 1, 'positive': 2}


def label_to_numeric(x):
    """Map a sentiment label string to its integer class id.

    Args:
        x: One of 'negative', 'neutral' or 'positive' (exact match).

    Returns:
        0 for 'negative', 1 for 'neutral', 2 for 'positive'.

    Raises:
        ValueError: If `x` is not one of the three known labels. Previously
            the function fell through and returned None, which only failed
            later (and obscurely) when the labels were one-hot encoded.
    """
    try:
        return _LABEL_IDS[x]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs such as lists.
        raise ValueError(
            "Unknown sentiment label {!r}; expected one of {}".format(
                x, sorted(_LABEL_IDS)))

In [0]:
# Parse the dataset into parallel lists: `sentences` (text) and `labels`
# (integer class ids). Each record has the form "<sentence>@<label>".
sentences = []
labels = []

# Re-open the file here rather than iterating a handle created in another
# cell: a file object can only be consumed once, so re-running this cell on
# an exhausted handle would silently produce empty lists. The `with` block
# also guarantees the handle is closed.
with open(INPUT_FILE, 'rb') as fin:
    for line in fin:
        record = line.strip().decode('latin-1')
        if not record:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the LAST '@' so a sentence that itself contains '@'
        # (e-mail address, handle, ...) is not cut apart.
        sent, label = record.rsplit("@", 1)
        labels.append(label_to_numeric(label))
        sentences.append(sent)

In [0]:
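# Disabled alternative loader, kept for reference: reads a tab-separated
# file whose lines are "<integer label>\t<sentence>" instead of the
# "<sentence>@<label>" format used above.
# sentences = []
# labels = []

# for line in fin:
#   label, sent = line.strip().decode().split("\t")
#   labels.append(int(label))
#   sentences.append(sent)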

In [0]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
VOCAB_SIZE = 12000     # tokenizer `num_words`; also the number of rows in the embedding matrix
EMBEDDING_DIM = 300    # width of each embedding row; presumably matches the 300-d GoogleNews vectors — confirm
MAX_LENGTH = 120       # every sequence is padded/truncated to this many tokens
TRUNC_TYPE = "post"    # passed as `truncating=` to pad_sequences
OOV_TOK = "<OOV>"      # passed as `oov_token=` to the Tokenizer

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on ALL sentences (this happens before the train/test
# split, so held-out sentences also contribute to the vocabulary).
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(sentences)
# word -> integer index mapping, reused later to build the embedding matrix.
# NOTE(review): in Keras, `word_index` is understood to list every word seen,
# not only the top `num_words` — confirm against the Tokenizer docs.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

# Fixed-length integer matrix; over-long sequences are cut according to
# TRUNC_TYPE, the padding side is left at the library default.
X = pad_sequences(sequences, maxlen=MAX_LENGTH, truncating=TRUNC_TYPE)
# One-hot targets; the column index equals the integer class id.
y = to_categorical(labels)

# Disabled: adds a trailing channel axis to X (not needed with an Embedding input).
# X = np.reshape(X, (X.shape[0], X.shape[1],1))

In [15]:
# Sanity check: expect (number of sentences, MAX_LENGTH).
X.shape

(4846, 120)

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples; random_state is fixed so the split is
# reproducible. No `stratify=` is passed, so class proportions may differ
# between the two partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

In [0]:
# Build the initial embedding matrix: row i holds the pre-trained word2vec
# vector of the word whose tokenizer index is i. Rows for words that are
# missing from word2vec (and row 0, the padding index) stay all-zero.
embedding_weights = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, index in word_index.items():
    # `word_index` is not capped at VOCAB_SIZE, so without this guard any
    # index >= VOCAB_SIZE would raise IndexError (not caught by the KeyError
    # handler below). Such words never reach the model anyway because the
    # tokenizer was built with num_words=VOCAB_SIZE.
    if index >= VOCAB_SIZE:
        continue
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        # Word has no pre-trained vector: keep its zero row.
        continue

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM

# Sentiment classifier: word2vec-initialised embedding -> two stacked
# bidirectional LSTMs -> dense ReLU layer -> 3-way softmax.
#
# Disabled alternative head, kept for reference (would replace the BiLSTM
# and dense layers after the embedding):
#   SpatialDropout1D(0.2)
#   Conv1D(filters=256, kernel_size=5, activation="relu")
#   GlobalMaxPooling1D()
#   Dense(3, activation="softmax")
model = Sequential([
    # Embedding rows start from the pre-trained vectors in embedding_weights.
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH, weights=[embedding_weights]),
    # First recurrent layer returns the full sequence so the second can consume it.
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation="relu"),
    # One output unit per sentiment class.
    Dense(3, activation="softmax"),
])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [19]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 300)          3600000   
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 128)          186880    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 3,832,451
Trainable params: 3,832,451
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Multi-class setup: softmax output + one-hot targets -> categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
# Train for a fixed 20 epochs; per-epoch metrics are kept in `history`.
# NOTE(review): the held-out (X_test, y_test) split doubles as validation
# data here and is also the set evaluated afterwards, so there is no
# separate, untouched test set.
history = model.fit(X_train, y_train, batch_size=64, epochs=20, validation_data=(X_test, y_test))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 3392 samples, validate on 1454 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
# Evaluate on the held-out split; `score` is [loss, accuracy] given the
# single "accuracy" metric the model was compiled with.
score = model.evaluate(X_test, y_test, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(*score))

Test score: 1.550, accuracy: 0.750


In [0]:
# Score an unseen sentence with the trained model. It must go through the
# same tokenizer and padding settings that were used to build X.
pred_sentences = ["Officials said lowering GST could leave a hole of Rs 55,000-60,000 cr if the tax is lowered from 28% to 18%."]
pred_sequences = tokenizer.texts_to_sequences(pred_sentences)
X_pred = pad_sequences(pred_sequences, maxlen=MAX_LENGTH, truncating=TRUNC_TYPE)
# One row of class probabilities per input sentence; columns follow the
# integer class ids (0 = negative, 1 = neutral, 2 = positive).
y_pred = model.predict(X_pred)
y_pred

array([[9.2547777e-04, 3.6163858e-04, 9.9871290e-01]], dtype=float32)