In [1]:
# DS - updated imports
import json
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
#from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano'
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
from keras.models import model_from_json

Using Theano backend.


In [2]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [3]:
tweets = pd.read_excel("text_classification_dataset.xlsx")

In [4]:
macronum=sorted(set(tweets['type']))
macro_to_id = dict((note, number) for number, note in enumerate(macronum))

def fun(i):
    return macro_to_id[i]

tweets['type']=tweets['type'].apply(fun)

In [5]:
tweets['type'].unique()

array([3, 0, 1, 2])

In [6]:
labels = []
for idx in tweets['type']:
    labels.append(idx)

In [7]:
texts = []
for tweet in tweets['text']:
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) 
    print (tweet)
    texts.append(tweet)

loomberg holds 7-point lead on trump in michigan, poll shows 

URL via AT_USER
AT_USER AT_USER AT_USER AT_USER hi kim, what are your thoughts on this:

URL
rt AT_USER hi AT_USER you’ve been misinformed. my father worked at gm for 30+ yrs in quebec &amp; ontario before he ran for office. by…
AT_USER nothing to do with racism that’s just the intolerant loony identity politics lefties narrative. bri… URL
rt AT_USER he should withdraw his plea.
 he's not guilty. fbi is corrupt. they tricked him. we need justice for gen flynn. an ameri…
rt AT_USER i absolutely laugh 😂🤣😂 when these two tells us all they’re going to create jobs. neither guy, outside of politics, ever h…
rt AT_USER trump is signing a deal with china that no one has seen. no copy has been made available to congress or anyone else. nor…
the weaponization of woke pc id politics that has made this website so god, damn, insufferable, is on full display… URL
17.  cnn fails to ask a single immigration question URL trump news
rt AT_US

In [8]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index

print('Number of Unique Tokens',len(word_index))

('Number of Unique Tokens', 5817)


In [9]:
# DS - Let's save this out so we can use it later
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(word_index, dictionary_file)

In [10]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

('Shape of Data Tensor:', (1162, 1000))
('Shape of Label Tensor:', (1162, 4))


In [None]:
embeddings_index = {}
# f = open('glove.6B.100d.txt',encoding='utf8')
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

In [12]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [13]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [14]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
preds = Dense(len(macronum), activation='softmax')(l_lstm)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Bidirectional LSTM")
model.summary()

Bidirectional LSTM
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         581800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 804       
Total params: 743,404
Trainable params: 743,404
Non-trainable params: 0
_________________________________________________________________


In [15]:
cp=ModelCheckpoint('model_rnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
history=model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=3, batch_size=2,callbacks=[cp])

Train on 930 samples, validate on 232 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.28017, saving model to model_rnn.hdf5
Epoch 2/3

Epoch 00002: val_acc improved from 0.28017 to 0.29310, saving model to model_rnn.hdf5
Epoch 3/3

In [1]:
# DS - also need this
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

NameError: name 'model' is not defined

In [2]:
# DS - also need this
model.save_weights("model.h5")

NameError: name 'model' is not defined