In [1]:
# Input locations. Despite the *_DIR suffix, every one of these values is handed
# straight to open() by later cells, i.e. each one is used as a file path.
TWEETS_DIR = "./obama-trump-data/tweets"        # pickled tweets
LABELS_DIR = "./obama-trump-data/labels_np"     # pickled labels
EMBEDDINGS_DIR = "./glove.6B.50d.txt"           # text file of word vectors

In [2]:
import pickle

# Load tweets, labels.
# The files are opened in `with` blocks so the handles are closed as soon as
# unpickling finishes (or fails).
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this on data files obtained from a trusted source.
with open(TWEETS_DIR, 'rb') as tweets_file:
    tweets = pickle.load(tweets_file)
with open(LABELS_DIR, 'rb') as labels_file:
    labels = pickle.load(labels_file)

# Sample some tweets to display: every 10th entry among the first 100.
# The upper bound is clamped so a smaller dataset does not raise IndexError.
for i in range(0, min(100, len(tweets), len(labels)), 10):
    print ("Tweet: ", tweets[i], ". Label: ", labels[i])

Tweet:  "America should be very proud." President Obama #LoveWins . Label:  0
Tweet:  TO ALL AMERICANS
https://t.co/D7Es6ie4fY . Label:  1
Tweet:  No deal was made last night on DACA. Massive border security would have to be agreed to in exchange for consent. Would be subject to vote. . Label:  1
Tweet:  I was viciously attacked by Mr. Khan at the Democratic Convention. Am I not allowed to respond? Hillary voted for the Iraq war, not me! . Label:  1
Tweet:  .@RudyGiuliani, one of the finest people I know and a former GREAT Mayor of N.Y.C., just took himself out of consideration for "State". . Label:  1
Tweet:  The #TPP establishes the highest labor standards of any trade agreement in history. http://t.co/3vqoancyYp . Label:  0
Tweet:  Thoughts and prayers to the great people of Indiana. You will prevail! . Label:  1
Tweet:  Hillary said she was under sniper fire (while surrounded by USSS.) Turned out to be a total lie. She is not fit to https://t.co/hBIrGj21l6 . Label:  1
Tweet:  The f

In [3]:
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit a tokenizer on the tweets, then turn every tweet into a sequence of
# integer word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(tweets)

print('Found %s unique tokens (words).' % len(word_index))

# Pad the sequences so that every sample ends up with the same length
training_data = pad_sequences(sequences)

# training_data.shape = (num. tweets, max. length of tweet)
num_tweets, max_tweet_length = training_data.shape
print("Training data size is (%d,%d)"  % (num_tweets, max_tweet_length))
print("Labels are size %d"  % labels.shape)

Using TensorFlow backend.


Found 13049 unique tokens (words).
Training data size is (6136,31)
Labels are size 6136


In [4]:
# Put one raw tweet next to its row in the tokenized, padded training data
SAMPLE_TWEET = 0  # position of the tweet to display
print ("Original tweet: ", tweets[SAMPLE_TWEET])
print ("Tweet after tokenization and padding: ", training_data[SAMPLE_TWEET])

Original tweet:  "America should be very proud." President Obama #LoveWins
Tweet after tokenization and padding:  [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0  44  88  21  83 391  12  14 794]


In [5]:
# Convert words to word embedding vectors

EMBEDDING_DIM = 50
print('Indexing word vectors.')

import os

# Parse the embeddings file into {word: vector}. Each non-empty line is split on
# whitespace: the first field is the word, the remaining fields its coefficients.
embeddings_index = {}
# The encoding is given explicitly so parsing does not depend on the platform's
# default encoding (GloVe text files are distributed as UTF-8 -- confirm if a
# different embeddings file is substituted). `with` guarantees the file is
# closed even when a line fails to parse.
with open(EMBEDDINGS_DIR, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# prepare word embedding matrix
# Row i receives the vector of the word whose tokenizer index is i. The matrix
# starts as zeros, so words not found in the embedding index stay all-zeros.
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Safety guard against indices outside the matrix; presumably never taken
    # while num_words is derived from len(word_index) as above.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Indexing word vectors.
Found 400000 word vectors.


In [6]:
# Sample word embedding vector:
# The row is addressed through the looked-up index rather than a hard-coded
# number, so the two printed values stay consistent if the vocabulary changes.
sample_word_index = word_index["computer"] # retrieve word index
print(sample_word_index)
print(embedding_matrix[sample_word_index]) # use word index to retrieve word embedding vector

2645
[ 0.079084   -0.81503999  1.79009998  0.91653001  0.10797    -0.55628002
 -0.84426999 -1.49510002  0.13417999  0.63626999  0.35146001  0.25813001
 -0.55028999  0.51055998  0.37408999  0.12092    -1.61660004  0.83653003
  0.14202    -0.52348     0.73452997  0.12207    -0.49079001  0.32532999
  0.45306    -1.58500004 -0.63848001 -1.00530005  0.10454    -0.42984
  3.18099999 -0.62186998  0.16819    -1.01390004  0.064058    0.57844001
 -0.45559999  0.73782998  0.37202999 -0.57722002  0.66441     0.055129
  0.037891    1.32749999  0.30991     0.50696999  1.23570001  0.1274
 -0.11434     0.20709001]


In [7]:
from keras.models import Sequential
from keras.layers import Embedding, Input
from keras.layers.merge import Concatenate
from keras.layers.core import Dense, Activation, Flatten
from keras.layers import Dropout, concatenate
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras import metrics
from keras.models import Model
import pickle

In [8]:
model = Sequential()
# Add pre-trained embedding layer 
# converts word indices to GloVe word embedding vectors as they're fed in.
# trainable=False keeps the pre-trained vectors fixed during training.
model.add(Embedding(len(word_index) + 1,
                    EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=training_data.shape[1],
                    trainable=False))

# Single LSTM layer that returns only its final output (return_sequences=False),
# followed by one sigmoid unit for the binary label.
model.add(LSTM(units = 50, activation = 'tanh', recurrent_activation = 'hard_sigmoid', return_sequences = False))
model.add(Dense(units = 1))
model.add(Activation('sigmoid'))
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() emitted an extra "None" line after the table.
model.summary()

LOSS = 'binary_crossentropy'
OPTIMIZER = 'RMSprop'

model.compile(loss = LOSS, optimizer = OPTIMIZER, metrics = [metrics.binary_accuracy])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 31, 50)            652500    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 672,751
Trainable params: 20,251
Non-trainable params: 652,500
_________________________________________________________________
None


In [9]:
EPOCHS = 8
BATCH_SIZE = 10

# Keep the History object returned by fit() so the per-epoch loss/accuracy can be
# inspected or plotted later, instead of leaving it only in the cell's Out[] repr.
# NOTE(review): per the Keras docs, validation_split holds out the *last* 20% of
# the arrays as passed in (taken before any shuffling) -- confirm that
# tweets/labels are not stored in a label-sorted order.
history = model.fit(training_data, labels, 
                    epochs = EPOCHS, 
                    batch_size = BATCH_SIZE, 
                    validation_split =.2)

Train on 4908 samples, validate on 1228 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x11e0a72b0>