In [1]:
import time
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, Embedding, Dropout, BatchNormalization, Conv1D, MaxPool1D, SpatialDropout1D, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.optimizers import Adam

Using TensorFlow backend.


## Data Exploration

In [3]:
tweets = pd.read_csv('tweets.csv')

In [4]:
tweets.head()

Unnamed: 0,X,tweet_text,directed_at,tweet_sentiment
0,1,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative
1,3,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive
2,5,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive
3,6,@teachntech00 New iPad Apps For #SpeechTherapy...,,neutral
4,7,,,


In [5]:
tweets.shape

(8936, 4)

In [6]:
negative_tweets = pd.read_csv('negative_tweet_supplement.csv')

In [7]:
negative_tweets.head()

Unnamed: 0,X,tweet_text,tweet_sentiment
0,11,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,negative
1,15,@apple Contact sync between Yosemite and iOS8 ...,negative
2,17,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,negative
3,24,"@Apple, For the love of GAWD, CENTER the '1'on...",negative
4,25,i get the storage almost full notification lit...,negative


In [8]:
negative_tweets.shape

(1219, 3)

In [9]:
airline_tweets = pd.read_csv('airline_tweets.csv')
airline_tweets.shape

(11567, 2)

In [10]:
airline_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,@VirginAmerica plus you've added commercials t...,positive
1,@VirginAmerica it's really aggressive to blast...,negative
2,@VirginAmerica and it's a really big bad thing...,negative
3,@VirginAmerica seriously would pay $30 a fligh...,negative
4,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [11]:
debate_tweets = pd.read_csv('gop_debate_tweets.csv')
debate_tweets.shape

(10262, 2)

In [12]:
debate_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,positive


## Data Preprocessing

In [13]:
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
tweets = tweets.drop('directed_at', axis=1, errors='ignore')

In [14]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the same rows in the same order and, like append, keeps each frame's
# own index labels.
all_tweets = pd.concat([tweets, negative_tweets])

In [15]:
all_tweets.shape

(10155, 3)

In [16]:
# errors='ignore' makes re-running this cell harmless once 'X' has been dropped.
all_tweets = all_tweets.drop('X', axis=1, errors='ignore')

In [17]:
all_tweets.shape

(10155, 2)

In [18]:
# pd.concat replaces the removed DataFrame.append and stacks both extra corpora
# in a single pass (same rows, same order as two chained appends).
# NOTE: all_tweets is extended from itself, so executing this cell twice would
# add the airline and debate rows a second time.
all_tweets = pd.concat([all_tweets, airline_tweets, debate_tweets])

In [19]:
all_tweets.shape

(31984, 2)

In [20]:
def ingest(all_tweets):
    """Return only the rows that have both a tweet text and a sentiment label.

    Parameters
    ----------
    all_tweets : pandas.DataFrame
        Frame containing 'tweet_text' and 'tweet_sentiment' columns.

    Returns
    -------
    pandas.DataFrame
        New frame without the rows where either column is missing. The input
        frame is not modified and surviving rows keep their index labels.

    Raises
    ------
    KeyError
        If either required column is absent.
    """
    # dropna(subset=...) replaces the two chained `isnull() == False` masks;
    # missing values in any other column do not cause a row to be dropped.
    return all_tweets.dropna(subset=['tweet_text', 'tweet_sentiment'])

all_tweets = ingest(all_tweets)

In [21]:
all_tweets.shape

(31983, 2)

In [22]:
# Shape of each sentiment subset, in the order negative, neutral, positive.
tuple(
    all_tweets[all_tweets['tweet_sentiment'] == label].shape
    for label in ('negative', 'neutral', 'positive')
)

((15138, 2), (10397, 2), (6448, 2))

In [23]:
# No manual lower-casing or punctuation stripping is done here: the Keras
# Tokenizer applies both by default (lower=True and its default `filters`).
# NOTE(review): the vocabulary is fitted on every tweet, including those that
# end up in the test split created further down.
max_features = 5000  # vocabulary cap handed to the Tokenizer as num_words

tweet_texts = all_tweets['tweet_text'].values
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(tweet_texts)

# pad_sequences brings every encoded tweet to a common length so X is 2-D.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [24]:
X.shape

(31983, 50)

In [25]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    6,   34,    4, 1721,
         53,  103,  134,  382,  963,   16, 2639,   84,   23,   32, 1290,
          6,   88,    2,  666,   16,    3], dtype=int32)

## Training Data

In [26]:
# One-hot encode the sentiment labels (one column per class), then hold out
# 10% of the rows as a fixed-seed test split.
sentiment_dummies = pd.get_dummies(all_tweets['tweet_sentiment'])
Y = sentiment_dummies.values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [27]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28784, 50), (28784, 3), (3199, 50), (3199, 3))

In [28]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0, 1542,   35,  173, 2306, 3002, 1570,   45,   31, 3002,   20,
        109,    8,  808,  178,    3,   22], dtype=int32)

In [29]:
y_train[0]

array([0, 0, 1], dtype=uint8)

## Three models: Simple Neural Network, CNN, and LSTM

### Single hidden layer NN

In [None]:
embed_dim = 32

# Baseline classifier: the learned embeddings are flattened and passed through a
# single 100-unit hidden layer before the 3-class softmax output.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1]),
    Dropout(0.7),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Mini-batch size used for training the baseline network.
batch_size = 64

# Stop once val_loss has gone `patience` epochs without improving, and only
# write a checkpoint when val_loss improves (save_best_only=True).
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_simple', verbose=1, save_best_only=True)

start = time.time()

# validation_split=0.1 holds back a tenth of X_train to compute val_loss.
model.fit(X_train, y_train, batch_size=batch_size, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model = load_model('tweet_sentiment_simple')

# Score the restored model on the held-out test split.
# NOTE(review): y_pred is not used within this cell - confirm a later cell needs it.
y_pred = model.predict(X_test, batch_size=64)
score = model.evaluate(X_test, y_test, verbose=1)

# presumably [test loss, test accuracy], given metrics=['accuracy'] at compile time
print(score)

### Single conv layer with max pooling

In [None]:
embed_dim = 128

# Single convolution block (32 filters, width 5) with max pooling, followed by
# the same 100-unit dense head and 3-class softmax as the baseline model.
model2 = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1]),
    SpatialDropout1D(0.3),
    Dropout(0.6),
    Conv1D(32, 5, padding='same', activation='relu'),
    Dropout(0.5),
    MaxPool1D(5),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [None]:
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

In [None]:
# Stop once val_loss has gone `patience` epochs without improving, and only
# write a checkpoint when val_loss improves (save_best_only=True).
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_cnn', verbose=1, save_best_only=True)

start = time.time()

# validation_split=0.1 holds back a tenth of X_train to compute val_loss.
model2.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model2 = load_model('tweet_sentiment_cnn')

# Score the restored model on the held-out test split.
# NOTE(review): y_pred is not used within this cell - confirm a later cell needs it.
y_pred = model2.predict(X_test, batch_size=64)
score = model2.evaluate(X_test, y_test, verbose=1)

# presumably [test loss, test accuracy], given metrics=['accuracy'] at compile time
print(score)

### Multi-layer CNN

In [177]:
embed_dim = 32

# Wider CNN variant with L2 penalties on the embedding and dense weights.
# Despite the section title it contains a single Conv1D layer (128 filters).
model3 = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1],
              embeddings_regularizer=l2(0.0001)),
    SpatialDropout1D(0.2),
    Dropout(0.6),
    # convolution block
    Conv1D(128, 5, activation='relu'),
    Dropout(0.6),
    MaxPool1D(),
    Flatten(),
    # classifier head
    Dense(256, activation='relu', kernel_regularizer=l2(0.0001)),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [178]:
#model3.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001, decay=1e-7), metrics=['accuracy'])
model3.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_50 (Embedding)     (None, 50, 32)            160000    
_________________________________________________________________
spatial_dropout1d_48 (Spatia (None, 50, 32)            0         
_________________________________________________________________
dropout_173 (Dropout)        (None, 50, 32)            0         
_________________________________________________________________
conv1d_92 (Conv1D)           (None, 46, 128)           20608     
_________________________________________________________________
dropout_174 (Dropout)        (None, 46, 128)           0         
_________________________________________________________________
max_pooling1d_75 (MaxPooling (None, 23, 128)           0         
_________________________________________________________________
flatten_43 (Flatten)         (None, 2944)              0         
__________

In [180]:
# Stop once val_loss has gone `patience` epochs without improving, and only
# write a checkpoint when val_loss improves (save_best_only=True).
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_multicnn', verbose=1, save_best_only=True)

start = time.time()

# validation_split=0.1 holds back a tenth of X_train to compute val_loss.
# NOTE(review): the recorded log already starts at loss ~0.65 in epoch 1, which
# suggests model3 had been trained before this run - rebuild the model first
# when a from-scratch timing or score is wanted.
model3.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model3 = load_model('tweet_sentiment_multicnn')

# Score the restored model on the held-out test split.
# NOTE(review): y_pred is not used within this cell - confirm a later cell needs it.
y_pred = model3.predict(X_test, batch_size=64)
score = model3.evaluate(X_test, y_test, verbose=1)

# presumably [test loss, test accuracy], given metrics=['accuracy'] at compile time
print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.69594, saving model to tweet_sentiment_multicnn
13s - loss: 0.6493 - acc: 0.7389 - val_loss: 0.6959 - val_acc: 0.7131
Epoch 2/30
Epoch 00001: val_loss did not improve
11s - loss: 0.6518 - acc: 0.7381 - val_loss: 0.6984 - val_acc: 0.7127
Epoch 3/30
Epoch 00002: val_loss improved from 0.69594 to 0.69433, saving model to tweet_sentiment_multicnn
11s - loss: 0.6477 - acc: 0.7412 - val_loss: 0.6943 - val_acc: 0.7110
Epoch 4/30
Epoch 00003: val_loss did not improve
11s - loss: 0.6460 - acc: 0.7450 - val_loss: 0.6980 - val_acc: 0.7110
Epoch 5/30
Epoch 00004: val_loss did not improve
11s - loss: 0.6476 - acc: 0.7406 - val_loss: 0.6985 - val_acc: 0.7152
Epoch 6/30
Epoch 00005: val_loss did not improve
11s - loss: 0.6433 - acc: 0.7449 - val_loss: 0.6965 - val_acc: 0.7166
Model took 72.11 seconds to train


### LSTM

In [46]:
embed_dim = 64
lstm_out = 96

# Recurrent classifier: embeddings -> batch norm -> LSTM -> 75-unit dense head.
# NOTE(review): this rebinds `model3`, replacing the CNN built in the previous
# section; the name is kept because the LSTM training cell refers to it.
model3 = Sequential([
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    BatchNormalization(),
    Dropout(0.5),
    LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.3, activation='relu'),
    Dense(75, activation='relu'),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])
model3.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 50, 64)            320000    
_________________________________________________________________
batch_normalization_9 (Batch (None, 50, 64)            256       
_________________________________________________________________
dropout_12 (Dropout)         (None, 50, 64)            0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 96)                61824     
_________________________________________________________________
dense_11 (Dense)             (None, 75)                7275      
_________________________________________________________________
dropout_13 (Dropout)         (None, 75)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 228       
Total para

In [47]:
# Stop once val_loss has gone `patience` epochs without improving, and only
# write a checkpoint when val_loss improves (save_best_only=True).
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_lstm', verbose=1, save_best_only=True)

start = time.time()

# validation_split=0.1 holds back a tenth of X_train to compute val_loss.
model3.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model3 = load_model('tweet_sentiment_lstm')

# Score the restored model on the held-out test split.
# NOTE(review): y_pred is not used within this cell - confirm a later cell needs it.
y_pred = model3.predict(X_test, batch_size=64)
score = model3.evaluate(X_test, y_test, verbose=1)

# presumably [test loss, test accuracy], given metrics=['accuracy'] at compile time
print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 1.00594, saving model to tweet_sentiment_lstm
61s - loss: 0.9313 - acc: 0.5772 - val_loss: 1.0059 - val_acc: 0.6235
Epoch 2/30
Epoch 00001: val_loss improved from 1.00594 to 0.74491, saving model to tweet_sentiment_lstm
58s - loss: 0.7781 - acc: 0.6480 - val_loss: 0.7449 - val_acc: 0.6447
Epoch 3/30
Epoch 00002: val_loss improved from 0.74491 to 0.71361, saving model to tweet_sentiment_lstm
58s - loss: 0.7277 - acc: 0.6707 - val_loss: 0.7136 - val_acc: 0.6843
Epoch 4/30
Epoch 00003: val_loss improved from 0.71361 to 0.70809, saving model to tweet_sentiment_lstm
56s - loss: 0.6924 - acc: 0.6925 - val_loss: 0.7081 - val_acc: 0.6933
Epoch 5/30
Epoch 00004: val_loss improved from 0.70809 to 0.69929, saving model to tweet_sentiment_lstm
54s - loss: 0.6625 - acc: 0.7114 - val_loss: 0.6993 - val_acc: 0.7051
Epoch 6/30
Epoch 00005: val_loss improved from 0.69929 to 0.68780, saving model to tw

### Pre-trained Model (GloVe-based)

In [None]:
# Parse the GloVe text file into {word: vector}. Each line is read as a token
# followed by its whitespace-separated float components.
embeddings_index = {}
glove_path = os.path.join('glove.twitter.27B', 'glove.twitter.27B.25d.txt')

# `with` closes the file even if a line fails to parse, and the encoding is
# pinned to UTF-8 so decoding does not depend on the platform's default locale.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [31]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 34043 unique tokens.


In [None]:
# The matrix width must equal the width of the GloVe vectors loaded above (the
# file is the "25d" variant). The previous hard-coded 32 made the row assignment
# below fail with a shape-mismatch ValueError, so the width is now read from the
# loaded vectors and stays correct if a different GloVe file is used.
embed_dim = len(next(iter(embeddings_index.values())))

# Row i holds the pre-trained vector of the word the tokenizer mapped to index i;
# row 0 is left as zeros (word_index has no entry 0, hence the + 1).
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # words not found in embedding index will be all-zeros.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector