In [24]:
import time
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, Embedding, Dropout, BatchNormalization, Conv1D, MaxPool1D, SpatialDropout1D, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.optimizers import Adam
from nltk.corpus import stopwords

## Data Exploration

In [3]:
# Main labelled tweet set; per the preview shown below it carries X, tweet_text,
# directed_at and tweet_sentiment columns.
tweets = pd.read_csv('tweets.csv')

In [4]:
tweets.head()

Unnamed: 0,X,tweet_text,directed_at,tweet_sentiment
0,1,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative
1,3,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive
2,5,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive
3,6,@teachntech00 New iPad Apps For #SpeechTherapy...,,neutral
4,7,,,


In [5]:
tweets.shape

(8936, 4)

In [6]:
# Supplementary tweet set (X, tweet_text, tweet_sentiment per the preview below).
# NOTE(review): the file name and preview suggest every row is labelled negative — confirm.
negative_tweets = pd.read_csv('negative_tweet_supplement.csv')

In [7]:
negative_tweets.head()

Unnamed: 0,X,tweet_text,tweet_sentiment
0,11,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,negative
1,15,@apple Contact sync between Yosemite and iOS8 ...,negative
2,17,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,negative
3,24,"@Apple, For the love of GAWD, CENTER the '1'on...",negative
4,25,i get the storage almost full notification lit...,negative


In [8]:
negative_tweets.shape

(1219, 3)

In [9]:
# Airline tweets; the trailing expression displays the frame's (rows, columns) shape.
airline_tweets = pd.read_csv('airline_tweets.csv')
airline_tweets.shape

(11567, 2)

In [10]:
airline_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,@VirginAmerica plus you've added commercials t...,positive
1,@VirginAmerica it's really aggressive to blast...,negative
2,@VirginAmerica and it's a really big bad thing...,negative
3,@VirginAmerica seriously would pay $30 a fligh...,negative
4,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [11]:
# GOP debate tweets; the trailing expression displays the frame's (rows, columns) shape.
debate_tweets = pd.read_csv('gop_debate_tweets.csv')
debate_tweets.shape

(10262, 2)

In [12]:
debate_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,positive


## Data Preprocessing

In [13]:
# Drop the column the supplementary negative set does not have, so the two
# frames share the same columns before they are stacked.
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
tweets = tweets.drop('directed_at', axis=1, errors='ignore')

In [14]:
# Stack the supplementary negative tweets under the main set.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in
# pandas 2.0; like append, it keeps each frame's original row labels.
all_tweets = pd.concat([tweets, negative_tweets])

In [15]:
all_tweets.shape

(10155, 3)

In [16]:
# Drop the X column, which the airline and debate frames appended later do not have.
# errors='ignore' makes the cell safe to re-run after the column has been removed.
all_tweets = all_tweets.drop('X', axis=1, errors='ignore')

In [17]:
all_tweets.shape

(10155, 2)

In [18]:
# Add the airline and debate tweets in one concatenation.
# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0) and avoids building an intermediate frame.
# NOTE: re-running this cell stacks the extra frames again and duplicates rows.
all_tweets = pd.concat([all_tweets, airline_tweets, debate_tweets])

In [19]:
all_tweets.shape

(31984, 2)

In [20]:
def ingest(all_tweets):
    """Drop rows that are missing either the tweet text or the sentiment label.

    Parameters
    ----------
    all_tweets : pandas.DataFrame
        Frame containing ``tweet_text`` and ``tweet_sentiment`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows where both columns are non-null.
        The input frame is not modified and surviving rows keep their index
        labels.
    """
    # dropna(subset=...) expresses "both columns present" directly, instead of
    # two successive `isnull() == False` boolean masks.
    return all_tweets.dropna(subset=['tweet_text', 'tweet_sentiment'])

all_tweets = ingest(all_tweets)

In [21]:
all_tweets.shape

(31983, 2)

In [23]:
# Shape of the subset belonging to each sentiment class, in
# negative / neutral / positive order.
tuple(
    all_tweets[all_tweets['tweet_sentiment'] == label].shape
    for label in ('negative', 'neutral', 'positive')
)

((15138, 2), (10397, 2), (6448, 2))

In [50]:
def remove_stopwords(tknzr):
    """Remove English stopwords from a fitted tokenizer's vocabulary, in place.

    Every key of ``tknzr.word_index`` that appears in NLTK's English stopword
    list is popped from the mapping. The remaining words keep the integer ids
    they already had; ids are NOT renumbered, so the id space has gaps after
    this call. The vocabulary size is printed before and after the removal.

    Parameters
    ----------
    tknzr : object
        A tokenizer exposing a ``word_index`` dict (word -> integer id).

    Returns
    -------
    object
        The same ``tknzr`` instance, with its ``word_index`` mutated.
    """
    # A set gives constant-time membership tests; the raw NLTK list would be
    # scanned from the start for every vocabulary word.
    sw = set(stopwords.words('english'))

    # Collect the matches first: entries cannot be popped from the dict while
    # it is being iterated.
    words_to_remove = [word for word in tknzr.word_index if word in sw]

    print(len(tknzr.word_index))
    for w in words_to_remove:
        tknzr.word_index.pop(w)

    print(len(tknzr.word_index))

    return tknzr

    

In [74]:
# Earlier manual lower-casing and regex clean-up steps are disabled; the raw
# tweet text is handed straight to the tokenizer.
# NOTE(review): Keras' Tokenizer is documented to lower-case and strip
# punctuation by default — confirm against the installed version.

max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(all_tweets['tweet_text'].values)
# Stopwords are popped only after fitting, so the surviving words keep the ids
# they were assigned while the stopwords were still part of the vocabulary.
tokenizer = remove_stopwords(tokenizer)

X = tokenizer.texts_to_sequences(all_tweets['tweet_text'].values)
# pad_sequences is called without maxlen; the resulting width is 46 per the
# shape displayed below, with zeros filling the front of shorter rows.
X = pad_sequences(X)

34043
33908


In [75]:
X.shape

(31983, 46)

In [76]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 1721,   53,  134,  382,  963, 2639,   84, 1290,   88,
        666,    3], dtype=int32)

## Training Data

In [77]:
# One-hot encode the sentiment labels, then hold out 10% of the rows for testing.
sentiment_dummies = pd.get_dummies(all_tweets['tweet_sentiment'])
Y = sentiment_dummies.values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [78]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28784, 46), (28784, 3), (3199, 46), (3199, 3))

In [79]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       1542, 2306, 3002, 1570,   31, 3002,   20,  109,    8,  808,  178,
          3,   22], dtype=int32)

In [80]:
y_train[0]

array([0, 0, 1], dtype=uint8)

## Three model families: Simple Neural Network, CNN (two variants), and LSTM

### Single hidden layer NN

In [81]:
embed_dim = 32

# Baseline: embed each token, flatten the whole sequence, and classify through
# a single hidden dense layer into the three sentiment classes.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1]),
    Dropout(0.7),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [82]:
# Targets are 3-column one-hot rows, hence categorical cross-entropy; accuracy
# is reported alongside the loss.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 46, 32)            160000    
_________________________________________________________________
dropout_15 (Dropout)         (None, 46, 32)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 1472)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 100)               147300    
_________________________________________________________________
batch_normalization_6 (Batch (None, 100)               400       
_________________________________________________________________
dropout_16 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 303       
Total para

In [83]:
batch_size = 64

# Stop once val_loss has gone 2 epochs without improving, and only write a
# checkpoint when val_loss improves.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_simple', verbose=1, save_best_only=True)

start = time.time()

model.fit(X_train, y_train, batch_size=batch_size, validation_split=0.1, epochs=30, verbose=2,
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model = load_model('tweet_sentiment_simple')

# Reuse batch_size instead of repeating the literal 64, so training and
# prediction cannot silently drift apart when the value is tuned.
y_pred = model.predict(X_test, batch_size=batch_size)
score = model.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.88087, saving model to tweet_sentiment_simple
3s - loss: 1.0201 - acc: 0.5316 - val_loss: 0.8809 - val_acc: 0.6186
Epoch 2/30
Epoch 00001: val_loss improved from 0.88087 to 0.75575, saving model to tweet_sentiment_simple
1s - loss: 0.8158 - acc: 0.6340 - val_loss: 0.7558 - val_acc: 0.6384
Epoch 3/30
Epoch 00002: val_loss improved from 0.75575 to 0.74093, saving model to tweet_sentiment_simple
1s - loss: 0.7651 - acc: 0.6473 - val_loss: 0.7409 - val_acc: 0.6481
Epoch 4/30
Epoch 00003: val_loss did not improve
1s - loss: 0.7347 - acc: 0.6566 - val_loss: 0.7473 - val_acc: 0.6422
Epoch 5/30
Epoch 00004: val_loss did not improve
1s - loss: 0.7142 - acc: 0.6683 - val_loss: 0.7417 - val_acc: 0.6523
Epoch 6/30
Epoch 00005: val_loss improved from 0.74093 to 0.73834, saving model to tweet_sentiment_simple
1s - loss: 0.6910 - acc: 0.6833 - val_loss: 0.7383 - val_acc: 0.6586
Epoch 7/30
Epoch 00

### Single conv layer with max pooling

In [84]:
embed_dim = 128

# Embedding -> one same-padded Conv1D (32 filters, width 5) -> max pooling over
# windows of 5 -> dense classifier, with dropout between the stages.
model2 = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1]),
    SpatialDropout1D(0.3),
    Dropout(0.6),
    Conv1D(32, 5, padding='same', activation='relu'),
    Dropout(0.5),
    MaxPool1D(5),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [85]:
# Same loss / optimizer / metric configuration as the baseline model.
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 46, 128)           640000    
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 46, 128)           0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 46, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 46, 32)            20512     
_________________________________________________________________
dropout_18 (Dropout)         (None, 46, 32)            0         
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 9, 32)             0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 288)               0         
__________

In [86]:
# Stop when val_loss has not improved for 2 epochs; a checkpoint is written
# only when val_loss improves.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(
    filepath='tweet_sentiment_cnn',
    verbose=1,
    save_best_only=True,
)

start = time.time()
model2.fit(
    X_train,
    y_train,
    batch_size=64,
    validation_split=0.1,
    epochs=30,
    verbose=2,
    callbacks=[early_stopping, model_checkpoint],
)
end = time.time()

elapsed_seconds = end - start
print("Model took %0.2f seconds to train" % elapsed_seconds)

# Swap the in-memory model for the best checkpoint before scoring on the test split.
model2 = load_model('tweet_sentiment_cnn')

y_pred = model2.predict(X_test, batch_size=64)
score = model2.evaluate(X_test, y_test, verbose=1)
print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.89667, saving model to tweet_sentiment_cnn
10s - loss: 1.0730 - acc: 0.5185 - val_loss: 0.8967 - val_acc: 0.5992
Epoch 2/30
Epoch 00001: val_loss improved from 0.89667 to 0.87036, saving model to tweet_sentiment_cnn
10s - loss: 0.8124 - acc: 0.6381 - val_loss: 0.8704 - val_acc: 0.5929
Epoch 3/30
Epoch 00002: val_loss improved from 0.87036 to 0.82251, saving model to tweet_sentiment_cnn
10s - loss: 0.7586 - acc: 0.6521 - val_loss: 0.8225 - val_acc: 0.6144
Epoch 4/30
Epoch 00003: val_loss improved from 0.82251 to 0.81529, saving model to tweet_sentiment_cnn
9s - loss: 0.7294 - acc: 0.6590 - val_loss: 0.8153 - val_acc: 0.6259
Epoch 5/30
Epoch 00004: val_loss improved from 0.81529 to 0.78933, saving model to tweet_sentiment_cnn
9s - loss: 0.7093 - acc: 0.6703 - val_loss: 0.7893 - val_acc: 0.6322
Epoch 6/30
Epoch 00005: val_loss improved from 0.78933 to 0.76904, saving model to tweet_sen

### Multi-layer CNN (currently a single, wider Conv1D layer with L2 regularization)

In [87]:
embed_dim = 32

# One Conv1D block (128 filters, width 5) between an L2-regularised embedding
# and an L2-regularised dense layer, with dropout after each stage.
model3 = Sequential([
    Embedding(
        max_features,
        embed_dim,
        input_length=X_train.shape[1],
        embeddings_regularizer=l2(0.0001),
    ),
    SpatialDropout1D(0.2),
    Dropout(0.6),
    Conv1D(128, 5, activation='relu'),
    Dropout(0.6),
    MaxPool1D(),
    Flatten(),
    Dense(256, activation='relu', kernel_regularizer=l2(0.0001)),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [89]:
# Alternative kept for reference: an explicit Adam(lr=0.001, decay=1e-7)
# optimizer. The active call below passes the string 'adam' instead.
#model3.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001, decay=1e-7), metrics=['accuracy'])
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 46, 32)            160000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 46, 32)            0         
_________________________________________________________________
dropout_20 (Dropout)         (None, 46, 32)            0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 42, 128)           20608     
_________________________________________________________________
dropout_21 (Dropout)         (None, 42, 128)           0         
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 21, 128)           0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 2688)              0         
__________

In [90]:
# Stop when val_loss has not improved for 2 epochs; a checkpoint is written
# only when val_loss improves.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(
    filepath='tweet_sentiment_multicnn',
    verbose=1,
    save_best_only=True,
)

start = time.time()
model3.fit(
    X_train,
    y_train,
    batch_size=64,
    validation_split=0.1,
    epochs=30,
    verbose=2,
    callbacks=[early_stopping, model_checkpoint],
)
end = time.time()

elapsed_seconds = end - start
print("Model took %0.2f seconds to train" % elapsed_seconds)

# Swap the in-memory model for the best checkpoint before scoring on the test split.
model3 = load_model('tweet_sentiment_multicnn')

y_pred = model3.predict(X_test, batch_size=64)
score = model3.evaluate(X_test, y_test, verbose=1)
print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.82521, saving model to tweet_sentiment_multicnn
11s - loss: 0.9132 - acc: 0.5883 - val_loss: 0.8252 - val_acc: 0.6464
Epoch 2/30
Epoch 00001: val_loss improved from 0.82521 to 0.78536, saving model to tweet_sentiment_multicnn
10s - loss: 0.7755 - acc: 0.6527 - val_loss: 0.7854 - val_acc: 0.6520
Epoch 3/30
Epoch 00002: val_loss improved from 0.78536 to 0.76476, saving model to tweet_sentiment_multicnn
11s - loss: 0.7437 - acc: 0.6680 - val_loss: 0.7648 - val_acc: 0.6516
Epoch 4/30
Epoch 00003: val_loss improved from 0.76476 to 0.73620, saving model to tweet_sentiment_multicnn
11s - loss: 0.7107 - acc: 0.7047 - val_loss: 0.7362 - val_acc: 0.7051
Epoch 5/30
Epoch 00004: val_loss improved from 0.73620 to 0.73070, saving model to tweet_sentiment_multicnn
10s - loss: 0.6819 - acc: 0.7295 - val_loss: 0.7307 - val_acc: 0.7002
Epoch 6/30
Epoch 00005: val_loss did not improve
10s - loss: 0.66

### LSTM

In [93]:
embed_dim = 96
lstm_out = 96

# NOTE: this rebinds `model3`, so the CNN built under that name earlier is no
# longer reachable from the notebook namespace once this cell runs (its saved
# checkpoint file is not touched here).
model3 = Sequential([
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    BatchNormalization(),
    Dropout(0.5),
    LSTM(lstm_out, dropout=0.4, recurrent_dropout=0.3, activation='relu'),
    Dense(125, activation='relu'),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 46, 96)            480000    
_________________________________________________________________
batch_normalization_9 (Batch (None, 46, 96)            384       
_________________________________________________________________
dropout_25 (Dropout)         (None, 46, 96)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 96)                74112     
_________________________________________________________________
dense_21 (Dense)             (None, 125)               12125     
_________________________________________________________________
dropout_26 (Dropout)         (None, 125)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 378       
Total para

In [94]:
# Stop when val_loss has not improved for 2 epochs; a checkpoint is written
# only when val_loss improves.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(
    filepath='tweet_sentiment_lstm',
    verbose=1,
    save_best_only=True,
)

start = time.time()
model3.fit(
    X_train,
    y_train,
    batch_size=64,
    validation_split=0.1,
    epochs=30,
    verbose=2,
    callbacks=[early_stopping, model_checkpoint],
)
end = time.time()

elapsed_seconds = end - start
print("Model took %0.2f seconds to train" % elapsed_seconds)

# Swap the in-memory model for the best checkpoint before scoring on the test split.
model3 = load_model('tweet_sentiment_lstm')

y_pred = model3.predict(X_test, batch_size=64)
score = model3.evaluate(X_test, y_test, verbose=1)
print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 1.01317, saving model to tweet_sentiment_lstm
62s - loss: 0.8941 - acc: 0.5929 - val_loss: 1.0132 - val_acc: 0.6162
Epoch 2/30
Epoch 00001: val_loss improved from 1.01317 to 0.73060, saving model to tweet_sentiment_lstm
58s - loss: 0.7513 - acc: 0.6593 - val_loss: 0.7306 - val_acc: 0.6822
Epoch 3/30
Epoch 00002: val_loss improved from 0.73060 to 0.69222, saving model to tweet_sentiment_lstm
64s - loss: 0.6769 - acc: 0.7086 - val_loss: 0.6922 - val_acc: 0.7016
Epoch 4/30
Epoch 00003: val_loss did not improve
54s - loss: 0.6285 - acc: 0.7360 - val_loss: 0.7070 - val_acc: 0.7023
Epoch 5/30
Epoch 00004: val_loss improved from 0.69222 to 0.68446, saving model to tweet_sentiment_lstm
54s - loss: 0.5932 - acc: 0.7538 - val_loss: 0.6845 - val_acc: 0.7051
Epoch 6/30
Epoch 00005: val_loss did not improve
54s - loss: 0.5701 - acc: 0.7644 - val_loss: 0.7052 - val_acc: 0.7068
Epoch 7/30
Epoch 0000

### Pre-trained Model (GloVe-based)

In [None]:
# Build a word -> vector lookup from the GloVe text file, where each line is a
# token followed by its float coefficients.
embeddings_index = {}
glove_path = os.path.join('glove.twitter.27B', 'glove.twitter.27B.25d.txt')
# The context manager closes the file even if a malformed line raises partway
# through; the bare open()/close() pair leaked the handle in that case.
# NOTE(review): the file is read with the platform default text encoding —
# confirm that matches the GloVe file on the target machine.
with open(glove_path) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [31]:
# word_index maps each vocabulary word to its integer id.
# The count printed here is the number of entries still present; once
# remove_stopwords() has popped entries without renumbering, the largest id can
# be greater than this count.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 34043 unique tokens.


In [None]:
# Take the vector width from the loaded GloVe vectors (the 25d file) instead of
# hard-coding it: the previous value of 32 did not match, so the row assignment
# below failed with a shape-mismatch ValueError.
embed_dim = len(next(iter(embeddings_index.values())))

# Size the matrix by the largest token id rather than len(word_index): after
# remove_stopwords() pops entries the ids are no longer contiguous, so
# len(word_index) + 1 rows would be too few and indexing would raise IndexError.
num_rows = max(word_index.values()) + 1
embedding_matrix = np.zeros((num_rows, embed_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector