In [1]:
import time
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, Embedding, Dropout, BatchNormalization, Conv1D, MaxPool1D, SpatialDropout1D, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.optimizers import Adam
from nltk.corpus import stopwords

Using TensorFlow backend.


## Data Exploration

In [2]:
tweets = pd.read_csv('tweets.csv')

In [3]:
tweets.head()

Unnamed: 0,X,tweet_text,directed_at,tweet_sentiment
0,1,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative
1,3,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive
2,5,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive
3,6,@teachntech00 New iPad Apps For #SpeechTherapy...,,neutral
4,7,,,


In [4]:
tweets.shape

(8936, 4)

In [5]:
negative_tweets = pd.read_csv('negative_tweet_supplement.csv')

In [6]:
negative_tweets.head()

Unnamed: 0,X,tweet_text,tweet_sentiment
0,11,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,negative
1,15,@apple Contact sync between Yosemite and iOS8 ...,negative
2,17,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,negative
3,24,"@Apple, For the love of GAWD, CENTER the '1'on...",negative
4,25,i get the storage almost full notification lit...,negative


In [7]:
negative_tweets.shape

(1219, 3)

In [8]:
airline_tweets = pd.read_csv('airline_tweets.csv')
airline_tweets.shape

(11567, 2)

In [9]:
airline_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,@VirginAmerica plus you've added commercials t...,positive
1,@VirginAmerica it's really aggressive to blast...,negative
2,@VirginAmerica and it's a really big bad thing...,negative
3,@VirginAmerica seriously would pay $30 a fligh...,negative
4,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [10]:
debate_tweets = pd.read_csv('gop_debate_tweets.csv')
debate_tweets.shape

(10262, 2)

In [11]:
debate_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,positive


## Data Preprocessing

In [12]:
tweets = tweets.drop('directed_at', axis=1)

In [13]:
all_tweets = tweets.append(negative_tweets)

In [14]:
all_tweets.shape

(10155, 3)

In [15]:
all_tweets = all_tweets.drop('X', axis=1)

In [16]:
all_tweets.shape

(10155, 2)

In [17]:
all_tweets = all_tweets.append(airline_tweets)
all_tweets = all_tweets.append(debate_tweets)

In [18]:
all_tweets.shape

(31984, 2)

In [19]:
def ingest(all_tweets):
    all_tweets = all_tweets[all_tweets['tweet_text'].isnull() == False]
    all_tweets = all_tweets[all_tweets['tweet_sentiment'].isnull() == False]
    return all_tweets

all_tweets = ingest(all_tweets)

In [20]:
all_tweets.shape

(31983, 2)

In [21]:
all_tweets[all_tweets['tweet_sentiment'] == 'negative'].shape, all_tweets[all_tweets['tweet_sentiment'] == 'neutral'].shape, all_tweets[all_tweets['tweet_sentiment'] == 'positive'].shape

((15138, 2), (10397, 2), (6448, 2))

In [29]:
def filter_stopwords(tweets):
    sw = stopwords.words('english')
    processed_tweets = []
    for tweet in tweets:
        text = ' '.join([word for word in tweet.split() if word not in sw])
        processed_tweets.append(text)
    
    return processed_tweets

[ '.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'
 '@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.'
 "@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)"
 '@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd'
 '#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan']

['.@wesley83 I 3G iPhone. After 3 hrs tweeting #RISE_Austin, dead! I need upgrade. Plugin stations #SXSW.', '@swonderlin Can wait #iPad 2 also. They sale #SXSW.', "@sxtxstate great stuff Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)", '@teachntech00 New iPad Apps For #SpeechTherapy And Communication A

In [79]:
#all_tweets['tweet_text'] = all_tweets['tweet_text'].apply(lambda x: x.lower())
#all_tweets['tweet_text'] = all_tweets['tweet_text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','', x)))

max_features = 2500
tokenizer = Tokenizer(num_words=max_features, split=' ')
tweets = filter_stopwords(all_tweets['tweet_text'].values)
tokenizer.fit_on_texts(tweets)

X = tokenizer.texts_to_sequences(tweets)
X = pad_sequences(X)

In [80]:
X.shape

(31983, 50)

In [81]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    3, 1702,   23,  480,   79,  330,  935,
         39, 1268,    3,   41,  628,    1], dtype=int32)

## Training Data

In [82]:
Y = pd.get_dummies(all_tweets['tweet_sentiment']).values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state = 42)

In [83]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28784, 50), (28784, 3), (3199, 50), (3199, 3))

In [84]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 1521, 2289, 1549, 1003,   14,   19,    8,
         57,    4,  770,  124,    1,   10], dtype=int32)

In [85]:
y_train[0]

array([0, 0, 1], dtype=uint8)

## Three models: Simple Neural Network, CNN, and LSTM

### Single hidden layer NN

In [86]:
embed_dim = 32

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1]))
model.add(Dropout(0.7))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.7))
model.add(Dense(3, activation='softmax'))

In [87]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 50, 32)            80000     
_________________________________________________________________
dropout_26 (Dropout)         (None, 50, 32)            0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_21 (Dense)             (None, 100)               160100    
_________________________________________________________________
batch_normalization_8 (Batch (None, 100)               400       
_________________________________________________________________
dropout_27 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 303       
Total para

In [88]:
batch_size = 64

early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_simple', verbose=1, save_best_only=True)

start = time.time()

model.fit(X_train, y_train, batch_size=batch_size, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model = load_model('tweet_sentiment_simple')

y_pred = model.predict(X_test, batch_size=64)
score = model.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.88193, saving model to tweet_sentiment_simple
3s - loss: 1.0199 - acc: 0.5345 - val_loss: 0.8819 - val_acc: 0.6342
Epoch 2/30
Epoch 00001: val_loss improved from 0.88193 to 0.76490, saving model to tweet_sentiment_simple
1s - loss: 0.8221 - acc: 0.6332 - val_loss: 0.7649 - val_acc: 0.6447
Epoch 3/30
Epoch 00002: val_loss improved from 0.76490 to 0.74209, saving model to tweet_sentiment_simple
1s - loss: 0.7748 - acc: 0.6456 - val_loss: 0.7421 - val_acc: 0.6468
Epoch 4/30
Epoch 00003: val_loss improved from 0.74209 to 0.72918, saving model to tweet_sentiment_simple
1s - loss: 0.7502 - acc: 0.6597 - val_loss: 0.7292 - val_acc: 0.6745
Epoch 5/30
Epoch 00004: val_loss improved from 0.72918 to 0.70953, saving model to tweet_sentiment_simple
1s - loss: 0.7220 - acc: 0.6816 - val_loss: 0.7095 - val_acc: 0.6891
Epoch 6/30
Epoch 00005: val_loss improved from 0.70953 to 0.70093, saving model 

### Single conv layer with max pooling

In [89]:
embed_dim = 128

model2 = Sequential()
model2.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1]))
model2.add(SpatialDropout1D(0.3))
model2.add(Dropout(0.6))
model2.add(Conv1D(32, 5, padding='same', activation='relu'))
model2.add(Dropout(0.5))
model2.add(MaxPool1D(5))
model2.add(Flatten())
model2.add(Dense(100, activation='relu'))
model2.add(BatchNormalization())
model2.add(Dropout(0.7))
model2.add(Dense(3, activation='softmax'))

In [90]:
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 50, 128)           320000    
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 50, 128)           0         
_________________________________________________________________
dropout_28 (Dropout)         (None, 50, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 50, 32)            20512     
_________________________________________________________________
dropout_29 (Dropout)         (None, 50, 32)            0         
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 10, 32)            0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 320)               0         
__________

In [91]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_cnn', verbose=1, save_best_only=True)

start = time.time()

model2.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model2 = load_model('tweet_sentiment_cnn')

y_pred = model2.predict(X_test, batch_size=64)
score = model2.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.87728, saving model to tweet_sentiment_cnn
9s - loss: 1.1141 - acc: 0.5120 - val_loss: 0.8773 - val_acc: 0.6266
Epoch 2/30
Epoch 00001: val_loss improved from 0.87728 to 0.79618, saving model to tweet_sentiment_cnn
8s - loss: 0.8079 - acc: 0.6344 - val_loss: 0.7962 - val_acc: 0.6499
Epoch 3/30
Epoch 00002: val_loss improved from 0.79618 to 0.78655, saving model to tweet_sentiment_cnn
10s - loss: 0.7644 - acc: 0.6486 - val_loss: 0.7865 - val_acc: 0.6450
Epoch 4/30
Epoch 00003: val_loss improved from 0.78655 to 0.77921, saving model to tweet_sentiment_cnn
8s - loss: 0.7441 - acc: 0.6540 - val_loss: 0.7792 - val_acc: 0.6464
Epoch 5/30
Epoch 00004: val_loss improved from 0.77921 to 0.75011, saving model to tweet_sentiment_cnn
8s - loss: 0.7207 - acc: 0.6710 - val_loss: 0.7501 - val_acc: 0.6714
Epoch 6/30
Epoch 00005: val_loss improved from 0.75011 to 0.73283, saving model to tweet_senti

### Multi-layer CNN

In [92]:
embed_dim = 32

model3 = Sequential()
model3.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1], embeddings_regularizer=l2(0.0001)))
model3.add(SpatialDropout1D(0.2))
model3.add(Dropout(0.6))

model3.add(Conv1D(128, 5, activation='relu'))
model3.add(Dropout(0.6))
model3.add(MaxPool1D())

model3.add(Flatten())

model3.add(Dense(256, activation='relu', kernel_regularizer=l2(0.0001)))
model3.add(Dropout(0.7))
model3.add(Dense(3, activation='softmax'))

In [93]:
#model3.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001, decay=1e-7), metrics=['accuracy'])
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 50, 32)            80000     
_________________________________________________________________
spatial_dropout1d_7 (Spatial (None, 50, 32)            0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 50, 32)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 46, 128)           20608     
_________________________________________________________________
dropout_32 (Dropout)         (None, 46, 128)           0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 23, 128)           0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 2944)              0         
__________

In [94]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_multicnn', verbose=1, save_best_only=True)

start = time.time()

model3.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model3 = load_model('tweet_sentiment_multicnn')

y_pred = model3.predict(X_test, batch_size=64)
score = model3.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.81498, saving model to tweet_sentiment_multicnn
11s - loss: 0.9122 - acc: 0.5905 - val_loss: 0.8150 - val_acc: 0.6495
Epoch 2/30
Epoch 00001: val_loss improved from 0.81498 to 0.78907, saving model to tweet_sentiment_multicnn
10s - loss: 0.7825 - acc: 0.6514 - val_loss: 0.7891 - val_acc: 0.6464
Epoch 3/30
Epoch 00002: val_loss improved from 0.78907 to 0.77503, saving model to tweet_sentiment_multicnn
10s - loss: 0.7587 - acc: 0.6615 - val_loss: 0.7750 - val_acc: 0.6471
Epoch 4/30
Epoch 00003: val_loss improved from 0.77503 to 0.74506, saving model to tweet_sentiment_multicnn
10s - loss: 0.7345 - acc: 0.6883 - val_loss: 0.7451 - val_acc: 0.6888
Epoch 5/30
Epoch 00004: val_loss improved from 0.74506 to 0.73621, saving model to tweet_sentiment_multicnn
10s - loss: 0.7121 - acc: 0.7102 - val_loss: 0.7362 - val_acc: 0.6950
Epoch 6/30
Epoch 00005: val_loss improved from 0.73621 to 0.72693

### LSTM

In [95]:
embed_dim = 96
lstm_out = 96

model3 = Sequential()
model3.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
model3.add(BatchNormalization())
model3.add(Dropout(0.5))
model3.add(LSTM(lstm_out, dropout=0.4, recurrent_dropout=0.3, activation='relu'))
model3.add(Dense(125, activation='relu'))
model3.add(Dropout(0.7))
model3.add(Dense(3, activation='softmax'))
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 50, 96)            240000    
_________________________________________________________________
batch_normalization_10 (Batc (None, 50, 96)            384       
_________________________________________________________________
dropout_34 (Dropout)         (None, 50, 96)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 96)                74112     
_________________________________________________________________
dense_27 (Dense)             (None, 125)               12125     
_________________________________________________________________
dropout_35 (Dropout)         (None, 125)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 3)                 378       
Total para

In [78]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_lstm', verbose=1, save_best_only=True)

start = time.time()

model3.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model3 = load_model('tweet_sentiment_lstm')

y_pred = model3.predict(X_test, batch_size=64)
score = model3.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 1.01078, saving model to tweet_sentiment_lstm
49s - loss: 0.9024 - acc: 0.5924 - val_loss: 1.0108 - val_acc: 0.6096
Epoch 2/30
Epoch 00001: val_loss improved from 1.01078 to 0.75497, saving model to tweet_sentiment_lstm
47s - loss: 0.7893 - acc: 0.6416 - val_loss: 0.7550 - val_acc: 0.6659
Epoch 3/30
Epoch 00002: val_loss improved from 0.75497 to 0.72229, saving model to tweet_sentiment_lstm
50s - loss: 0.7454 - acc: 0.6753 - val_loss: 0.7223 - val_acc: 0.6732
Epoch 4/30
Epoch 00003: val_loss improved from 0.72229 to 0.70923, saving model to tweet_sentiment_lstm
49s - loss: 0.7267 - acc: 0.6884 - val_loss: 0.7092 - val_acc: 0.6787
Epoch 5/30
Epoch 00004: val_loss improved from 0.70923 to 0.69887, saving model to tweet_sentiment_lstm
47s - loss: 0.7094 - acc: 0.6970 - val_loss: 0.6989 - val_acc: 0.6801
Epoch 6/30
Epoch 00005: val_loss improved from 0.69887 to 0.69239, saving model to tw

### Pre-trained Model (GloVe-based)

In [None]:
embeddings_index = {}
f = open(os.path.join('glove.twitter.27B', 'glove.twitter.27B.25d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

In [31]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 34043 unique tokens.


In [None]:
embed_dim = 32
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector