In [184]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, Embedding, Dropout, BatchNormalization, Conv1D, MaxPool1D, SpatialDropout1D, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.regularizers import l2
import re

## Data Exploration

In [185]:
tweets = pd.read_csv('tweets.csv')

In [186]:
tweets.head()

Unnamed: 0,X,tweet_text,directed_at,tweet_sentiment
0,1,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative
1,3,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive
2,5,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive
3,6,@teachntech00 New iPad Apps For #SpeechTherapy...,,neutral
4,7,,,


In [187]:
tweets.shape

(8936, 4)

In [188]:
negative_tweets = pd.read_csv('negative_tweet_supplement.csv')

In [189]:
negative_tweets.head()

Unnamed: 0,X,tweet_text,tweet_sentiment
0,11,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,negative
1,15,@apple Contact sync between Yosemite and iOS8 ...,negative
2,17,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,negative
3,24,"@Apple, For the love of GAWD, CENTER the '1'on...",negative
4,25,i get the storage almost full notification lit...,negative


In [190]:
negative_tweets.shape

(1219, 3)

In [191]:
airline_tweets = pd.read_csv('airline_tweets.csv')
airline_tweets.shape

(11567, 2)

In [192]:
airline_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,@VirginAmerica plus you've added commercials t...,positive
1,@VirginAmerica it's really aggressive to blast...,negative
2,@VirginAmerica and it's a really big bad thing...,negative
3,@VirginAmerica seriously would pay $30 a fligh...,negative
4,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [193]:
debate_tweets = pd.read_csv('gop_debate_tweets.csv')
debate_tweets.shape

(10262, 2)

In [194]:
debate_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,positive


## Data Preprocessing

In [195]:
# Drop 'directed_at': the supplementary negative-tweet frame has no such column,
# and no later cell reads it.
tweets = tweets.drop(columns=['directed_at'])

In [196]:
# Stack the supplementary negative tweets under the main set.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original row labels.
all_tweets = pd.concat([tweets, negative_tweets])

In [197]:
all_tweets.shape

(10155, 3)

In [198]:
# Remove 'X', which the airline and debate sets do not carry, so that only the
# text and the sentiment label remain.
all_tweets = all_tweets.drop(columns=['X'])

In [199]:
all_tweets.shape

(10155, 2)

In [200]:
# Add the airline and GOP-debate tweets in one step. DataFrame.append was removed
# in pandas 2.0; a single pd.concat yields the same rows in the same order as the
# two chained appends and avoids building an intermediate frame.
all_tweets = pd.concat([all_tweets, airline_tweets, debate_tweets])

In [201]:
all_tweets.shape

(31984, 2)

In [202]:
def ingest(all_tweets):
    """Return the rows of ``all_tweets`` that have both a text and a sentiment label.

    Parameters
    ----------
    all_tweets : pandas.DataFrame
        Frame containing 'tweet_text' and 'tweet_sentiment' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows where either column is null. The input
        frame is not modified and surviving rows keep their index labels.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    # One dropna over both columns replaces the two `isnull() == False` masks.
    return all_tweets.dropna(subset=['tweet_text', 'tweet_sentiment'])

all_tweets = ingest(all_tweets)

In [203]:
all_tweets.shape

(31983, 2)

In [204]:
# (rows, columns) of the subset belonging to each sentiment class, reported in
# the order negative, neutral, positive.
tuple(
    all_tweets[all_tweets['tweet_sentiment'] == label].shape
    for label in ('negative', 'neutral', 'positive')
)

((15138, 2), (10397, 2), (6448, 2))

In [205]:
# Manual normalisation is left disabled. If it is ever re-enabled, the character
# class below should read A-Z: the range A-z also lets through the ASCII
# punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `).
#all_tweets['tweet_text'] = all_tweets['tweet_text'].apply(lambda x: x.lower())
#all_tweets['tweet_text'] = all_tweets['tweet_text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','', x)))

# Vocabulary cap handed to the Tokenizer and reused as the Embedding input size.
max_features = 6000
tokenizer = Tokenizer(num_words=max_features, split=' ')

# NOTE(review): the vocabulary is fitted on every tweet, including rows that end
# up in the test split later on.
tweet_texts = all_tweets['tweet_text'].values
tokenizer.fit_on_texts(tweet_texts)

# No maxlen is given, so the padded width is decided by the data itself.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [206]:
X.shape

(31983, 50)

In [207]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    6,   34,    4, 1721,
         53,  103,  134,  382,  963,   16, 2639,   84,   23,   32, 1290,
          6,   88,    2,  666,   16,    3], dtype=int32)

## Training Data

In [208]:
# One-hot encode the sentiment labels (one column per class; presumably in sorted
# label order, i.e. negative, neutral, positive - confirm with the get_dummies
# column names if the order matters).
# pandas >= 2.0 returns boolean dummy columns, so the dtype is pinned to uint8 to
# keep the targets numeric regardless of the installed pandas version.
Y = pd.get_dummies(all_tweets['tweet_sentiment']).values.astype(np.uint8)

# Hold out 10% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [209]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28784, 50), (28784, 3), (3199, 50), (3199, 3))

In [210]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0, 1542,   35,  173, 2306, 3002, 1570,   45,   31, 3002,   20,
        109,    8,  808,  178,    3,   22], dtype=int32)

In [211]:
y_train[0]

array([0, 0, 1], dtype=uint8)

## Three models: Simple Neural Network, CNN, and LSTM

### Single hidden layer NN

In [212]:
embed_dim = 32

# Baseline network: embedding -> flatten -> one 100-unit dense layer -> 3-way
# softmax. Dropout (rate 0.7) is applied to the embedding output and again after
# the batch-normalised dense layer.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1]),
    Dropout(0.7),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [213]:
# Three-way softmax output, hence categorical cross-entropy as the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 50, 32)            192000    
_________________________________________________________________
dropout_73 (Dropout)         (None, 50, 32)            0         
_________________________________________________________________
flatten_29 (Flatten)         (None, 1600)              0         
_________________________________________________________________
dense_62 (Dense)             (None, 100)               160100    
_________________________________________________________________
batch_normalization_28 (Batc (None, 100)               400       
_________________________________________________________________
dropout_74 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_63 (Dense)             (None, 3)                 303       
Total para

In [214]:
batch_size = 64

# Stop once val_loss has gone 2 epochs without improving, and keep only the best
# model seen so far on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_simple', verbose=1, save_best_only=True)

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()

model.fit(X_train, y_train, batch_size=batch_size, validation_split=0.1, epochs=30, verbose=2,
          callbacks=[early_stopping, model_checkpoint])

end = time.perf_counter()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model = load_model('tweet_sentiment_simple')

# Predict with the same batch size as training instead of repeating the literal.
y_pred = model.predict(X_test, batch_size=batch_size)
# With metrics=['accuracy'] compiled in, evaluate() yields [loss, accuracy].
score = model.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.89832, saving model to tweet_sentiment_simple
6s - loss: 1.0496 - acc: 0.5126 - val_loss: 0.8983 - val_acc: 0.6110
Epoch 2/30
Epoch 00001: val_loss improved from 0.89832 to 0.75921, saving model to tweet_sentiment_simple
2s - loss: 0.8369 - acc: 0.6306 - val_loss: 0.7592 - val_acc: 0.6509
Epoch 3/30
Epoch 00002: val_loss improved from 0.75921 to 0.75578, saving model to tweet_sentiment_simple
2s - loss: 0.7683 - acc: 0.6545 - val_loss: 0.7558 - val_acc: 0.6415
Epoch 4/30
Epoch 00003: val_loss improved from 0.75578 to 0.73716, saving model to tweet_sentiment_simple
2s - loss: 0.7314 - acc: 0.6659 - val_loss: 0.7372 - val_acc: 0.6520
Epoch 5/30
Epoch 00004: val_loss did not improve
2s - loss: 0.7099 - acc: 0.6745 - val_loss: 0.7404 - val_acc: 0.6534
Epoch 6/30
Epoch 00005: val_loss did not improve
2s - loss: 0.6931 - acc: 0.6817 - val_loss: 0.7594 - val_acc: 0.6478
Epoch 7/30
Epoch 00

### Single conv layer with max pooling

In [215]:
embed_dim = 128

# Convolutional network: L2-regularised embedding -> spatial dropout -> one
# 32-filter, width-5 convolution -> dropout -> max pooling -> flatten ->
# 100-unit dense layer (batch-normalised, dropout 0.7) -> 3-way softmax.
model2 = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1],
              embeddings_regularizer=l2(0.001)),
    SpatialDropout1D(0.55),
    Conv1D(32, 5, padding='same', activation='relu'),
    Dropout(0.5),
    MaxPool1D(),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(3, activation='softmax'),
])

In [216]:
# Three-way softmax output, hence categorical cross-entropy as the loss.
model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_35 (Embedding)     (None, 50, 128)           768000    
_________________________________________________________________
spatial_dropout1d_8 (Spatial (None, 50, 128)           0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 50, 32)            20512     
_________________________________________________________________
dropout_75 (Dropout)         (None, 50, 32)            0         
_________________________________________________________________
max_pooling1d_19 (MaxPooling (None, 25, 32)            0         
_________________________________________________________________
flatten_30 (Flatten)         (None, 800)               0         
_________________________________________________________________
dense_64 (Dense)             (None, 100)               80100     
__________

In [217]:
# Named once so that fit() and predict() cannot drift apart.
batch_size = 64

# Stop once val_loss has gone 2 epochs without improving, and keep only the best
# model seen so far on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_cnn', verbose=1, save_best_only=True)

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()

model2.fit(X_train, y_train, batch_size=batch_size, validation_split=0.1, epochs=30, verbose=2,
           callbacks=[early_stopping, model_checkpoint])

end = time.perf_counter()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model2 = load_model('tweet_sentiment_cnn')

y_pred = model2.predict(X_test, batch_size=batch_size)
# With metrics=['accuracy'] compiled in, evaluate() yields [loss, accuracy].
score = model2.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 1.02878, saving model to tweet_sentiment_cnn
17s - loss: 1.1848 - acc: 0.5540 - val_loss: 1.0288 - val_acc: 0.6210
Epoch 2/30
Epoch 00001: val_loss improved from 1.02878 to 0.88474, saving model to tweet_sentiment_cnn
13s - loss: 0.9367 - acc: 0.6456 - val_loss: 0.8847 - val_acc: 0.6554
Epoch 3/30
Epoch 00002: val_loss improved from 0.88474 to 0.87436, saving model to tweet_sentiment_cnn
13s - loss: 0.8777 - acc: 0.6567 - val_loss: 0.8744 - val_acc: 0.6429
Epoch 4/30
Epoch 00003: val_loss improved from 0.87436 to 0.87097, saving model to tweet_sentiment_cnn
13s - loss: 0.8578 - acc: 0.6602 - val_loss: 0.8710 - val_acc: 0.6488
Epoch 5/30
Epoch 00004: val_loss improved from 0.87097 to 0.87083, saving model to tweet_sentiment_cnn
13s - loss: 0.8499 - acc: 0.6645 - val_loss: 0.8708 - val_acc: 0.6575
Epoch 6/30
Epoch 00005: val_loss did not improve
13s - loss: 0.8507 - acc: 0.6640 - val_lo

### LSTM

In [226]:
embed_dim = 64
lstm_out = 96

# Recurrent network: embedding -> spatial dropout -> single LSTM layer with input
# and recurrent dropout -> 3-way softmax.
# input_length is taken from X_train (the array actually passed to fit), as in the
# dense and convolutional models, rather than from the pre-split X.
model3 = Sequential([
    Embedding(max_features, embed_dim, input_length=X_train.shape[1]),
    SpatialDropout1D(0.7),
    LSTM(lstm_out, dropout=0.6, recurrent_dropout=0.6),
    Dense(3, activation='softmax'),
])
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_40 (Embedding)     (None, 50, 64)            384000    
_________________________________________________________________
spatial_dropout1d_13 (Spatia (None, 50, 64)            0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 96)                61824     
_________________________________________________________________
dense_70 (Dense)             (None, 3)                 291       
Total params: 446,115
Trainable params: 446,115
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Named once so that fit() and predict() cannot drift apart.
batch_size = 64

# Stop once val_loss has gone 2 epochs without improving, and keep only the best
# model seen so far on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_lstm', verbose=1, save_best_only=True)

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()

model3.fit(X_train, y_train, batch_size=batch_size, validation_split=0.1, epochs=30, verbose=2,
           callbacks=[early_stopping, model_checkpoint])

end = time.perf_counter()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model3 = load_model('tweet_sentiment_lstm')

y_pred = model3.predict(X_test, batch_size=batch_size)
# With metrics=['accuracy'] compiled in, evaluate() yields [loss, accuracy].
score = model3.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.78014, saving model to tweet_sentiment_lstm
56s - loss: 0.8905 - acc: 0.5965 - val_loss: 0.7801 - val_acc: 0.6440
Epoch 2/30
Epoch 00001: val_loss improved from 0.78014 to 0.74899, saving model to tweet_sentiment_lstm
52s - loss: 0.7761 - acc: 0.6488 - val_loss: 0.7490 - val_acc: 0.6499
Epoch 3/30
Epoch 00002: val_loss improved from 0.74899 to 0.72615, saving model to tweet_sentiment_lstm
52s - loss: 0.7368 - acc: 0.6682 - val_loss: 0.7261 - val_acc: 0.6832
Epoch 4/30
Epoch 00003: val_loss improved from 0.72615 to 0.70754, saving model to tweet_sentiment_lstm
51s - loss: 0.7097 - acc: 0.6841 - val_loss: 0.7075 - val_acc: 0.6978
Epoch 5/30
Epoch 00004: val_loss improved from 0.70754 to 0.68587, saving model to tweet_sentiment_lstm
51s - loss: 0.6816 - acc: 0.7014 - val_loss: 0.6859 - val_acc: 0.7082
Epoch 6/30
Epoch 00005: val_loss improved from 0.68587 to 0.68091, saving model to tw