In [58]:
import time
import pandas as pd
import numpy as np
from string import punctuation
from nltk import text
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, Embedding, Dropout, BatchNormalization, Conv1D, MaxPool1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.regularizers import l2

## Data Exploration

In [2]:
tweets = pd.read_csv('tweets.csv')

In [3]:
tweets.head()

Unnamed: 0,X,tweet_text,directed_at,tweet_sentiment
0,1,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative
1,3,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive
2,5,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive
3,6,@teachntech00 New iPad Apps For #SpeechTherapy...,,neutral
4,7,,,


In [4]:
tweets.shape

(8936, 4)

In [5]:
negative_tweets = pd.read_csv('negative_tweet_supplement.csv')

In [6]:
negative_tweets.head()

Unnamed: 0,X,tweet_text,tweet_sentiment
0,11,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,negative
1,15,@apple Contact sync between Yosemite and iOS8 ...,negative
2,17,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,negative
3,24,"@Apple, For the love of GAWD, CENTER the '1'on...",negative
4,25,i get the storage almost full notification lit...,negative


In [7]:
negative_tweets.shape

(1219, 3)

In [8]:
airline_tweets = pd.read_csv('airline_tweets.csv')
airline_tweets.shape

(11567, 2)

In [9]:
airline_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,@VirginAmerica plus you've added commercials t...,positive
1,@VirginAmerica it's really aggressive to blast...,negative
2,@VirginAmerica and it's a really big bad thing...,negative
3,@VirginAmerica seriously would pay $30 a fligh...,negative
4,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [10]:
debate_tweets = pd.read_csv('gop_debate_tweets.csv')
debate_tweets.shape

(10262, 2)

In [11]:
debate_tweets.head()

Unnamed: 0,tweet_text,tweet_sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,positive


## Data Preprocessing

In [12]:
tweets = tweets.drop('directed_at', axis=1)

In [13]:
all_tweets = tweets.append(negative_tweets)

In [14]:
all_tweets.shape

(10155, 3)

In [15]:
all_tweets = all_tweets.drop('X', axis=1)

In [16]:
all_tweets.shape

(10155, 2)

In [17]:
all_tweets = all_tweets.append(airline_tweets)
all_tweets = all_tweets.append(debate_tweets)

In [18]:
all_tweets.shape

(31984, 2)

In [19]:
def ingest(all_tweets):
    all_tweets = all_tweets[all_tweets['tweet_text'].isnull() == False]
    all_tweets = all_tweets[all_tweets['tweet_sentiment'].isnull() == False]
    all_tweets.reset_index(inplace=True)
    return all_tweets

all_tweets = ingest(all_tweets)

In [20]:
punctuations = list(punctuation)

def tokenize(tweet):
    try:
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: t not in punctuations, tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return list(tokens)
    except:
        return 'NC'

def postprocess(data):
    data['tokens'] = list(map(lambda x: tokenize(x), data['tweet_text']))
    return data

In [21]:
all_tweets = postprocess(all_tweets)

In [22]:
all_tweets.head()

Unnamed: 0,index,tweet_text,tweet_sentiment,tokens
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,negative,"[i, have, a, 3g, iphone, after, 3, hrs, tweeti..."
1,1,@swonderlin Can not wait for #iPad 2 also. The...,positive,"[can, not, wait, for, 2, also, they, should, s..."
2,2,@sxtxstate great stuff on Fri #SXSW: Marissa M...,positive,"[great, stuff, on, fri, marissa, mayer, google..."
3,3,@teachntech00 New iPad Apps For #SpeechTherapy...,neutral,"[new, ipad, apps, for, and, communication, are..."
4,5,"#SXSW is just starting, #CTIA is around the co...",positive,"[is, just, starting, is, around, the, corner, ..."


In [24]:
all_tokens = [item for sublist in all_tweets.tokens.values for item in sublist]

In [25]:
len(all_tokens)

490727

In [26]:
len(set(all_tokens))

22546

In [None]:
t = text.Text(set(all_tokens))

In [None]:
t.index('hostage')

In [None]:
len(t)

In [27]:
token_dict = {k: v for v, k in enumerate(set(all_tokens))}

In [29]:
fd = FreqDist(all_tokens)
len(fd)

22546

In [30]:
fd.most_common(20)

[('the', 16134),
 ('to', 13972),
 ('a', 8821),
 ('rt', 7918),
 ('i', 7620),
 ('for', 7265),
 ('and', 6608),
 ('is', 6251),
 ('of', 5884),
 ('on', 5860),
 ('you', 5734),
 ('in', 5690),
 ('at', 5020),
 ('my', 4341),
 ('link', 4320),
 ('it', 3436),
 ('flight', 3085),
 ('that', 2815),
 ('with', 2796),
 ('this', 2748)]

In [31]:
list(map(lambda x: token_dict.get(x), all_tweets.tokens[0]))

[2806,
 18537,
 6855,
 15532,
 22145,
 4917,
 1287,
 22115,
 1427,
 6436,
 6691,
 7600,
 8261,
 2806,
 20873,
 5505,
 169,
 1452,
 11903,
 6436]

In [34]:
list(token_dict.keys())[list(token_dict.values()).index(2806)]

'i'

In [35]:
def get_word_indices(tokens):
    idxs = []
    for t in tokens:
        idxs.append(token_dict.get(t))
    return idxs

get_word_indices(all_tweets.tokens[0])

[2806,
 18537,
 6855,
 15532,
 22145,
 4917,
 1287,
 22115,
 1427,
 6436,
 6691,
 7600,
 8261,
 2806,
 20873,
 5505,
 169,
 1452,
 11903,
 6436]

In [36]:
all_tweets['indexed_text'] = list(map(lambda x: get_word_indices(x), all_tweets.tokens))

In [37]:
all_tweets.head()

Unnamed: 0,index,tweet_text,tweet_sentiment,tokens,indexed_text
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,negative,"[i, have, a, 3g, iphone, after, 3, hrs, tweeti...","[2806, 18537, 6855, 15532, 22145, 4917, 1287, ..."
1,1,@swonderlin Can not wait for #iPad 2 also. The...,positive,"[can, not, wait, for, 2, also, they, should, s...","[3720, 6192, 7632, 17459, 21173, 21235, 11543,..."
2,2,@sxtxstate great stuff on Fri #SXSW: Marissa M...,positive,"[great, stuff, on, fri, marissa, mayer, google...","[9490, 8855, 2358, 6083, 4351, 2227, 4547, 207..."
3,3,@teachntech00 New iPad Apps For #SpeechTherapy...,neutral,"[new, ipad, apps, for, and, communication, are...","[7394, 15164, 6661, 17459, 19353, 19467, 1435,..."
4,5,"#SXSW is just starting, #CTIA is around the co...",positive,"[is, just, starting, is, around, the, corner, ...","[12997, 5224, 10116, 12997, 11240, 21619, 5707..."


In [38]:
all_tweets.drop(['index', 'tweet_text', 'tokens'], axis=1, inplace=True)

## Training Data

In [39]:
X_train, X_test, y_train, y_test = train_test_split(np.asarray(all_tweets.indexed_text), 
                                                    np.asarray(all_tweets.tweet_sentiment),
                                                    test_size = 0.1)

In [40]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28784,), (28784,), (3199,), (3199,))

In [41]:
X_train[0]

[19561,
 6691,
 11171,
 3397,
 10388,
 1285,
 2806,
 22425,
 6855,
 21440,
 2358,
 21619,
 2122,
 18322,
 20656,
 16860]

In [42]:
y_train[0]

'positive'

In [80]:
X_train = sequence.pad_sequences(X_train, maxlen=120, value=0)
X_test = sequence.pad_sequences(X_test, maxlen=120, value=0)

In [44]:
le = LabelEncoder()
le.fit(['negative', 'neutral', 'positive'])

y_train = le.transform(y_train)
y_test = le.transform(y_test)

y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [45]:
y_train.shape

(28784, 3)

## Create simple models
### Single hidden layer NN
The simplest model that tends to give reasonable results is a single hidden layer net. So let's try that. Note that we can't expect to get any useful results by feeding word ids directly into a neural net - so instead we use an embedding to replace them with a vector of 32 (initially random) floats for each word in the vocab.

In [213]:
model = Sequential()
model.add(Embedding(X_train.shape[0], 32, input_length=120, embeddings_regularizer=l2(0.002)))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(200, kernel_regularizer=l2(0.001), activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.65))
model.add(Dense(3, activation='softmax'))

In [214]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_56 (Embedding)     (None, 120, 32)           921088    
_________________________________________________________________
dropout_155 (Dropout)        (None, 120, 32)           0         
_________________________________________________________________
batch_normalization_18 (Batc (None, 120, 32)           128       
_________________________________________________________________
flatten_56 (Flatten)         (None, 3840)              0         
_________________________________________________________________
dense_111 (Dense)            (None, 200)               768200    
_________________________________________________________________
batch_normalization_19 (Batc (None, 200)               800       
_________________________________________________________________
dropout_156 (Dropout)        (None, 200)               0         
__________

In [215]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_simple', verbose=1, save_best_only=True)

start = time.time()

model.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=10, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model = load_model('tweet_sentiment_simple')

y_pred = model.predict(X_test, batch_size=64)
score = model.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/10
Epoch 00000: val_loss improved from inf to 1.45085, saving model to tweet_sentiment_simple
18s - loss: 1.8450 - acc: 0.5126 - val_loss: 1.4508 - val_acc: 0.6110
Epoch 2/10
Epoch 00001: val_loss improved from 1.45085 to 1.16064, saving model to tweet_sentiment_simple
12s - loss: 1.2465 - acc: 0.6627 - val_loss: 1.1606 - val_acc: 0.6818
Epoch 3/10
Epoch 00002: val_loss improved from 1.16064 to 1.08555, saving model to tweet_sentiment_simple
12s - loss: 1.0968 - acc: 0.7167 - val_loss: 1.0855 - val_acc: 0.7065
Epoch 4/10
Epoch 00003: val_loss improved from 1.08555 to 1.06279, saving model to tweet_sentiment_simple
12s - loss: 1.0089 - acc: 0.7454 - val_loss: 1.0628 - val_acc: 0.7068
Epoch 5/10
Epoch 00004: val_loss did not improve
12s - loss: 0.9619 - acc: 0.7616 - val_loss: 1.0876 - val_acc: 0.6989
Epoch 6/10
Epoch 00005: val_loss did not improve
12s - loss: 0.9337 - acc: 0.7746 - val_loss: 1.0712 - val_acc: 0.7093
Epoch 7/10
Ep

### Single conv layer with max pooling
A CNN is likely to work better, since it's designed to take advantage of ordered data. We'll need to use a 1D CNN, since a sequence of words is 1D.

In [182]:
model2 = Sequential()
model2.add(Embedding(X_train.shape[0], 32, input_length=120, embeddings_regularizer=l2(0.003)))
model2.add(Dropout(0.5))
model2.add(Conv1D(40, 5, padding='same', activation='relu'))
model2.add(Dropout(0.5))
model2.add(MaxPool1D())
model2.add(Flatten())
model2.add(Dense(225, kernel_regularizer=l2(0.002), activation='relu'))
model2.add(BatchNormalization())
model2.add(Dropout(0.7))
model2.add(Dense(3, activation='softmax'))

In [183]:
model2.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_45 (Embedding)     (None, 120, 32)           921088    
_________________________________________________________________
dropout_132 (Dropout)        (None, 120, 32)           0         
_________________________________________________________________
conv1d_44 (Conv1D)           (None, 120, 40)           6440      
_________________________________________________________________
dropout_133 (Dropout)        (None, 120, 40)           0         
_________________________________________________________________
max_pooling1d_44 (MaxPooling (None, 60, 40)            0         
_________________________________________________________________
flatten_45 (Flatten)         (None, 2400)              0         
_________________________________________________________________
dense_89 (Dense)             (None, 225)               540225    
__________

In [184]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='tweet_sentiment_cnn', verbose=1, save_best_only=True)

start = time.time()

model2.fit(X_train, y_train, batch_size=64, validation_split=0.1, epochs=30, verbose=2, 
          callbacks=[early_stopping, model_checkpoint])

end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model2 = load_model('tweet_sentiment_cnn')

y_pred = model2.predict(X_test, batch_size=64)
score = model2.evaluate(X_test, y_test, verbose=1)

print(score)

Train on 25905 samples, validate on 2879 samples
Epoch 1/30
Epoch 00000: val_loss improved from inf to 1.61902, saving model to tweet_sentiment_cnn
18s - loss: 2.4082 - acc: 0.4544 - val_loss: 1.6190 - val_acc: 0.4658
Epoch 2/30
Epoch 00001: val_loss improved from 1.61902 to 1.15735, saving model to tweet_sentiment_cnn
14s - loss: 1.3105 - acc: 0.5572 - val_loss: 1.1573 - val_acc: 0.5662
Epoch 3/30
Epoch 00002: val_loss improved from 1.15735 to 1.05563, saving model to tweet_sentiment_cnn
14s - loss: 1.0284 - acc: 0.6023 - val_loss: 1.0556 - val_acc: 0.5384
Epoch 4/30
Epoch 00003: val_loss improved from 1.05563 to 0.92162, saving model to tweet_sentiment_cnn
15s - loss: 0.9268 - acc: 0.6233 - val_loss: 0.9216 - val_acc: 0.6356
Epoch 5/30
Epoch 00004: val_loss improved from 0.92162 to 0.85454, saving model to tweet_sentiment_cnn
14s - loss: 0.8704 - acc: 0.6349 - val_loss: 0.8545 - val_acc: 0.6398
Epoch 6/30
Epoch 00005: val_loss improved from 0.85454 to 0.84533, saving model to tweet_s