In [1]:
import re
def process_tweet(tweet = ""):
    """Normalize a raw tweet into upper-case words separated by single spaces.

    Steps, in order:
      1. upper-case the text;
      2. drop HTML entities such as ``&AMP;``;
      3. drop apostrophes so contractions stay one word (``DON'T`` -> ``DONT``);
      4. replace @mentions and #hashtags with a space;
      5. drop every character that is not ``A-Z`` or a space;
      6. collapse runs of spaces and strip the ends.

    Args:
        tweet: the raw tweet text (``str``).

    Returns:
        The cleaned text, containing only ``A-Z`` and single spaces.
    """
    cleaned = tweet.upper()
    cleaned = re.sub(r'&\w+;',   '',  cleaned)
    cleaned = re.sub(r"'",       '',  cleaned)
    # Mentions/hashtags are removed wherever they occur. Requiring a trailing
    # space (as before) missed them at the end of the tweet or before
    # punctuation, so only the sigil was stripped and the handle/tag leaked
    # into the text as an ordinary word.
    cleaned = re.sub(r'[@#]\w+', ' ', cleaned)
    cleaned = re.sub(r'[^A-Z ]', '',  cleaned)
    cleaned = re.sub(r' +',      ' ', cleaned)
    return cleaned.strip()

In [2]:
# Import `data` from file
filename = "training.1600000.processed.noemoticon.csv"
columns  = ['polarity','id','date','query','user','text']

import os, pandas as pd, numpy as np
# Column layout of the CSV (names are supplied via `names=columns` below):
#   0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
#   1 - the id of the tweet (2087)
#   2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
#   3 - the query (lyx). If there is no query, then this value is NO_QUERY.
#   4 - the user that tweeted (robotickilldozr)
#   5 - the text of the tweet (Lyx is cool)


df = pd.read_csv(filename, names=columns)
df = df[['polarity','text']]    # Filter out relevant columns
df = df[df.polarity != 2]       # Do not consider neutral polarity
df = df.sample(frac=1)          # Randomize order
# Use Series.map rather than the builtin map(): on Python 3 the builtin returns
# a lazy iterator with no len(), which cannot be assigned as a column.
# Series.map applies process_tweet element-wise and keeps the shuffled index.
df['text'] = df['text'].map(process_tweet)

In [3]:
from tflearn.data_utils import VocabularyProcessor, to_categorical

# Vocabulary settings.
max_tweet_length = 40     # document length handed to VocabularyProcessor
min_frequency    = 1000   # author's note: 100 -> 82.12% (presumably accuracy -- confirm); 50 -> not recorded

tweets = df.text.values

# Fit the vocabulary on the cleaned tweets, then turn each tweet into word ids.
vp = VocabularyProcessor(max_tweet_length, min_frequency=min_frequency)
vp = vp.fit(tweets)
tweets_parsed = vp.transform(tweets)

# Number of entries in the fitted vocabulary mapping; used as the embedding input size.
vocab_size = len(vp.vocabulary_._mapping)

# Materialize the transformed tweets and build the two-class one-hot targets:
# polarity 0 -> class 0, any other polarity -> class 1.
X = list(tweets_parsed)
Y = to_categorical([int(p != 0) for p in df.polarity.values], 2)


In [4]:
# Construct the model
import tflearn

# Network: word ids -> embedding -> two stacked LSTMs -> 2-way softmax.
net = tflearn.input_data(shape=[None, max_tweet_length])
net = tflearn.embedding(
    net,
    input_dim=vocab_size,
    output_dim=256,
)
# The first LSTM returns its full output sequence so the second LSTM can consume it.
net = tflearn.lstm(
    net,
    256,
    dropout=0.8,
    return_seq=True,
)
net = tflearn.lstm(
    net,
    256,
    dropout=0.8,
)
net = tflearn.fully_connected(
    net,
    2,
    activation='softmax',
)
net = tflearn.regression(
    net,
    optimizer='adam',
    learning_rate=0.001,
    loss='categorical_crossentropy',
)

# Wrap the graph in a trainable model; summaries are written to ./tensorboard.
model = tflearn.DNN(
    net,
    tensorboard_verbose=3,
    tensorboard_dir='./tensorboard',
)

In [None]:
# Train on (X, Y), holding out validation_set=0.1 of the data and reporting
# the metric while training.
model.fit(
    X,
    Y,
    validation_set=0.1,
    show_metric=True,
    batch_size=256,
)

Training Step: 25638  | total loss: 0.40012 | time: 322.857s
| Adam | epoch: 005 | loss: 0.40012 - acc: 0.8143 -- iter: 0803328/1440000


In [7]:
# Save so that can be loaded later
# Both file names embed min_frequency, so artifacts produced with different
# vocabulary thresholds do not overwrite each other.
vp.save("tweets-%d.vocabulary" % min_frequency)
model.save("tweets-%d.model" % min_frequency)   # trained model

INFO:tensorflow:/home/tensorflow/tweets-100.model is not in all_model_checkpoint_paths. Manually adding it.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'list' object has no attribute 'name'
