# Deep Learning for Twitter Data: LSTM network

![](https://hub.packtpub.com/wp-content/uploads/2018/03/Sentiment-Analysis-Tw.png)

## Step 1. Import Data 

In [76]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as pyplot

In [77]:
pd.set_option("display.max_colwidth", 90)
# pd.set_option("display.max_rows", 101)

In [78]:
cols = ["sentiment","id","date","query","user","text"]

In [79]:
training_data = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", header=None, names=cols)
training_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David C..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't..."


In [80]:
test_data = pd.read_csv("testdata.manual.2009.06.14.csv", encoding = "ISO-8859-1",  header=None, names=cols)
test_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fa..."
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs is good read.
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fucking rocks!!!"
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had mine for a few months and never looked ...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2 and I think it's perfect :)


In [81]:
test_data.shape

(498, 6)

In [82]:
training_data.isnull().sum()

sentiment    0
id           0
date         0
query        0
user         0
text         0
dtype: int64

In [83]:
training_data.shape

(1600000, 6)

In [84]:
training_data.set_index("id", inplace=True)

In [85]:
training_data.head()

Unnamed: 0_level_0,sentiment,date,query,user,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1467810369,0,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David C..."
1467810672,0,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result ...
1467810917,0,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
1467811184,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
1467811193,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't..."


In [86]:
training_data.text[:5]

id
1467810369    @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David C...
1467810672    is upset that he can't update his Facebook by texting it... and might cry as a result ...
1467810917    @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
1467811184                                              my whole body feels itchy and like its on fire 
1467811193    @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't...
Name: text, dtype: object

In [87]:
training_data.drop(["date","query","user"], axis=1, inplace=True)

In [88]:
training_data.head()

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467810369,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David C..."
1467810672,0,is upset that he can't update his Facebook by texting it... and might cry as a result ...
1467810917,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
1467811184,0,my whole body feels itchy and like its on fire
1467811193,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't..."


In [89]:
test_data.drop(["date","query","user"], axis=1, inplace=True)
test_data.set_index("id", inplace=True)
test_data.head()

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,4,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fa..."
4,4,Reading my kindle2... Love it... Lee childs is good read.
5,4,"Ok, first assesment of the #kindle2 ...it fucking rocks!!!"
6,4,@kenburbary You'll love your Kindle2. I've had mine for a few months and never looked ...
7,4,@mikefish Fair enough. But i have the Kindle2 and I think it's perfect :)


In [90]:
all_data = pd.concat([training_data, test_data])
len(all_data)

1600498

In [91]:
all_data.head()

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467810369,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David C..."
1467810672,0,is upset that he can't update his Facebook by texting it... and might cry as a result ...
1467810917,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
1467811184,0,my whole body feels itchy and like its on fire
1467811193,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't..."


#  Clean Data

In [18]:
import re
import string
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akr712/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def clean_tweet(tweet): 
    tweet = str(tweet)
    tweet = BeautifulSoup(tweet, 'lxml')
    tweet = tweet.get_text()
    tweet = re.sub('http\S+\s*', '', tweet)  # remove URLs
    tweet = re.sub('RT|cc', '', tweet)  # remove RT and cc
    tweet = re.sub('#\S+', '', tweet)  # remove hashtags
    tweet = re.sub('@\S+', '', tweet)  # remove mentions
    tweet = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), '', tweet)  # remove punctuations
    tweet = re.sub('\s+', ' ', tweet)  # remove extra whitespace
    tweet = tweet.split() 
    word_list = []
    for word in tweet:
        word = word.lower()
        if word not in stop_words and word.isalpha():
            word_list.append(word)
    new_tweet = " ".join(word_list)
    return new_tweet

In [92]:
all_data.text = all_data.text.astype(str)

In [93]:
texts = list(all_data.text)

In [94]:
texts[:5]

["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
 '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 "@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "]

In [95]:
cleaned_tweet = []
for tweet in texts:
    cleaned_tweet.append(clean_tweet(tweet))

In [96]:
cleaned_tweet

['awww thats bummer shoulda got david carr third day',
 'upset cant update facebook texting might cry result school today also blah',
 'dived many times ball managed save rest go bounds',
 'whole body feels itchy like fire',
 'behaving im mad cant see',
 'whole crew',
 'need hug',
 'hey long time see yes rains bit bit lol im fine thanks hows',
 'nope didnt',
 'que muera',
 'spring break plain city snowing',
 'repierced ears',
 'couldnt bear watch thought ua loss embarrassing',
 'counts idk either never talk anymore',
 'wouldve first didnt gun really though zac snyders doucheclown',
 'wish got watch miss premiere',
 'hollis death scene hurt severely watch film wry directors cut',
 'file taxes',
 'ahh ive always wanted see rent love soundtrack',
 'oh dear drinking forgotten table drinks',
 'day didnt get much done',
 'one friend called asked meet mid valley todaybut ive time sigh',
 'baked cake ated',
 'week going hoped',
 'blagh class tomorrow',
 'hate call wake people',
 'going cry sle

In [98]:
all_data["cleaned_tweet"] = cleaned_tweet

In [99]:
all_data.to_csv("cleaned_tweet_data.csv", encoding="utf_8_sig")

In [100]:
all_data.head()

Unnamed: 0_level_0,sentiment,text,cleaned_tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1467810369,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David C...",awww thats bummer shoulda got david carr third day
1467810672,0,is upset that he can't update his Facebook by texting it... and might cry as a result ...,upset cant update facebook texting might cry result school today also blah
1467810917,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,dived many times ball managed save rest go bounds
1467811184,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
1467811193,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't...",behaving im mad cant see


In [101]:
cleaned_tweet

['awww thats bummer shoulda got david carr third day',
 'upset cant update facebook texting might cry result school today also blah',
 'dived many times ball managed save rest go bounds',
 'whole body feels itchy like fire',
 'behaving im mad cant see',
 'whole crew',
 'need hug',
 'hey long time see yes rains bit bit lol im fine thanks hows',
 'nope didnt',
 'que muera',
 'spring break plain city snowing',
 'repierced ears',
 'couldnt bear watch thought ua loss embarrassing',
 'counts idk either never talk anymore',
 'wouldve first didnt gun really though zac snyders doucheclown',
 'wish got watch miss premiere',
 'hollis death scene hurt severely watch film wry directors cut',
 'file taxes',
 'ahh ive always wanted see rent love soundtrack',
 'oh dear drinking forgotten table drinks',
 'day didnt get much done',
 'one friend called asked meet mid valley todaybut ive time sigh',
 'baked cake ated',
 'week going hoped',
 'blagh class tomorrow',
 'hate call wake people',
 'going cry sle

In [102]:
MAX_TWEET_LEN = 0

for tweet in cleaned_tweet:
    tweet = tweet.split()
    if len(tweet) > MAX_TWEET_LEN:
        MAX_TWEET_LEN = len(tweet)

In [103]:
MAX_TWEET_LEN

35

In [108]:
VOCAB_SIZE = len(id_word_dictionary)
VOCAB_SIZE

386140

In [109]:
all_data.to_csv("all_nonnum_tweet_data.csv", encoding="utf_8_sig")

In [110]:
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, GRU, Dense, Bidirectional, Dropout, Reshape, LSTM, 
from sklearn.model_selection import train_test_split

In [111]:
all_data.sentiment.value_counts()

4    800182
0    800177
2       139
Name: sentiment, dtype: int64

In [112]:
binarizer = {4: 1}
all_data.sentiment.replace(binarizer, inplace=True)

In [115]:
all_data.sentiment.unique()

array([0, 1, 2])

In [117]:
all_data.head()

Unnamed: 0_level_0,sentiment,text,cleaned_tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1467810369,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David C...",awww thats bummer shoulda got david carr third day
1467810672,0,is upset that he can't update his Facebook by texting it... and might cry as a result ...,upset cant update facebook texting might cry result school today also blah
1467810917,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,dived many times ball managed save rest go bounds
1467811184,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
1467811193,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't...",behaving im mad cant see


In [137]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [138]:
tokenizer = Tokenizer(100000)
tokenizer.fit_on_texts(all_data.cleaned_tweet)

In [134]:
sequences = tokenizer.texts_to_sequences(all_data.cleaned_tweet)
sequences

[[352, 41, 1074, 3287, 13, 738, 9440, 1703, 3],
 [653, 12, 426, 435, 1863, 188, 410, 2142, 72, 8, 168, 1048],
 [63841, 208, 244, 1193, 1480, 761, 356, 6, 21700],
 [315, 687, 364, 2706, 5, 1008],
 [10015, 1, 473, 12, 21],
 [315, 2016],
 [33, 779],
 [76, 86, 14, 21, 78, 2349, 154, 154, 16, 1, 411, 28, 665],
 [673, 48],
 [2297],
 [1494, 412, 2910, 498, 6163],
 [24939, 1678],
 [297, 1680, 96, 151, 20434, 1431, 5043],
 [4372, 715, 358, 81, 228, 295],
 [2060, 87, 48, 3666, 20, 60, 3993, 81028],
 [46, 13, 96, 32, 3165],
 [42494, 929, 1962, 365, 9922, 96, 861, 30012, 13315, 513],
 [2451, 6443],
 [585, 62, 104, 251, 21, 2375, 10, 3594],
 [30, 504, 622, 2477, 1681, 1334],
 [3, 48, 4, 31, 101],
 [18, 161, 429, 880, 346, 3570, 3424, 8319, 62, 14, 549],
 [3147, 720, 63842],
 [69, 9, 3831],
 [36070, 323, 38],
 [73, 225, 418, 71],
 [9, 410, 49, 64, 3204],
 [1, 44],
 [2967, 16, 8684, 98, 100, 8684, 100, 4, 473],
 [1821, 243, 3174, 6923, 1196, 419, 1159, 294, 14],
 [3281, 630, 1902, 56, 24, 18],
 [25, 

In [135]:
max(max(sequences))

99993

In [140]:
word_num = []
for seq in sequences:
    for token in seq:
        word_num.append(token)
words = set(word_num)

In [141]:
len(words)

99999

In [142]:
VOCAB_SIZE = len(words) + 1

In [143]:
MAX_LEN = 30
paded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")
paded_sequences

array([[  352,    41,  1074, ...,     0,     0,     0],
       [  653,    12,   426, ...,     0,     0,     0],
       [63841,   208,   244, ...,     0,     0,     0],
       ...,
       [  706,    15,   288, ...,     0,     0,     0],
       [ 1019,  2127,    21, ...,     0,     0,     0],
       [  360,   304,   166, ...,     0,     0,     0]], dtype=int32)

In [125]:
all_data["paded_sequences"] = list(paded_sequences)

In [127]:
from keras.utils import to_categorical
labels = to_categorical(all_data.sentiment, num_classes=3)
labels

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

In [144]:
from sklearn.model_selection import train_test_split
partial_x_train, x_val, partial_y_train, y_val = train_test_split(paded_sequences, labels, test_size=0.3)

In [157]:
BATCH_SIZE = 128
NUM_EPOCHS = 5

In [158]:
from keras.layers import SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(VOCAB_SIZE, embed_dim, input_length=MAX_LEN))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 30, 128)           12800000  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 30, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 591       
Total params: 13,055,391
Trainable params: 13,055,391
Non-trainable params: 0
_________________________________________________________________
None


In [159]:
model.fit(x=partial_x_train, y=partial_y_train, 
          validation_data=(x_val, y_val),
          batch_size=32, 
          epochs=NUM_EPOCHS, 
          verbose=1)

Train on 1120348 samples, validate on 480150 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x206295fd0>