In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.simplefilter("ignore", UserWarning)
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

from keras.models import Model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Dense, Embedding, MaxPooling1D
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.utils.vis_utils import plot_model

Using TensorFlow backend.


In [2]:
data = pd.read_csv('data/twitter_sentiment_dataset.csv', encoding='latin1', usecols=['Sentiment', 'SentimentText'])
data.columns = ['sentiment', 'text']

In [3]:
data.head()

Unnamed: 0,sentiment,text
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [4]:
print(data.shape)

(1578614, 2)


In [5]:
def clean_text(tweet):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

In [6]:
data['text'] = data['text'].map(clean_text)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], 
                                                    data['sentiment'], 
                                                    test_size=0.1, 
                                                    random_state=42,
                                                    stratify=data['sentiment'])

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1420752,) (157862,) (1420752,) (157862,)


In [8]:
MAX_WORDS = 100000
tokenizer = Tokenizer(num_words=MAX_WORDS)

tokenizer.fit_on_texts(data['text'])

In [9]:
X_train[15]

'lt This is the way i feel right now'

In [10]:
tokenizer.texts_to_sequences([X_train[15]])

[[159, 28, 9, 3, 131, 1, 110, 117, 29]]

In [11]:
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

In [12]:
MAX_LENGTH = 35
padded_train_sequences = pad_sequences(train_sequences, maxlen=MAX_LENGTH)
padded_test_sequences = pad_sequences(test_sequences, maxlen=MAX_LENGTH)

In [13]:
padded_train_sequences

array([[    0,     0,     0, ...,   162,   356,   224],
       [    0,     0,     0, ...,   879,  1656,   661],
       [    0,     0,     0, ...,     0,   153,  6543],
       ...,
       [    0,     0,     0, ...,  1504,  1469, 26172],
       [    0,     0,     0, ...,    55,    94,   433],
       [    0,     0,     0, ...,   193,    13,     6]], dtype=int32)

In [14]:
padded_train_sequences.shape

(1420752, 35)

In [15]:
def get_simple_rnn_model():
    embedding_dim = 300
    embedding_matrix = np.random.random((MAX_WORDS, embedding_dim))
    
    inp = Input(shape=(MAX_LENGTH, ))
    x = Embedding(input_dim=MAX_WORDS, output_dim=embedding_dim, input_length=MAX_LENGTH, 
                  weights=[embedding_matrix], trainable=True)(inp)
    x = SpatialDropout1D(rate=0.3)(x)
    x = Bidirectional(GRU(units=100, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(1, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

rnn_simple_model = get_simple_rnn_model()

In [16]:
import pydotplus as pydot

In [17]:
pydot.find_graphviz()

{'dot': '/home/paperspace/anaconda3/envs/myconda/bin/dot',
 'twopi': '/home/paperspace/anaconda3/envs/myconda/bin/twopi',
 'neato': '/home/paperspace/anaconda3/envs/myconda/bin/neato',
 'circo': '/home/paperspace/anaconda3/envs/myconda/bin/circo',
 'fdp': '/home/paperspace/anaconda3/envs/myconda/bin/fdp',
 'sfdp': '/home/paperspace/anaconda3/envs/myconda/bin/sfdp'}

In [18]:
plot_model(rnn_simple_model, to_file='rnn.png', show_shapes=True, show_layer_names=True)

![rnn.png](rnn.png)

In [19]:
filepath="./models/weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

batch_size = 256
epochs = 2

history = rnn_simple_model.fit(x=padded_train_sequences, 
                    y=y_train, 
                    validation_data=(padded_test_sequences, y_test), 
                    batch_size=batch_size, 
                    callbacks=[checkpoint], 
                    epochs=epochs, 
                    verbose=1)

Train on 1420752 samples, validate on 157862 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.81921, saving model to ./models/weights-improvement-01-0.8192.hdf5
Epoch 2/2

Epoch 00002: val_acc improved from 0.81921 to 0.82729, saving model to ./models/weights-improvement-02-0.8273.hdf5


In [20]:
best_rnn_simple_model = load_model('./models/weights-improvement-02-{:0.4f}.hdf5'.format(checkpoint.best))

y_pred_rnn_simple = best_rnn_simple_model.predict(padded_test_sequences, verbose=1, batch_size=2048)

y_pred_rnn_simple = pd.DataFrame(y_pred_rnn_simple, columns=['prediction'])
y_pred_rnn_simple['prediction'] = y_pred_rnn_simple['prediction'].map(lambda p: 1 if p >= 0.5 else 0)



In [21]:
def printClassificationErrors(y_test, y_pred):
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('ROC AUC score: {}'.format(roc_auc_score(y_test, y_pred)))
    print('Accuracy Score: {}'.format(accuracy_score(y_test, y_pred)))

In [22]:
printClassificationErrors(y_test, y_pred_rnn_simple['prediction'])

Confusion Matrix:
[[65068 13776]
 [13488 65530]]
Classification Report:
             precision    recall  f1-score   support

          0       0.83      0.83      0.83     78844
          1       0.83      0.83      0.83     79018

avg / total       0.83      0.83      0.83    157862

ROC AUC score: 0.8272899712059488
Accuracy Score: 0.8272921919144569


In [23]:
from Modules.tweepy_streaming import saveTweepyTweets
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import config

In [33]:
tweepy_listener = saveTweepyTweets(time_limit=180, 
                                   num_of_tweets=20, 
                                   save_file='twitter_stream_data.json', 
                                   retweets=False)
auth = OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_TOKEN_SECRET)
stream = Stream(auth=auth, listener=tweepy_listener)

In [34]:
stream.filter(track=['anime'], languages=['en'])

Getting tweet #1...
Getting tweet #2...
Getting tweet #3...
Getting tweet #4...
Getting tweet #5...
Getting tweet #6...
Getting tweet #7...
Getting tweet #8...
Getting tweet #9...
Getting tweet #10...
Getting tweet #11...
Getting tweet #12...
Getting tweet #13...
Getting tweet #14...
Getting tweet #15...
Getting tweet #16...
Getting tweet #17...
Getting tweet #18...
Getting tweet #19...
Getting tweet #20...
Completed collection of tweets.


In [35]:
def tweetsToDataFrame(json_file):
    data = []
    with open(json_file, 'r') as json_data:
        for line in json_data:
            tweet = json.loads(line) # load it as Python dict
            data.append(tweet)
    return pd.DataFrame(data)

tweet_df = tweetsToDataFrame('twitter_stream_data.json')
tweet_df.head()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,extended_tweet,favorite_count,favorited,filter_level,...,quoted_status_id_str,quoted_status_permalink,reply_count,retweet_count,retweeted,source,text,timestamp_ms,truncated,user
0,,,Sun Jun 24 12:40:34 +0000 2018,"[0, 23]","{'hashtags': [], 'urls': [{'url': 'https://t.c...",,,0,False,low,...,1.010545086218883e+18,"{'url': 'https://t.co/uqoW5Yi3E0', 'expanded':...",0,0,False,"<a href=""http://twitter.com/download/iphone"" r...","an art, an intellectual https://t.co/uqoW5Yi3E0",1529844034433,False,"{'id': 2375267126, 'id_str': '2375267126', 'na..."
1,,,Sun Jun 24 12:40:48 +0000 2018,"[14, 140]","{'hashtags': [], 'urls': [{'url': 'https://t.c...",,{'full_text': '@ghostiesquid The statement as ...,0,False,low,...,,,0,0,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",@ghostiesquid The statement as a whole is sema...,1529844048379,True,"{'id': 591442372, 'id_str': '591442372', 'name..."
2,,,Sun Jun 24 12:40:53 +0000 2018,"[15, 140]","{'hashtags': [], 'urls': [{'url': 'https://t.c...",,{'full_text': '@dear_hoya1991 yes i did too wh...,0,False,low,...,,,0,0,False,"<a href=""http://twitter.com/download/android"" ...",@dear_hoya1991 yes i did too when i first star...,1529844053032,True,"{'id': 966739678220181505, 'id_str': '96673967..."
3,,,Sun Jun 24 12:40:58 +0000 2018,,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",,,0,False,low,...,,,0,0,False,"<a href=""https://www.google.com/"" rel=""nofollo...",I added a video to a @YouTube playlist https:/...,1529844058224,False,"{'id': 134038538, 'id_str': '134038538', 'name..."
4,,,Sun Jun 24 12:40:58 +0000 2018,"[0, 24]","{'hashtags': [], 'urls': [{'url': 'https://t.c...",,,0,False,low,...,1.0106460156938036e+18,"{'url': 'https://t.co/e8MnnplVAZ', 'expanded':...",0,0,False,"<a href=""http://twitter.com/download/android"" ...",Top 10 Anime Plot Twists https://t.co/e8MnnplVAZ,1529844058836,False,"{'id': 947280390229594113, 'id_str': '94728039..."


In [36]:
tweet_df['text'] = tweet_df['text'].map(clean_text)
tweet_df = tweet_df[['text']]

In [37]:
tweet_df.head()

Unnamed: 0,text
0,an art an intellectual
1,The statement as a whole is semantically null ...
2,hoya1991 yes i did too when i first started th...
3,I added a video to a playlist Best Underrated ...
4,Top 10 Anime Plot Twists


In [38]:
tweet_sequences = tokenizer.texts_to_sequences(tweet_df['text'])
padded_tweet_sequences = pad_sequences(tweet_sequences, maxlen=MAX_LENGTH)

In [39]:
y_pred_tweet = rnn_simple_model.predict(padded_tweet_sequences, verbose=1, batch_size=2048)
y_pred_tweet = pd.DataFrame(y_pred_tweet, columns=['prediction'])
y_pred_tweet['prediction'] = y_pred_tweet['prediction'].map(lambda p: 1 if p >= 0.5 else 0)



In [40]:
tweet_df = pd.merge(tweet_df, y_pred_tweet, left_on=tweet_df.index, right_on=y_pred_tweet.index, how='outer')

In [41]:
for i in range(tweet_df.shape[0]):
    print(tweet_df.loc[i]['text'], tweet_df.loc[i]['prediction'])

an art an intellectual 1
The statement as a whole is semantically null actually There s no dial to increase or decrease the 0
hoya1991 yes i did too when i first started the anime it s actually quite common among seiyuu you don t im 1
I added a video to a playlist Best Underrated Anime Series To Watch 5 1
Top 10 Anime Plot Twists 1
LoL wut 1
this has started as a series in germany just this month 1
Sterl That guy rarely does anime He s more prominent in 0
anime list blog Cutie Honey Universe12 1
It s funny because how I like both kpop and anime but when I try to combine two things I like anime fans come to m 1
RT TO JOIN IN ANIMEBOYISM GC W is actually a cult we worship anime boys more on filos 1
i don t like this but i like this 0
The anime is so baaaaaaaaaaaad 0
Zinba Free Anime Green Screen via 1
Boys Love Sexy Anime Pokemon Gardevoir Hentai Hugging Body Pillow Case Cover 1
Changes XXXTentacion sad anime via 0
Ok sorry this emoji is cringe 0
catching up on anime is such a chore 0
Ex