<a href="https://colab.research.google.com/github/binhvd/Data-Analytics-3-Solutions/blob/main/10_Sentiment_Analysis_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup the Environment

In [1]:
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, SimpleRNN, Activation, Dropout, Conv1D
from tensorflow.keras.layers import Embedding, Flatten, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

import pandas as pd
import numpy as np
import spacy
from sklearn.metrics import classification_report

In [16]:
# Fix Colab bug: https://github.com/googlecolab/colabtools/issues/3409
import locale
locale.getpreferredencoding = lambda do_setlocale: "UTF-8"

## Exploratory Data Analysis

In [4]:
data = pd.read_csv("https://storage.googleapis.com/srh-dataset/sentiment-analysis/tweeter.csv", header=None, encoding='latin-1')
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Check for missing values
data.isnull().any()

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

## Data Preparation

In [8]:
!pip install contractions
!pip install textsearch
!pip install tqdm

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contrac

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
import contractions
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata

from nltk.stem import SnowballStemmer 
stemmer = SnowballStemmer('english')

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_stopwords_and_stemming(text, stem):
    tokens = []
    for token in text.split():
      if token not in stopwords:
        # chops off the ends of words
        if stem:
          tokens.append(stemmer.stem(token))
        else:
          tokens.append(token)
    return " ".join(tokens)   

def pre_process_corpus(docs, stem = False):
    norm_docs = []
    # tqdm to display a progess bar while looping
    for doc in tqdm.tqdm(docs):
        # remove HTML tags
        doc = strip_html_tags(doc)

        # convert tab, new lines to empty spaces    
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))

        # remove URL
        doc = re.sub(r'http\S+', '', doc)

        # lowercase
        doc = doc.lower()

        # remove accented chars
        doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        # expand shortened words, e.g. don't to do not
        doc = contractions.fix(doc)

        # remove @username
        doc = re.sub('@([A-Za-z0-9_]+)', ' ', doc)

        # Replace all non alphabets.
        doc = re.sub('[^a-zA-Z]', ' ', doc)

        # Single character removal
        #doc = re.sub(r"\s+[a-zA-Z]\s+", ' ', doc)
        
        # remove white spaces
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()  

        # remove stop words and apply stemming
        doc = remove_stopwords_and_stemming(doc, stem)

        norm_docs.append(doc)        
    return norm_docs

In [10]:
data_X = pre_process_corpus(data[5])

  soup = BeautifulSoup(text, "html.parser")
100%|██████████| 20000/20000 [00:05<00:00, 3929.55it/s]


In [11]:
print(data[5][0])
print(data_X[0])

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
awww bummer shoulda got david carr third day


### Preparing Data

We only care about the tweet text and tweet sentiment information, which stored in the 5th column and 0th column in the dataset. In the sentiment column, 0 represents negative, and 1 represents positive.

We organize the data as data_Xcontains all the tweet text, data_y contains the labels.

The following code will convert the tweet text data_X to sequence format that will be feed into RNNs

In [12]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import gensim.downloader as api

# Load the twitter embeddings model. This model is trained on 2 billion tweets, which contains 27 billion tokens, 1.2 million vocabs.
# might take a while
glove_model = api.load("glove-twitter-200")



In [17]:
data_X = data[5]
print(data_X)

0        @switchfoot http://twitpic.com/2y1zl - Awww, t...
1        is upset that he can't update his Facebook by ...
2        @Kenichan I dived many times for the ball. Man...
3          my whole body feels itchy and like its on fire 
4        @nationwideclass no, it's not behaving at all....
                               ...                        
19995    Just woke up. Having no school is the best fee...
19996    TheWDB.com - Very cool to hear old Walt interv...
19997    Are you ready for your MoJo Makeover? Ask me f...
19998    Happy 38th Birthday to my boo of alll time!!! ...
19999    happy #charitytuesday @theNSPCC @SparksCharity...
Name: 5, Length: 20000, dtype: object


In [18]:
data_y = pd.get_dummies(data[0]).to_numpy()
print(data_y)

[[1 0]
 [1 0]
 [1 0]
 ...
 [0 1]
 [0 1]
 [0 1]]


## Splitting Data for training

In [19]:
from sklearn.model_selection import train_test_split
train_X, valid_X, train_y, valid_y = train_test_split(data_X, data_y, test_size = 0.2, random_state=42)

## Tokenization

In [20]:
max_vocab = 18000
max_len = 15
tokenizer = Tokenizer(num_words=max_vocab)

In [21]:
tokenizer.fit_on_texts(train_X)

train_X = tokenizer.texts_to_sequences(train_X)
valid_X = tokenizer.texts_to_sequences(valid_X)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 26000 unique tokens.


In [22]:
train_X = pad_sequences(train_X, maxlen=max_len, padding="post")
valid_X = pad_sequences(valid_X, maxlen=max_len, padding="post")

train_X.shape

(16000, 15)

In [23]:
train_X

array([[  47, 4062,   33, ...,  687, 2036,  337],
       [   3,  197, 8118, ...,  780,  130,   36],
       [8119,  126,  108, ...,  688,  108,   96],
       ...,
       [   5, 1052,  239, ...,    9,    0,    0],
       [ 814,   31,   13, ...,    0,    0,    0],
       [   6,    1,  827, ...,    0,    0,    0]], dtype=int32)

### Preparing Word Embeddings using the GloVe Model

In [24]:
EMBED_SIZE = 200

In [25]:
# calcultaete number of words
nb_words = len(word_index) + 1
print('All words: ', nb_words)

# obtain the word embedding matrix
embedding_matrix = np.zeros((nb_words, EMBED_SIZE))
for word, i in word_index.items():
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

All words:  26001
Null word embeddings: 10327


**Explanation of the steps performed till now**

Tweets: Is upset that he can't update his Facebook..

Expected Input to RNN model - 
Is - Embeddings [200] (32)

upset - Embeddings [200] (450)

that - Embeddings [200] (43)

he - Embeddings [200] (56)

1. Vocabulary of all tweets: 30257 unique tokens
2. Unique token IDs: ID (1, 2, 3, 4... for all the 30257 tokens)
3. Tweets represented as the sequence of IDs [32 450 43 56 ...]

Padding: 
"Commonly in RNN's, we take the final output or hidden state and use this to make a prediction (or do whatever task we are trying to do).
If we send a bunch of 0's to the RNN before taking the final output (i.e. 'post' padding as you describe), then the hidden state of the network at the final word in the sentence would likely get 'flushed out' to some extent by all the zero inputs that come after this word.
So intuitively, this might be why pre-padding is more popular/effective." - [link](https://stackoverflow.com/questions/46298793/how-does-choosing-between-pre-and-post-zero-padding-of-sequences-impact-results)

Padding for RNNs - [Link](https://datascience.stackexchange.com/questions/49168/padding-sequences-for-neural-sequence-models-rnns)

[Paper](https://arxiv.org/abs/1903.07288)





### Build RNN Models

In [26]:
# adopted from sent_tran_eval.py
def build_model(nb_words, rnn_model="SimpleRNN", embedding_matrix=None):
    '''
    build_model function:
    inputs: 
        rnn_model - which type of RNN layer to use, choose in (SimpleRNN, LSTM, GRU)
        embedding_matrix - whether to use pretrained embeddings or not
    '''
    model = Sequential()
    # add an embedding layer
    if embedding_matrix is not None:
        model.add(Embedding(nb_words, 
                        EMBED_SIZE, 
                        weights=[embedding_matrix], 
                        input_length= max_len,
                        trainable = False))
    else:
        model.add(Embedding(nb_words, 
                        EMBED_SIZE, 
                        input_length= max_len,
                        trainable = True))
        
    # add an RNN layer according to rnn_model
    if rnn_model == "SimpleRNN":
        model.add(SimpleRNN(EMBED_SIZE))
    elif rnn_model == "LSTM":
        model.add(LSTM(EMBED_SIZE))
    else:
        model.add(GRU(EMBED_SIZE))
    # model.add(Dense(500,activation='relu'))
    # model.add(Dense(500, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', 
                optimizer='adam',
                metrics=['accuracy'])
    return model

### Training and Evaluation


Train and evaluate the SimpleRNN, LSTM, and GRU networks on our prepared dataset.

We are using the pre-trained word embeddings from the glove.twitter.27B.200d.txt data. Using the pre-trained word embeddings as weights for the Embedding layer leads to better results and faster convergence.

We set each models to run 20 epochs, but we also set EarlyStopping rules to prevent overfitting. The results of the SimpleRNN, LSTM, GRU models can be seen below.

In [27]:
embedding_matrix.shape

(26001, 200)

In [28]:
model_rnn = None
model_rnn = build_model(nb_words, "SimpleRNN", embedding_matrix)
model_rnn.fit(train_X, train_y, epochs=20, batch_size=120,
          validation_data=(valid_X, valid_y), callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3))

predictions = model_rnn.predict(valid_X)
predictions = predictions.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), predictions))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
              precision    recall  f1-score   support

           0       0.68      0.85      0.75      2018
           1       0.79      0.59      0.67      1982

    accuracy                           0.72      4000
   macro avg       0.74      0.72      0.71      4000
weighted avg       0.74      0.72      0.71      4000



## In-Class Assignment

### Try training the RNNs without the pre-trained word embeddings and compare the results with the pre-trained model.

1. Does word embeddings impact the accuracy?
2. If we tweak the units in RNN does it impact the accuracy?
3. How data pre-processing impact the accuracy? (Pre and post padding)

## LSTM and GRUs

In [29]:
model_lstm = build_model(nb_words, "LSTM", embedding_matrix)
model_lstm.fit(train_X, train_y, epochs=20, batch_size=120,
          validation_data=(valid_X, valid_y), callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3))

predictions = model_lstm.predict(valid_X)
predictions = predictions.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), predictions))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
              precision    recall  f1-score   support

           0       0.80      0.69      0.74      2018
           1       0.73      0.82      0.77      1982

    accuracy                           0.76      4000
   macro avg       0.76      0.76      0.76      4000
weighted avg       0.76      0.76      0.76      4000



In [30]:
model_gru = build_model(nb_words, "GRU", embedding_matrix)
model_gru.fit(train_X, train_y, epochs=20, batch_size=120,
          validation_data=(valid_X, valid_y), callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3))

predictions = model_gru.predict(valid_X)
predictions = predictions.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), predictions))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      2018
           1       0.75      0.76      0.76      1982

    accuracy                           0.76      4000
   macro avg       0.76      0.76      0.76      4000
weighted avg       0.76      0.76      0.76      4000

