In [59]:
import pandas as pd
import numpy as np
import string

# Fix the random seed for reproducibility.
# NOTE(review): this seeds only NumPy's global RNG. No other library's random
# state (e.g. the TensorFlow model trained further down) is seeded by this
# cell, so training results may still vary between runs -- confirm if needed.
seed = 42
np.random.seed(seed)

In [60]:
# Read the bz2-compressed movie-review CSV directly from GitHub into a DataFrame.
dataset = pd.read_csv(r'https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/Unit%2011%20-%20Sentiment%20Analysis%20-%20Unsupervised%20Learning/movie_reviews.csv.bz2', compression='bz2')
# Summarise columns, dtypes and non-null counts to sanity-check the load.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [62]:
# Preview the first rows: free-text `review` and its string `sentiment` label.
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [63]:
# Build the train and test datasets: the first 35000 rows are used for
# training and every remaining row is held out for testing (no shuffling).
reviews = dataset['review'].values
sentiments = dataset['sentiment'].values

train_reviews, test_reviews = reviews[:35000], reviews[35000:]
train_sentiments, test_sentiments = sentiments[:35000], sentiments[35000:]

In [64]:
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re
import tqdm
import numpy as np
from stop_words import get_stop_words

import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata
import gensim

In [65]:
def stripHTMLtags(text):
    """Return the text content of an HTML fragment with line breaks collapsed.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        ``text`` with all markup removed (via BeautifulSoup's ``get_text``);
        every run of carriage-return / line-feed characters is collapsed into
        a single ``"\\n"``.
    """
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    # Only CR and LF belong in the class: inside [...] a `|` is a literal
    # pipe (not alternation), so listing it would turn "|" into newlines.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

def strip_email(text):
    """Remove e-mail addresses from ``text``.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every ``local@domain.tld``-shaped substring deleted.
        Surrounding whitespace is left untouched, so a removed address may
        leave two adjacent spaces behind.
    """
    # Raw string: a plain "..." literal containing `\.` is an invalid escape
    # sequence that newer Python versions warn about.
    return re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "", text)
        
def strip_digits(text):
    """Drop every whitespace-delimited token that contains at least one digit.

    The input is split on arbitrary whitespace and the surviving tokens are
    re-joined with single spaces, so whitespace runs are collapsed as well.
    """
    kept_tokens = []
    for token in text.split():
        contains_digit = False
        for ch in token:
            if ch.isdigit():
                contains_digit = True
                break
        if not contains_digit:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

def strip_html_tags(text):
  """Return the text content of an HTML fragment, ignoring embedded scripts.

  Parameters
  ----------
  text : str
      Raw text that may contain HTML markup.

  Returns
  -------
  str
      ``text`` with ``<iframe>`` and ``<script>`` elements removed from the
      parsed tree, all remaining markup stripped, and every run of
      carriage-return / line-feed characters collapsed into one ``"\\n"``.
  """
  soup = BeautifulSoup(text, "html.parser")
  # Plain loop: extract() is called purely for its side effect on the tree.
  for element in soup(['iframe', 'script']):
    element.extract()
  stripped_text = soup.get_text()
  # Only CR and LF belong in the class: inside [...] a `|` is a literal
  # pipe (not alternation), so listing it would turn "|" into newlines.
  stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
  return stripped_text

def remove_accented_chars(text):
  """Strip accents by decomposing characters and discarding non-ASCII parts.

  The string is NFKD-normalised, encoded to ASCII while ignoring anything that
  cannot be represented, and decoded back to ``str``.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

def strip_common_words(text, stemming=False):
    """Remove English stop words from a document.

    Parameters
    ----------
    text : str or iterable of str
        Either a raw string (split on whitespace) or an already tokenised
        sequence of words.
    stemming : bool, default False
        Set to True when the tokens in ``text`` have already been stemmed.
        The stop-word list is then stemmed as well (word by word, with
        gensim's Porter stemmer) so that it can match those tokens.

    Returns
    -------
    list of str
        The tokens of ``text`` that are not stop words, in their original
        order. Matching is case-sensitive.
    """
    # Accept both a string and a token list: pre_process_corpus hands over a
    # list, on which str.split() is not available.
    tokens = text.split() if isinstance(text, str) else list(text)

    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = set(get_stop_words("en"))
    if stemming:
        # Local import keeps the default (non-stemming) path dependency-free.
        from gensim.parsing.porter import PorterStemmer
        stemmer = PorterStemmer()
        # Stem once, up front, instead of once per token of the document.
        stop_words = {stemmer.stem(word) for word in stop_words}

    return [word for word in tokens if word not in stop_words]

In [87]:
def pre_process_corpus(docs, stemming = False, rm_common = False):
    """Normalise a collection of raw review strings.

    Each document goes through, in order: HTML stripping, e-mail removal,
    newline/tab flattening, accent removal, contraction expansion,
    punctuation-to-space replacement, removal of tokens containing digits,
    removal of any remaining non-alphanumeric characters, whitespace
    collapsing and lower-casing (tokens that are entirely upper-case are
    kept as they are).

    Parameters
    ----------
    docs : iterable of str
        Raw documents; iterated once, wrapped in a tqdm progress bar.
    stemming : bool, default False
        If True, every token is stemmed with gensim's Porter stemmer.
    rm_common : bool, default False
        If True, English stop words are removed via ``strip_common_words``.
        Removal happens before stemming so tokens are compared against the
        unstemmed stop-word list.

    Returns
    -------
    list of str
        One normalised, single-space-joined string per input document.
    """
    # Translation tables are loop-invariant, so build them once.
    whitespace_table = str.maketrans("\n\t\r", "   ")
    punctuation_table = str.maketrans(string.punctuation,
                                      " " * len(string.punctuation))

    stemmer = None
    if stemming:
        # Local import keeps the default (non-stemming) path lightweight.
        from gensim.parsing.porter import PorterStemmer
        stemmer = PorterStemmer()

    norm_docs = []
    for doc in tqdm.tqdm(docs):
        doc = stripHTMLtags(doc)
        doc = strip_email(doc)
        doc = doc.translate(whitespace_table)
        doc = remove_accented_chars(doc)
        doc = contractions.fix(doc)
        doc = doc.translate(punctuation_table)
        doc = strip_digits(doc)
        # Remove leftover special characters. The flags must be passed by
        # keyword: the fourth positional argument of re.sub is `count`, so
        # passing re.I|re.A positionally silently capped the substitutions.
        doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
        doc = re.sub(' +', ' ', doc).strip()
        text = gensim.utils.to_unicode(doc).split()
        # Lower-case everything except fully upper-case tokens.
        text = [word if word.isupper() else word.lower() for word in text]

        if rm_common:
            # Pass a string: that form is accepted by strip_common_words
            # regardless of whether it also supports token lists.
            text = strip_common_words(" ".join(text))

        if stemming:
            text = [stemmer.stem(word) for word in text]

        norm_docs.append(" ".join(text))

    return norm_docs

In [88]:
# Normalise both splits with the default options (no stemming, no stop-word
# removal). In the recorded run this took about 3.5 minutes for the training
# split and about 1.5 minutes for the test split.
norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|██████████| 35000/35000 [03:27<00:00, 168.71it/s]
100%|██████████| 15000/15000 [01:24<00:00, 177.70it/s]


In [92]:
import tensorflow as tf

# Keras tokenizer with default settings; it builds the word -> integer index.
t = tf.keras.preprocessing.text.Tokenizer()
# fit the tokenizer on the documents (training split only, so the vocabulary
# contains no information from the test reviews)
t.fit_on_texts(norm_train_reviews)
# Register an explicit '<PAD>' entry at index 0 so that len(t.word_index)
# also counts that index. NOTE(review): per the Keras docs the tokenizer never
# assigns 0 to a real word and pad_sequences fills with 0 by default -- confirm
# for the installed version.
t.word_index['<PAD>'] = 0

In [95]:
# Convert every normalised review into a list of integer word indices using
# the vocabulary fitted on the training split.
# NOTE(review): no oov_token was configured, so test words missing from the
# vocabulary are presumably dropped -- verify against the Tokenizer docs.
train_sequences = t.texts_to_sequences(norm_train_reviews)
test_sequences = t.texts_to_sequences(norm_test_reviews)


In [96]:
# Report the vocabulary size (this count includes the manually added '<PAD>'
# entry) and the number of documents the tokenizer was fitted on.
print("Vocabulary size={}".format(len(t.word_index)))
print("Number of Documents={}".format(t.document_count))

Vocabulary size=87622
Number of Documents=35000


In [97]:
# Fixed number of tokens per review: used as `maxlen` when padding the
# sequences and as `input_length` of the model's Embedding layer.
MAX_SEQUENCE_LENGTH = 1000

In [98]:
# pad dataset to a maximum review length in words
X_train = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_test = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_train.shape, X_test.shape

((35000, 1000), (15000, 1000))

In [99]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the string sentiment labels into integer class ids.
le = LabelEncoder()
# NOTE(review): num_classes is not referenced by any other visible cell.
num_classes=2 # positive -> 1, negative -> 0

In [100]:
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to the test labels.
y_train = le.fit_transform(train_sentiments)
y_test = le.transform(test_sentiments)

In [101]:
# Number of entries in the tokenizer's word index, including the manually
# added '<PAD>' entry at index 0; used as the Embedding layer's input_dim.
VOCAB_SIZE = len(t.word_index)

In [116]:
EMBEDDING_DIM = 300 # dimension for dense embeddings for each token
# NOTE(review): LSTM_DIM is not used by the model below -- the LSTM layer is
# built with 20 units. Kept only so that other cells referencing it still run.
LSTM_DIM = 128

# Architecture: token embeddings -> 1-D convolution over neighbouring tokens
# -> max-pooling that halves the sequence length -> LSTM -> sigmoid output
# giving the probability of the positive class.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE,
                              output_dim=EMBEDDING_DIM,
                              input_length=MAX_SEQUENCE_LENGTH),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.LSTM(units=20, dropout=0.5, recurrent_dropout=0.5),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 1000, 300)         26286600  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1000, 128)         115328    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 500, 128)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 20)                11920     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 21        
Total params: 26,413,869
Trainable params: 26,413,869
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
EPOCHS=10
BATCH_SIZE=128

# callbacks
# Stop once val_loss has not improved for 2 consecutive epochs and roll the
# model back to the best epoch's weights; without restore_best_weights the
# model evaluated afterwards would keep the weights of the last (worse) epoch.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                      patience=2,
                                      restore_best_weights=True,
                                      verbose=1)

# Fit the model; 10% of the training data is held out for validation.
# The History object is kept so the per-epoch metrics can be inspected later.
history = model.fit(X_train, y_train,
                    validation_split=0.1,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    callbacks=[es],
                    verbose=1)

Train on 31500 samples, validate on 3500 samples
Epoch 1/10

In [None]:
# Evaluate on the held-out test split. Given the compile() call above,
# scores[0] is the binary cross-entropy loss and scores[1] the accuracy.
scores = model.evaluate(X_test, y_test, verbose=1)
# Report accuracy as a percentage with two decimals.
print("Accuracy: %.2f%%" % (scores[1]*100))