In [None]:
# Path of the Yelp reviews CSV that is loaded into `df` below.
file_review = "../input/yelp-reviews/yelp_reviews_100_thousand.csv"

In [None]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Layer
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import seaborn as sns
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log: emit "<time> : <level> : <message>" records at INFO level and above
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Load the reviews CSV into the working DataFrame and summarize its columns.
df = pd.read_csv(file_review)
df.info()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# Review example: read the first row directly instead of converting the whole
# column to a Python list just to take its first element.
df.text.iloc[0]

In [None]:
df.business_id.value_counts() # Number of reviews per business_id

In [None]:
# Collapse the 1-5 star ratings into three sentiment classes:
#   1-2 stars -> 0, 3 stars -> 1, 4-5 stars -> 2.
# The column is rewritten in place and in this order, so running the cell a
# second time on already-remapped data would remap the new class ids again.
for old_value, new_value in ((1, 0), (2, 0), (3, 1), (4, 2), (5, 2)):
    df.loc[df['stars'] == old_value, 'stars'] = new_value

In [None]:
# Number of reviews in each sentiment class after the remapping
df.stars.value_counts()

In [None]:
# Bar chart of the class sizes, largest class first.
rating_counts = df.stars.value_counts().sort_values(ascending=False)
rating_counts.plot(kind='bar', title='Number of reviews with each rating')

# Preprocessing

Cleaning up the review

In [None]:
!pip install contractions

In [None]:
!pip install inflect

In [None]:
# Importing required libraries
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

# Denoising step applied to the raw review text before tokenization
def denoise_text(text):
    """Strip HTML markup and expand contractions in `text`.

    The text is parsed with BeautifulSoup's "html.parser" and reduced to its
    text content (tags such as <html> or <p> are dropped); the result is then
    passed through contractions.fix (e.g. didn't -> did not).
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    return contractions.fix(plain_text)

# Check the function on a sample that contains HTML tags and contractions
sample_text = "<p>he didn't say anything </br> about what's gonna <html> happen in the climax"
denoise_text(sample_text)

In [None]:
# Fetch the NLTK resources used by the helpers in this notebook
# (stopword list, WordNet for the lemmatizer, punkt for word_tokenize).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# Text normalization includes many steps.
# Each function below serves a step.
def remove_non_ascii(words):
    """Drop non-ASCII characters from every token of a tokenized list.

    Each token is NFKD-normalized first, so an accented letter keeps its ASCII
    base letter. A token made up only of non-ASCII characters becomes an empty
    string; it is not removed from the list.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]
def to_lowercase(words):
    """Return a new list with every token lower-cased."""
    return [token.lower() for token in words]
def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is
    removed, so underscores and digits are kept.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']
def replace_numbers(words):
    """Spell out every all-digit token (as judged by str.isdigit) in words.

    Tokens that are not purely digits are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]
def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: iterable of tokens; comparison is case-sensitive, so tokens are
            expected to be lower-cased already.

    Returns:
        A new list with the tokens that are not in NLTK's English stop-word list.
    """
    # Build the stop-word set once per call. The previous version evaluated
    # stopwords.words('english') for every single token and searched the
    # resulting list linearly, which dominated the cleaning time.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]
def stem_words(words):
    """Return the Lancaster stem of every token, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]
def lemmatize_verbs(words):
    """Lemmatize every token as a verb (pos='v') with the WordNet lemmatizer."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]
def normalize_text(words):
    """Run the normalization steps over a list of tokens, in a fixed order.

    Order: ASCII folding, lower-casing, punctuation removal, number spelling,
    stop-word removal, verb lemmatization. stem_words is not applied.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
        lemmatize_verbs,
    )
    for step in pipeline:
        words = step(words)
    return words

# Testing the functions: print the output of each normalization helper on a
# small hand-written input, then of the full normalize_text pipeline.
print("remove_non_ascii results: ", remove_non_ascii(['h', 'ॐ', '©', '1']))
print("to_lowercase results: ", to_lowercase(['HELLO', 'hiDDen', 'wanT', 'GOING']))
print("remove_punctuation results: ", remove_punctuation(['hello!!', 'how?', 'done,']))
print("replace_numbers results: ", replace_numbers(['1', '2', '3']))
print("remove_stopwords results: ", remove_stopwords(['this', 'and', 'amazing']))
print("stem_words results: ", stem_words(['beautiful', 'flying', 'waited']))
print("lemmatize_verbs results: ", lemmatize_verbs(['hidden', 'walking', 'ran']))
print("normalize_text results: ", normalize_text(['hidden', 'in', 'the', 'CAVES', 'he', 'WAited', '2', 'ॐ', 'hours!!']))

In [None]:
# Tokenize review into words
def tokenize(text):
    """Split `text` into word tokens with NLTK's word tokenizer."""
    return word_tokenize(text)
# Check the function on a short sample sentence
sample_text = 'he did not say anything  about what is going to  happen'
print("tokenize results :", tokenize(sample_text))

In [None]:
def text_prepare(text):  # This code takes very long to run
    """Clean one review: denoise, tokenize, normalize, then re-join with spaces."""
    tokens = tokenize(denoise_text(text))
    return ' '.join(normalize_text(tokens))
# Clean every review; the full pipeline runs once per row.
df['text'] = list(map(text_prepare, df['text']))
# Encode the sentiment classes as integers; `le` keeps the fitted mapping.
le = LabelEncoder()
df['stars'] = le.fit_transform(df['stars'])
df.head()

GloVe

In [None]:
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional, Layer, Input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from tensorflow.keras.regularizers import l2
import keras.backend as K
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
import logging
# NOTE(review): logging.basicConfig was already called in the first import
# cell; per the logging docs a repeated call does nothing once the root logger
# has handlers (unless force=True) - confirm whether this line is still needed.
logging.basicConfig(level=logging.INFO)

In [None]:
def prepare_model_input(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                        glove_path="../input/glove-embedding/glove.6B.50d.txt"):
    """Turn train/test texts into padded index sequences and load GloVe vectors.

    The tokenizer vocabulary is fitted on the concatenation of X_train and
    X_test, exactly as before.

    Args:
        X_train: sequence of training texts.
        X_test: sequence of test texts.
        MAX_NB_WORDS: `num_words` passed to the Keras Tokenizer.
        MAX_SEQUENCE_LENGTH: `maxlen` passed to pad_sequences; every returned
            row has this many columns.
        glove_path: text file with one "<word> <v1> <v2> ..." entry per line.
            Defaults to the 50-d GloVe file the notebook has always used.

    Returns:
        (X_train_Glove, X_test_Glove, word_index, embeddings_dict): the padded
        train and test matrices, the tokenizer's word -> index mapping, and a
        dict mapping each GloVe word to its float32 vector.
    """
    # Kept from the original: seeds NumPy's global RNG as a side effect, which
    # later np.random calls in the notebook may rely on.
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    # pickle.dump(tokenizer, open('text_tokenizer.pkl', 'wb'))
    # Uncomment above line to save the tokenizer as .pkl file
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)
    # Rows keep their input order, so the first len(X_train) rows are the
    # training texts and the remainder the test texts.
    n_train = len(X_train)
    X_train_Glove = text[:n_train]
    X_test_Glove = text[n_train:]

    embeddings_dict = {}
    # `with` closes the file even if reading fails part-way through.
    with open(glove_path, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed entry: skip it. The previous bare `except: pass`
                # fell through and stored the *previous* word's vector under
                # this word (or raised NameError on the very first line).
                continue
            embeddings_dict[values[0]] = coefs
    print('Total %s word vectors.' % len(embeddings_dict))
    return (X_train_Glove, X_test_Glove, word_index, embeddings_dict)

# Check function on two tiny corpora, with a 100-word vocabulary cap and
# sequences padded/truncated to 20 tokens.
x_train_sample = ["Lorem Ipsum is simply dummy text of the printing and typesetting industry", "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout"]
x_test_sample = ["I’m creating a macro and need some text for testing purposes", "I’m designing a document and don’t want to get bogged down in what the text actually says"]
X_train_Glove_s, X_test_Glove_s, word_index_s, embeddings_dict_s = prepare_model_input(x_train_sample, x_test_sample, 100, 20)
print("\n X_train_Glove_s \n ", X_train_Glove_s)
print("\n X_test_Glove_s \n ", X_test_Glove_s)
print("\n Word index of the word testing is : ", word_index_s["testing"])
print("\n Embedding for thw word want \n \n", embeddings_dict_s["want"])

In [None]:
def get_eval_report(labels, preds):
    """Compute a dictionary of classification metrics for labels vs. preds.

    The raveled confusion matrix is unpacked into exactly four counts
    (tn, fp, fn, tp), so this only works when the matrix is 2x2, i.e. for a
    two-class problem; any other size fails at the unpacking.

    Returns:
        dict with the keys "mcc", "true positive", "true negative",
        "false positive", "false negative", "pricision", "recall", "F1" and
        "accuracy".
    """
    mcc = matthews_corrcoef(labels, preds)
    counts = confusion_matrix(labels, preds).ravel()
    tn, fp, fn, tp = counts
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = (2 * (precision * recall)) / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return {
        "mcc": mcc,
        "true positive": tp,
        "true negative": tn,
        "false positive": fp,
        "false negative": fn,
        "pricision": precision,
        "recall": recall,
        "F1": f1,
        "accuracy": accuracy,
    }
def compute_metrics(labels, preds):
    """Check that labels and preds have the same length, then return get_eval_report's metrics."""
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)
def plot_graphs(history, string):
    """Plot one metric from `history.history` against its 'val_' counterpart.

    `string` is the key of the training curve; the validation curve is read
    from the key 'val_' + string.
    """
    curves = history.history
    plt.plot(curves[string])
    val_key = 'val_' + string
    plt.plot(curves[val_key], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

In [None]:
X = df.text
y = df.stars
# Fixed random_state so the 90/10 split (and everything derived from it) is
# the same on every run; 42 matches the seed used for the later split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)

print("Preparing model input ...")
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(X_train,X_test)
print("Done!")
print("Building Model!")

In [None]:
# len(review) is the number of characters of the cleaned review text.
print("Longest review's length is ", max(map(len, df['text'])))

In [None]:
# Length of the padded sequences the embedding layer will receive. Read from
# the padded matrix so it always agrees with what prepare_model_input produced
# (its default pads/truncates to 500 tokens); the former hard-coded 3540 did not.
MAX_SEQUENCE_LENGTH = X_train_Glove.shape[1]
# Must equal the width of the vectors in the GloVe file loaded by
# prepare_model_input (glove.6B.50d.txt, i.e. 50 per its file name). The
# former value of 200 made every word fail the size check below.
EMBEDDING_DIM = 50

# Make the embedding matrix using the embeddings_dict. Rows start out random;
# words without a GloVe vector keep their random initial values.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is None:
        continue
    if len(embedding_vector) != EMBEDDING_DIM:
        # Raise instead of calling exit(1), which would shut down the kernel.
        raise ValueError(
            "GloVe vector for %r has length %d but EMBEDDING_DIM is %d; "
            "make sure EMBEDDING_DIM matches the GloVe file."
            % (word, len(embedding_vector), EMBEDDING_DIM)
        )
    embedding_matrix[i] = embedding_vector

In [None]:
# Trainable embedding layer initialised with the GloVe matrix built above.
# NOTE(review): `embedding_layer` is assigned again in the Word2Vec section
# before any model is built from it - confirm this layer is still wanted.
embedding_layer = Embedding(len(word_index) + 1,EMBEDDING_DIM,weights=[embedding_matrix],input_length=MAX_SEQUENCE_LENGTH,trainable=True)

Preprocessing Word2Vec

In [None]:
# Same download as in the preprocessing section; repeated here for this section.
nltk.download('stopwords')

In [None]:
# DATASET
DATASET_COLUMNS = ["id", "sentiment","content"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string so that \S reaches the regex engine as written instead of being
# an invalid escape sequence inside a normal string literal. The pattern
# itself is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 30
BATCH_SIZE = 1024

In [None]:
stop_words = stopwords.words("english")
# Frozen copy for constant-time membership tests inside preprocess();
# `stop_words` itself stays a list for any other code that uses it.
_stop_word_set = frozenset(stop_words)
stemmer = SnowballStemmer("english")
def preprocess(text, stem=False):
    """Clean one text for Word2Vec/Keras tokenization.

    Args:
        text: value to clean; it is converted with str() and lower-cased.
        stem: when True, every kept token is replaced by its Snowball stem.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept_tokens = (token for token in text.split() if token not in _stop_word_set)
    if stem:
        kept_tokens = (stemmer.stem(token) for token in kept_tokens)
    return " ".join(kept_tokens)

In [None]:
# Run preprocess() over every review (no stemming) and store the result back.
df['text'] = df['text'].apply(preprocess)

In [None]:
# Preview the frame after the second cleaning pass
df.head()

In [None]:
# Seeded split of the whole frame; TRAIN_SIZE is the fraction kept for training.
df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

In [None]:
# One token list per training review (whitespace split of the cleaned text).
documents = [_content.split() for _content in df_train.text] 

In [None]:
# Untrained Word2Vec model configured from the W2V_* constants.
# NOTE(review): the `size=` keyword is the gensim 3.x spelling; gensim 4
# renamed it to `vector_size` - confirm the installed gensim version.
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)

In [None]:
# Build the Word2Vec vocabulary from the tokenized training reviews.
w2v_model.build_vocab(documents)

In [None]:
# Size of the Word2Vec vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 exposes the
# vocabulary as `wv.key_to_index` - confirm against the installed version.
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size:", vocab_size)

In [None]:
# Train the embeddings on the training reviews for W2V_EPOCH epochs.
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

In [None]:
# Sanity check: nearest neighbours of "great". Queried through the model's
# KeyedVectors (`.wv`), the same accessor the embedding-matrix cell uses;
# the model-level most_similar() is deprecated and no longer exists in gensim 4.
w2v_model.wv.most_similar("great")

In [None]:
# Fit a Keras tokenizer on the training reviews only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

# Rebinds vocab_size (previously the Word2Vec vocabulary size): tokenizer
# vocabulary plus one extra row, since tokenizer indices start at 1.
vocab_size = len(tokenizer.word_index) + 1
print("Total words: ", vocab_size)

In [None]:
# Convert reviews to index sequences padded/truncated to SEQUENCE_LENGTH.
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.text), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.text), maxlen=SEQUENCE_LENGTH)

In [None]:
# Distinct class labels present in the training split.
labels = df_train.stars.unique().tolist()
print(labels)

In [None]:
# Fit the label encoder on the training labels, then encode both splits as
# column vectors of shape (n_samples, 1).
encoder = LabelEncoder()
encoder.fit(df_train.stars.tolist())

y_train = encoder.transform(df_train.stars.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.stars.tolist()).reshape(-1, 1)

In [None]:
# Shapes of the model inputs and targets for both splits.
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

In [None]:
# One row per tokenizer index; words unknown to the Word2Vec model keep an
# all-zero row.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
  if word not in w2v_model.wv:
    continue
  embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

In [None]:
# Frozen (trainable=False) embedding layer initialised with the Word2Vec
# matrix; this rebinds `embedding_layer` and is the layer the model below uses.
embedding_layer = Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

Define model and train

In [None]:
class attention(Layer):
    """Attention pooling layer: weighted sum of the input over axis 1.

    A score per position is computed as tanh(x . W + b), turned into weights
    with a softmax, and used to sum the inputs along axis 1.
    Presumably x is (batch, timesteps, features) - the weight shapes use
    input_shape[1] and input_shape[-1]; confirm against the caller.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight W and the per-position bias b."""
        feature_dim = input_shape[-1]
        n_positions = input_shape[1]
        self.W = self.add_weight(name="att_weight", shape=(feature_dim, 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(n_positions, 1), initializer="zeros")
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of x over axis 1."""
        scores = K.tanh(K.dot(x, self.W) + self.b)
        weights = K.softmax(K.squeeze(scores, axis=-1))
        weighted_inputs = x * K.expand_dims(weights, axis=-1)
        return K.sum(weighted_inputs, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is summed away: (input_shape[0], input_shape[-1])."""
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """The layer adds no constructor arguments, so the base config is returned."""
        return super().get_config()

In [None]:
def build_cnn_lstm(nclasses,  embedding_layer):
    """Build and compile the Embedding -> Conv1D x2 -> MaxPool -> LSTM classifier.

    Args:
        nclasses: number of output classes; sets the width of the final
            softmax layer (previously ignored in favour of a hard-coded 3).
        embedding_layer: Keras Embedding layer used as the first layer.

    Returns:
        A compiled Sequential model using sparse categorical cross-entropy,
        the Adam optimizer and the accuracy metric. Targets are therefore
        expected as integer class ids.
    """
    from keras.layers import Conv1D, MaxPooling1D
    from tensorflow.keras.regularizers import l2
    # Initialize a sequential model
    model = Sequential()

    # Add embedding layer
    model.add(embedding_layer)
    model.add(Conv1D(filters=64, kernel_size=2, padding='valid', activation='relu'))
    model.add(Conv1D(filters=64, kernel_size=3, padding='valid', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(250, kernel_regularizer=l2(0.01)))
    model.add(Dense(nclasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Three-class model on top of the most recently defined `embedding_layer`.
model = build_cnn_lstm(3, embedding_layer)
model.summary()

In [None]:
# Early Stopping (currently disabled)
#es = EarlyStopping(monitor='val_loss')
# Earlier variant that trained on the GloVe inputs, kept for reference:
# history = model.fit(X_train_Glove, y_train,
#                     validation_data=(X_test_Glove,y_test),
#                     epochs=30,
#                     batch_size=128,
#                     #callbacks=[es],
#                     verbose=1)
# Train on the Word2Vec-tokenized inputs, holding out 10% of the training
# data for validation.
history = model.fit(x_train, y_train,
                    validation_split=0.1,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    #callbacks=[es],
                    verbose=1)

In [None]:
# Predict on x_test, the test matrix built with the same tokenizer and
# SEQUENCE_LENGTH as the x_train this model was fitted on (X_test_Glove comes
# from a different tokenizer and padding length).
class_probabilities = model.predict(x_test)
# argmax over the class axis replaces Sequential.predict_classes, which newer
# TensorFlow/Keras releases no longer provide.
predicted = np.argmax(class_probabilities, axis=-1)
index = 16
# Map the predicted class id back to its label with the encoder that produced
# y_train / y_test.
encoder.inverse_transform([predicted[index]])

In [None]:
# Rows 10-19 (by position) of the test texts from the first split
X_test[10:20]

Incorporating GloVe

In [None]:
# Xtrain / Xtest / Ytrain / Ytest are used from here on but are not defined by
# any earlier cell, so a fresh run stopped with a NameError. Define them with
# a seeded 90/10 split of the cleaned text and the class labels.
# NOTE(review): confirm this is the split that was originally intended.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(df.text, df.stars, test_size=0.1, random_state=42)
X_train_Glove, X_test_Glove, word_index, embeddings_dict = prepare_model_input(Xtrain,Xtest)

# Must equal the width of the vectors in the GloVe file loaded above
# (glove.6B.50d.txt, i.e. 50 per its file name).
EMBEDDING_DIM = 50
# Take the sequence length from the padded matrix so the Embedding layer's
# input_length matches the data (prepare_model_input pads to 500 by default;
# the former hard-coded 3540 did not match).
MAX_SEQUENCE_LENGTH = X_train_Glove.shape[1]

# Rows start out random; words without a GloVe vector keep their random
# initial values.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is None:
        continue
    if len(embedding_vector) != EMBEDDING_DIM:
        # Raise instead of calling exit(1), which would shut down the kernel.
        raise ValueError(
            "GloVe vector for %r has length %d but EMBEDDING_DIM is %d; "
            "make sure EMBEDDING_DIM matches the GloVe file."
            % (word, len(embedding_vector), EMBEDDING_DIM)
        )
    embedding_matrix[i] = embedding_vector

In [None]:
from keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.regularizers import l2
embedding_vecor_length = 32  # NOTE(review): not read by any code in this cell
model = Sequential()
# Add embedding layer, initialised with the GloVe matrix and fine-tuned
# during training.
model.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True))
model.add(Conv1D(filters=64, kernel_size=2, padding='valid', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=3, padding='valid', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(250, kernel_regularizer=l2(0.01)))
model.add(Dense(3, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line).
model.summary()
filepath="weights_best_cnn.hdf5"
# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy'. Monitoring 'val_acc' finds no such value and the
# best weights are never written.
# NOTE(review): very old Keras releases logged this metric as 'val_acc' -
# confirm against the installed version.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train_Glove, Ytrain, epochs=100, batch_size=128,verbose = 1,callbacks = callbacks_list,validation_data=(X_test_Glove,Ytest))

In [None]:
from tensorflow import argmax
from tensorflow.keras.backend import get_value
# Class ids produced by the star remapping: 1-2 stars -> 0, 3 stars -> 1,
# 4-5 stars -> 2.
NEGATIVE_CLASS = 0
POSITIVE_CLASS = 2


def _format_row(row):
    """Join the fields of one [predicted, actual, text] row with tabs."""
    return "\t".join(str(field) for field in row)


def _write_rows(path, rows):
    """Write the header line and one tab-separated line per row to `path`."""
    with open(path, "w") as out_file:
        out_file.write("Predicted\tstars\ttext")
        out_file.write("\n")
        for row in rows:
            out_file.write(_format_row(row))
            out_file.write("\n")


def _print_examples(title, rows, limit=3):
    """Print `title` followed by at most `limit` rows (fewer if not available)."""
    print(title)
    for row in rows[:limit]:
        print(_format_row(row))
    print("\n")


results = model.predict(X_test_Glove)
# One row of class probabilities per review -> predicted class id.
sentiment_results = np.argmax(results, axis=1).tolist()
# Ytest already holds integer class ids (the model is trained with
# sparse_categorical_crossentropy), so it is compared directly; taking an
# argmax of each scalar label would turn every label into 0.
label = np.asarray(Ytest).ravel().astype(int).tolist()
# Positional list of the test texts: indexing a pandas Series with 0..n-1
# would look rows up by index label instead of by position.
test_texts = list(Xtest)
# Distinct classes the model actually predicted.
print(sorted(set(sentiment_results)))
print("Predicted\tstars\ttext")

wrong = []
right = []
for predicted_class, true_class, review_text in zip(sentiment_results, label, test_texts):
    row = [predicted_class, true_class, review_text]
    if predicted_class != true_class:
        wrong.append(row)
    else:
        right.append(row)

_print_examples("WRONG", wrong)
_write_rows("wrong.txt", wrong)

# Among the misclassified reviews, count the two polar-opposite mistakes.
count_model_sad = sum(1 for row in wrong if row[0] == NEGATIVE_CLASS and row[1] == POSITIVE_CLASS)
count_model_happy = sum(1 for row in wrong if row[0] == POSITIVE_CLASS and row[1] == NEGATIVE_CLASS)
if wrong:
    print("Proportion of model classifies positive as negative out of wrong: ", count_model_sad/len(wrong))
    print("Proportion of model classifies negative as positive out of wrong: ", count_model_happy/len(wrong))
else:
    # Nothing was misclassified, so the proportions are undefined.
    print("No misclassified reviews.")
print("\n")

_write_rows("right.txt", right)
_print_examples("RIGHT", right)

n_evaluated = len(wrong) + len(right)
if n_evaluated:
    print("accuracy: ",len(right)/n_evaluated)
else:
    print("accuracy: n/a (no test reviews)")

TODO: plot a confusion matrix of the true ratings vs. the model outputs.
TODO: add in yz's preprocessing.

In [None]:
# Per-class precision / recall / F1 on the test split. Uses sklearn's
# classification_report (imported at the top of the notebook) and the Ytest
# labels; `classifier_report` and `Y_test` are not defined anywhere.
# `results` holds one row of class probabilities per review, so it is reduced
# to predicted class ids first.
print(classification_report(Ytest, np.argmax(results, axis=1)))