# Import modules

In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary
from gensim.models.word2vec import Word2Vec
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
import string

# Load data

In [None]:
# Read the TripAdvisor hotel reviews CSV from the Kaggle input directory
# and preview the first five rows.
df = pd.read_csv(
    "/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv"
)
df.head(n=5)

## Data basic information

In [None]:
# Print a concise summary of the loaded DataFrame.
df.info()

## Response distribution

In [None]:
# Count the rows per star rating, then draw each rating's share of the
# reviews as a pie chart labelled with percentages.
rating_counts = df.groupby("Rating").count()
rating_counts.plot.pie(y="Review", autopct="%.2f%%", figsize=(6, 6))

In [None]:
# Instead of using the original star rating as the class label, binarize it:
# 5 stars is the positive class and every rating below 5 is the negative class.
# Ratings are subjective, and the 1-3 star reviews make up only a small share
# of the data while expressing broadly similar (negative) sentiment.
# 4-star reviews are also placed in the negative class to keep the two
# classes better balanced.
# NOTE: cast with the builtin ``int``; the ``np.int`` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
df["Response"] = (df["Rating"] > 4).astype(int)

# Data split

# No preprocessing

In [None]:
# Shuffle the rows with a fixed seed so the positional train/test split
# below is reproducible.
np.random.seed(1234)
indices = np.random.permutation(df.shape[0])
df = df.iloc[indices].reset_index(drop=True)

# Tokenize every review into a list of word tokens.
corpus = df["Review"].apply(word_tokenize)

# The first `split_ind` shuffled reviews form the training set; the
# remaining ones form the test set.
split_ind = 16000
train_corpus, test_corpus = corpus[:split_ind], corpus[split_ind:]

# The vocabulary is built from the training documents only.
dct = Dictionary(train_corpus)

In [None]:
# Split the binary labels at the same position as the corpus so that labels
# and documents stay aligned.
labels = df["Response"].to_numpy()
train_label, test_label = labels[:split_ind], labels[split_ind:]

## Term-document matrix

In [None]:
# Convert a bag-of-words corpus into a SciPy compressed sparse row (CSR)
# matrix so that the document-term matrix fits into machine memory.
def corpus2csr(bow,n_terms=None):
    """Build a CSR document-term matrix from a bag-of-words corpus.

    Parameters
    ----------
    bow : iterable of iterables of (term_id, count) pairs
        One inner iterable per document; each pair gives a column index
        and the value stored at that column for the document's row.
    n_terms : int, optional
        Number of columns of the result. When omitted, SciPy infers the
        width from the largest term id present in ``bow``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with one row per document. When ``n_terms`` is given it has
        exactly ``n_terms`` columns, without any explicitly stored zeros.

    Raises
    ------
    ValueError
        If ``n_terms`` is given and some term id is ``>= n_terms``.
    """
    data = []
    indices = []
    indptr = [0]
    for doc in bow:
        for term_id, count in doc:
            indices.append(term_id)
            data.append(count)
        # Each row ends where the flat entry list currently ends.
        indptr.append(len(indices))

    shape = None
    if n_terms is not None:
        # Column ids are zero-based, so the largest valid id is n_terms - 1.
        if indices and max(indices) >= n_terms:
            raise ValueError(
                "Largest term id {} does not fit into n_terms={} columns".format(
                    max(indices), n_terms
                )
            )
        # Pass the width explicitly instead of padding the last row with
        # stored zeros; this also works for a corpus without any entries.
        shape = (len(indptr) - 1, n_terms)

    return csr_matrix(
        (
            np.array(data),
            np.array(indices, dtype=np.int64),
            np.array(indptr, dtype=np.int64),
        ),
        shape=shape,
    )

In [None]:
# Encode both splits as bag-of-words with the training vocabulary, then
# convert them to CSR matrices that share the same number of columns.
n_terms = len(dct)
train_bow = [dct.doc2bow(doc) for doc in train_corpus]
test_bow = [dct.doc2bow(doc) for doc in test_corpus]
train_sparse = corpus2csr(train_bow, n_terms)
test_sparse = corpus2csr(test_bow, n_terms)
train_sparse.shape, test_sparse.shape

## Naive Bayes classifier

In [None]:
# Bernoulli naive Bayes: each term is treated as present or absent.
clf_nb = BernoulliNB().fit(train_sparse, train_label)
train_acc = clf_nb.score(train_sparse, train_label)
test_acc = clf_nb.score(test_sparse, test_label)
print(f"Train accuracy is {train_acc}")
print(f"Test accuracy is {test_acc}")

In [None]:
# Multinomial naive Bayes: fitted on the per-document term counts.
clf_nb = MultinomialNB()
clf_nb.fit(train_sparse, train_label)
train_acc, test_acc = (
    clf_nb.score(train_sparse, train_label),
    clf_nb.score(test_sparse, test_label),
)
print(f"Train accuracy is {train_acc}")
print(f"Test accuracy is {test_acc}")

## Random Forest classifier

In [None]:
# Random forest with default hyper-parameters on the sparse term counts.
clf_rf = RandomForestClassifier().fit(train_sparse, train_label)
train_acc = clf_rf.score(train_sparse, train_label)
test_acc = clf_rf.score(test_sparse, test_label)
print(f"Train accuracy is {train_acc}")
print(f"Test accuracy is {test_acc}")

# Classical preprocessing

(stemming, and removing stopwords and punctuation)

In [None]:
# Stem every token and drop English stopwords and single-character
# punctuation tokens.
stemmer = PorterStemmer()
stopwords_en = stopwords.words("english")
punctuations = list(string.punctuation)
stopw_punct = set(stopwords_en + punctuations)
corpus_preprocessed = []
for sent in corpus:
    kept = []
    for word in sent:
        # Check the raw lowercased token first: many stopwords no longer
        # match the stopword list once stemmed (e.g. "this" -> "thi",
        # "was" -> "wa"), so testing only the stem lets them slip through.
        if word.lower() in stopw_punct:
            continue
        # Stem once per token instead of once for the filter and once more
        # for the stored value.
        stemmed = stemmer.stem(word).lower()
        # Still drop tokens whose stem is itself a stopword or punctuation.
        if stemmed not in stopw_punct:
            kept.append(stemmed)
    corpus_preprocessed.append(kept)

# Train/test split at the same position as before, so the existing labels
# stay aligned with the preprocessed documents.
split_ind = 16000 # splitting index
train_corpus = corpus_preprocessed[:split_ind]
test_corpus = corpus_preprocessed[split_ind:]
# The vocabulary is rebuilt from the preprocessed training documents only.
dct = Dictionary(train_corpus)

## Term-document matrix

In [None]:
# Re-encode the preprocessed splits as bag-of-words with the new vocabulary
# and convert them to CSR matrices of equal width.
n_terms = len(dct)
train_bow = [dct.doc2bow(tokens) for tokens in train_corpus]
train_sparse = corpus2csr(train_bow, n_terms)
test_bow = [dct.doc2bow(tokens) for tokens in test_corpus]
test_sparse = corpus2csr(test_bow, n_terms)
train_sparse.shape, test_sparse.shape

## Naive Bayes classifier

In [None]:
# Bernoulli naive Bayes on the preprocessed corpus: each term is treated as
# present or absent.
clf_nb = BernoulliNB()
clf_nb.fit(train_sparse, train_label)
train_acc = clf_nb.score(train_sparse, train_label)
test_acc = clf_nb.score(test_sparse, test_label)
for split_name, accuracy in (("Train", train_acc), ("Test", test_acc)):
    print(f"{split_name} accuracy is {accuracy}")

In [None]:
# Multinomial naive Bayes on the preprocessed corpus, fitted on the
# per-document term counts.
clf_nb = MultinomialNB().fit(train_sparse, train_label)
train_acc = clf_nb.score(train_sparse, train_label)
test_acc = clf_nb.score(test_sparse, test_label)
print(f"Train accuracy is {train_acc}")
print(f"Test accuracy is {test_acc}")

## Random Forest classifier

In [None]:
# Random forest with default hyper-parameters on the preprocessed term counts.
clf_rf = RandomForestClassifier()
clf_rf.fit(train_sparse, train_label)
train_acc, test_acc = (
    clf_rf.score(train_sparse, train_label),
    clf_rf.score(test_sparse, test_label),
)
print(f"Train accuracy is {train_acc}")
print(f"Test accuracy is {test_acc}")

# Word2Vec preprocessing

using CBOW

In [None]:
# Train word embeddings on the tokenized training reviews only, so the test
# split stays unseen while the embeddings are learned.
train_corpus = corpus[:split_ind]
test_corpus = corpus[split_ind:]
# NOTE(review): `size`, `iter` and `wv.vocab` are the gensim 3.x spellings;
# gensim >= 4 renames them to `vector_size`, `epochs` and `wv.key_to_index`.
# Confirm the installed gensim version before running.
w2v_model = Word2Vec(train_corpus,size=128,min_count=3,seed=1234,iter=10)
# Extract the word embeddings and the set of words that received a vector.
embeddings = w2v_model.wv
vocabulary = set(embeddings.vocab.keys())


def _average_embedding(sent):
    """Return the mean embedding of the in-vocabulary tokens of ``sent``.

    A document without any in-vocabulary token gets a zero vector. Taking
    ``np.mean`` of an empty list would instead yield a scalar NaN, which
    makes the stacked feature matrix ragged and unusable by the classifiers.
    """
    vectors = [embeddings[word] for word in sent if word in vocabulary]
    if not vectors:
        return np.zeros(embeddings.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


# One averaged embedding per document.
train_doc2vec = np.array([_average_embedding(sent) for sent in train_corpus])
test_doc2vec = np.array([_average_embedding(sent) for sent in test_corpus])

In [None]:
# Eyeball the quality of the embeddings: one analogy-style query followed
# by the nearest neighbours of two probe words.
analogy_hits = embeddings.most_similar(
    positive=["man", "king"],
    negative=["woman", "queen"],
)
print(analogy_hits)
for probe in ("man", "woman"):
    print(embeddings.similar_by_word(probe))

## Logistic regression

In [None]:
# Logistic regression on the averaged document embeddings; the iteration
# cap is set to 300.
clf_lr = LogisticRegression(max_iter=300).fit(train_doc2vec, train_label)
train_acc = clf_lr.score(train_doc2vec, train_label)
test_acc = clf_lr.score(test_doc2vec, test_label)
print(f"Train accuracy is {train_acc}")
print(f"Test accuracy is {test_acc}")

## Random Forest

In [None]:
# Random forest on the averaged document embeddings, with tree depth
# capped at 12.
clf_rf = RandomForestClassifier(max_depth=12)
clf_rf.fit(train_doc2vec, train_label)
train_acc = clf_rf.score(train_doc2vec, train_label)
test_acc = clf_rf.score(test_doc2vec, test_label)
for split_name, accuracy in (("Train", train_acc), ("Test", test_acc)):
    print(f"{split_name} accuracy is {accuracy}")