In [82]:
import re
import nltk
import functools
import operator
import pandas as pd
from collections import Counter

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB

# Shared WordNet lemmatizer instance; clean_text() calls normalizer.lemmatize() on each token.
normalizer = WordNetLemmatizer()

In [83]:
# Fetch the NLTK data packages used by this notebook. Per the log below, packages
# that are already present are reported as up to date rather than re-downloaded.
# 'wordnet' backs wordnet.synsets() and the WordNetLemmatizer.
nltk.download('wordnet')
# 'punkt' is the tokenizer data used by word_tokenize().
nltk.download('punkt')
# NOTE(review): no visible cell calls a POS tagger (get_part_of_speech uses
# WordNet synsets instead) - confirm whether this package is still needed.
nltk.download('averaged_perceptron_tagger')
# 'stopwords' backs stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vabis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vabis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vabis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vabis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
# English stop-word list from NLTK.
# NOTE(review): the only visible reference is inside the commented-out clean_text
# variant - confirm whether stop-word removal is still intended.
STOPWORDS = stopwords.words('english')
# Location of the reviews CSV (a relative path).
file_path = "data/DisneylandReviews.csv"

In [85]:
# Load the raw reviews into a DataFrame and preview the first rows.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [86]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          1008 non-null   int64 
 1   Rating             1008 non-null   int64 
 2   Year_Month         1008 non-null   object
 3   Reviewer_Location  1008 non-null   object
 4   Review_Text        1008 non-null   object
 5   Branch             1008 non-null   object
dtypes: int64(2), object(4)
memory usage: 31.6+ KB


In [87]:
def get_language_code(text):
    """Return the language code detected for ``text``, or NaN if none can be found.

    Parameters
    ----------
    text : str
        Raw review text to classify.

    Returns
    -------
    str or float
        The code returned by ``langdetect.detect`` (e.g. ``"en"``), or a float
        NaN when ``text`` is not a string or detection raises
        ``LangDetectException``.
    """
    # numpy is not imported in this notebook, so the previous ``np.nan`` raised
    # NameError exactly when detection failed. A plain float NaN is what
    # ``np.nan`` is anyway, so ``pd.isna`` and friends treat it identically.
    missing = float("nan")

    # Missing cells (None / NaN) are not text; report them as undetectable
    # instead of handing a non-string to the detector.
    if not isinstance(text, str):
        return missing

    try:
        return detect(text)
    except LangDetectException:
        return missing

In [88]:
# Bottleneck on large datasets: language detection runs once per review.
# Apply the detector to the text column directly instead of iterating rows with
# DataFrame.apply(axis=1); the result is the same, without building a Series
# object for every row.
df["language_code"] = df["Review_Text"].apply(get_language_code)

In [89]:
# Number of rows whose detected language is not English
# (rows where detection failed and produced NaN are counted too).
int((df["language_code"] != "en").sum())

0

In [90]:
@functools.lru_cache(maxsize=None)
def get_part_of_speech(word):
    """Guess the most likely WordNet part of speech for ``word``.

    Every synset WordNet returns for ``word`` votes for its own part of speech;
    the tag with the most votes among noun (``"n"``), verb (``"v"``),
    adjective (``"a"``) and adverb (``"r"``) wins.

    Parameters
    ----------
    word : str
        A single token. It must be hashable because results are memoised.

    Returns
    -------
    str
        One of ``"n"``, ``"v"``, ``"a"``, ``"r"``. Ties, including the case of
        a word with no synsets at all, resolve to the earliest tag in that
        order, so unknown words are treated as nouns.
    """
    # Results are cached because the same tokens recur across reviews and each
    # lookup queries WordNet. One pass over the synsets replaces four scans.
    pos_counts = Counter(synset.pos() for synset in wordnet.synsets(word))

    # Only these four tags are candidates; any other tag WordNet reports is
    # tallied but never selected, as before. max() keeps the first maximal
    # candidate, which preserves the n > v > a > r tie-break.
    return max(("n", "v", "a", "r"), key=lambda tag: pos_counts[tag])



def clean_text(text):
    """Turn raw review text into a list of lower-cased, lemmatized tokens.

    Steps: replace URLs (anything starting with ``http``) with a space, collapse
    every run of non-word characters into a single space, lower-case, tokenize
    with ``word_tokenize`` and lemmatize each token using the part of speech
    guessed by ``get_part_of_speech``.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list
        One lemmatized entry per token, in order of appearance.
    """
    without_urls = re.sub(r'http\S+', ' ', text)
    lowered = re.sub(r'\W+', ' ', without_urls).lower()

    lemmas = []
    for word in word_tokenize(lowered):
        part_of_speech = get_part_of_speech(word)
        lemmas.append(normalizer.lemmatize(word, part_of_speech))
    return lemmas

# def clean_text(text: str):
#     # remove and replace all urls
#     text = re.sub(r'http\S+', ' ', text)

#     # remove and replace none alphanumerical letters
#     text = re.sub(r'\W+', ' ', text.lower())

#     words = []
#     for word in text.split():
#         if word in STOPWORDS:
#             continue
#         words.append(Word(word).lemmatize())
#     return " ".join(words)

In [91]:
# Tokenize and lemmatize every review with clean_text(); each cell of the new
# 'tokens' column holds that review's token list. Then preview the result.
# df['text_cleaned'] = df['Review_Text'].apply(clean_text)
df['tokens'] = df['Review_Text'].apply(clean_text)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,language_code,tokens
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,en,"[if, you, ve, ever, be, to, disneyland, anywhe..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,en,"[it, be, a, while, since, d, last, time, we, v..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,en,"[thanks, god, it, wasn, t, too, hot, or, too, ..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,en,"[hk, disneyland, be, a, great, compact, park, ..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,en,"[the, location, be, not, in, the, city, take, ..."


In [92]:
# Print the first review's full text (the DataFrame preview truncates it).
print(df.iloc[0]['Review_Text'])


If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. 


In [97]:
def create_features_dictionary(texts):
    """Build the vocabulary that maps each distinct token to a column index.

    All texts are joined with single spaces, run through ``clean_text`` once,
    and every token is numbered in order of first appearance, starting at 0.

    Parameters
    ----------
    texts : iterable of str
        The documents to derive the vocabulary from.

    Returns
    -------
    dict
        ``{token: index}`` with indices ``0 .. len(vocabulary) - 1``.
    """
    vocabulary = {}
    for word in clean_text(' '.join(texts)):
        # The next free index always equals the current vocabulary size;
        # setdefault leaves tokens that were already seen untouched.
        vocabulary.setdefault(word, len(vocabulary))
    return vocabulary

def text_to_bow_vector(text, features_dictionary):
    """Convert raw text into a bag-of-words count vector.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized with ``clean_text``.
    features_dictionary : dict
        ``{token: index}`` vocabulary, e.g. from ``create_features_dictionary``.

    Returns
    -------
    tuple
        ``(bow_vector, tokens)``: ``bow_vector`` is a list of
        ``len(features_dictionary)`` counts, and ``tokens`` is the full token
        list produced by ``clean_text``, including tokens that are not in the
        vocabulary.
    """
    tokens = clean_text(text)
    bow_vector = [0] * len(features_dictionary)
    for token in tokens:
        # Skip out-of-vocabulary tokens instead of raising KeyError, matching
        # tokens_to_bow_vector(); text that was not used to build the
        # vocabulary can then still be vectorised.
        if token in features_dictionary:
            bow_vector[features_dictionary[token]] += 1
    return bow_vector, tokens

def tokens_to_bow_vector(tokens, features_dictionary):
    """Count already-tokenized text into a bag-of-words vector.

    Parameters
    ----------
    tokens : iterable
        Tokens to count.
    features_dictionary : dict
        ``{token: index}`` vocabulary.

    Returns
    -------
    list
        ``len(features_dictionary)`` counts; tokens absent from the vocabulary
        are ignored.
    """
    counts = [0] * len(features_dictionary)
    known_positions = (
        features_dictionary[word] for word in tokens if word in features_dictionary
    )
    for position in known_positions:
        counts[position] += 1
    return counts

In [98]:
# Build the token -> column-index vocabulary from the text of every review.
# NOTE(review): this covers all rows, including those later sliced off as test
# data - confirm that is intended.
features_dictionary = create_features_dictionary(df['Review_Text'])
# print(features_dictionary)

In [107]:
# Bag-of-words vector of the first review on its own.
vector = text_to_bow_vector(df.iloc[0]['Review_Text'], features_dictionary)[0]

# Vectorise every review. Each call returns a (bow_vector, tokens) pair, so the
# resulting Series of pairs is transposed into two parallel tuples before being
# stored as columns.
vectorised_reviews = df['Review_Text'].apply(text_to_bow_vector, args=(features_dictionary,))
bow_vectors, token_lists = zip(*vectorised_reviews)
df['bow_vector'] = bow_vectors
df['tokens'] = token_lists


In [None]:
# Inspect the first review's token list and its bag-of-words count vector.
print(df.iloc[0]['tokens'])
print(df.iloc[0]['bow_vector'])

In [123]:
training_vectors = df.iloc[0:700]['bow_vector'].tolist()
test_vectors = df.iloc[701:900]['bow_vector'].tolist()
rest_data = df.iloc[901:]['bow_vector'].tolist()
# print(training_vectors[0], len(training_vectors))
# print(test_vectors[0], len(test_vectors))

In [132]:
classifier = MultinomialNB()
classifier.fit(training_vectors, df[0:700]['Rating'].tolist())
predictions = classifier.score(test_vectors, df[701:900]['Rating'].tolist())

In [131]:
# `predictions` is the value returned by classifier.score(...) on the test
# vectors, not a list of per-review predictions.
print(predictions)
# Predict the rating for the first training vector; it is wrapped in a list
# because predict() is given a batch of vectors.
classifier.predict([training_vectors[0]])


0.49246231155778897


array([4])