In [1]:
import pandas as pd
import numpy as np
import re
import ast
import json

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from scipy import sparse

from utils.utils import isSubArray
from utils.phrase_breaker import phrase_breaker

### Loading Data

In [11]:
# Read the raw review export. The encoding is pinned so the load does not
# depend on the platform's default locale encoding.
# NOTE(review): the .jsonc extension suggests the file may contain comments,
# which json.load rejects — confirm the file is plain JSON.
with open('data/HotelOpinionsJSON-FirstSet.jsonc', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [13]:
# Build the review table in one call from the list of review records.
# Growing a frame row by row with DataFrame.append is quadratic, and the
# method no longer exists in pandas 2.x.
data = pd.DataFrame(json_data["Reviews"])


Unnamed: 0,Id,Opinion,Rate,Elements
0,0,"Great hotel, beautiful rooms and excellent ser...",5,"[{'Element': 'room', 'Occurrence': '2'}, {'Ele..."
1,0,Lovely hotel in an excellent location. Rooms a...,5,"[{'Element': 'location', 'Occurrence': '2'}, {..."
2,0,In chronological order; arrived at the hotel (...,1,"[{'Element': 'parking', 'Occurrence': '2'}, {'..."
3,0,"The hotel is located in Braga Street, excellen...",4,"[{'Element': 'location', 'Occurrence': '2'}, {..."
4,0,Pros: - Strategically placed in the middle of...,3,"[{'Element': 'location', 'Occurrence': '1'}, {..."


### Text Cleaning

In [None]:
# Run the project's phrase_breaker helper (utils.phrase_breaker) over every
# review text and store the result back in the "Opinion" column.
data["Opinion"] = data["Opinion"].apply(phrase_breaker)

In [None]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the punctuation untouched.
data["Opinion"] = data["Opinion"].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Split each cleaned review into a list of tokens with NLTK's word_tokenize.
data["Opinion"] = data["Opinion"].apply(word_tokenize)

In [None]:
def lower_casing(words):
    """Return a new list holding the lower-cased form of every item in ``words``.

    Args:
        words: Iterable of strings (here, the tokens of one review).

    Returns:
        A list with ``item.lower()`` for each item, in the original order.
    """
    return [token.lower() for token in words]

# Lower-case every token of each tokenized review.
data["Opinion"] = data["Opinion"].apply(lower_casing)

In [None]:
# Tag each review's token list with NLTK's pos_tag. The result is kept in its
# own Series so "Opinion" still holds the plain tokens.
pos_tagged_vocab = data["Opinion"].apply(pos_tag)

In [None]:
# Collect, per review, only the tag half of each (word, tag) pair.
# The tags must come from pos_tagged_vocab: data["Opinion"] holds plain tokens,
# so indexing them with [1] would return a character rather than a tag.
reviews_tags = [
    [tag for _word, tag in tagged_review]
    for tagged_review in pos_tagged_vocab
]

# Store one tag list per review (the whole collection, not just the last
# review's list), aligned on the frame's index.
data["POS Tags"] = pd.Series(reviews_tags, index=data.index)

In [None]:
def select_categories(pos_tagged_words):
    """Keep the words whose POS tag is in the allowed tag list.

    The allowed tags are the JJ*, NN/NNS, RB* and VB* families listed below
    (presumably the Penn Treebank adjective, noun, adverb and verb tags).

    Args:
        pos_tagged_words: Iterable of ``(word, tag)`` pairs.

    Returns:
        A list of the words, in order, whose tag is one of the allowed tags.
    """
    wanted_tags = ["JJ", "JJR", "JJS", "NN", "NNS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    return [token for token, tag in pos_tagged_words if tag in wanted_tags]

# Filter each tagged review down to the words select_categories keeps and
# store them in a new "Processed Opinion" column.
data["Processed Opinion"] = pos_tagged_vocab.apply(select_categories)

In [None]:
# Lemmatization
# A single shared WordNet lemmatizer instance, reused for every review.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatization(sentence):
    """Lemmatize every word of ``sentence`` as a verb.

    Args:
        sentence: Iterable of word strings.

    Returns:
        A list with ``wordnet_lemmatizer.lemmatize(word, pos="v")`` for each
        word, in the original order.
    """
    return [wordnet_lemmatizer.lemmatize(token, pos="v") for token in sentence]

# Replace each review's selected words with their lemmatized forms.
data["Processed Opinion"] = data["Processed Opinion"].apply(lemmatization)

In [None]:
# "Processed Opinion" holds token lists at this point and re.sub only accepts
# strings, so join each list into one space-separated string first. The regex
# then blanks out stand-alone single letters.
data["Processed Opinion"] = data["Processed Opinion"].apply(
    lambda words: re.sub(r"\b[a-zA-Z]\b", "", " ".join(words))
)

### BOW Vector creation

In [None]:
# Convert the processed reviews to unicode strings and vectorize them.
# NOTE(review): `vectorizer` is not defined or imported in the visible cells —
# presumably a fitted vectorizer created or loaded elsewhere; confirm it exists
# on a fresh kernel before this cell runs.
bow_stars_array = vectorizer.transform(data["Processed Opinion"].values.astype('U'))

### Pattern Matching

In [None]:
# Load the per-rating patterns; `names` supplies the five column labels, one
# column per star rating.
star_patterns = pd.read_csv("data/star_patterns.csv", names=["One Star", "Two Star", "Three Star", "Four Star", "Five Star"])

In [None]:
# Each cell is read from the CSV as text; parse it back into a Python literal
# with ast.literal_eval, one star-rating column at a time.
for star_column in ("One Star", "Two Star", "Three Star", "Four Star", "Five Star"):
    star_patterns[star_column] = star_patterns[star_column].apply(ast.literal_eval)

In [None]:
# Zero-filled indicator array: one row per review, five columns (filled in
# below with col_index 0-4, one per star rating).
bow_occurences_array = np.zeros((data["POS Tags"].shape[0], 5))

In [None]:
def check_occurrences_stars(data, star_grams, col_index, stars, occ_array):
    """Mark rows whose POS tags contain any of the given n-grams.

    Rows are visited with ``data.iterrows()``. Only rows whose "Rate" equals
    ``stars`` are examined. For such a row, if ``isSubArray`` reports that any
    n-gram from ``star_grams`` occurs in the row's "POS Tags", the cell
    ``occ_array[position, col_index]`` is set to 1, where ``position`` is the
    row's ordinal position in the iteration (not its index label).

    Args:
        data: Frame with "Rate" and "POS Tags" columns.
        star_grams: Iterable of n-grams to look for.
        col_index: Column of ``occ_array`` to write to.
        stars: Rating value a row must have to be examined.
        occ_array: 2-D array updated in place.

    Returns:
        None; the result is written into ``occ_array``.
    """
    for position, (_label, record) in enumerate(data.iterrows()):
        if record["Rate"] != stars:
            continue
        for pattern in star_grams:
            tags = record["POS Tags"]
            if isSubArray(tags, pattern, len(tags), len(pattern)):
                occ_array[position, col_index] = 1

# Fill one indicator column per star rating (column 0 <-> 1 star, ...,
# column 4 <-> 5 stars).
# The whole frame is passed, not just data["POS Tags"]: the function iterates
# with .iterrows() and reads both "Rate" and "POS Tags" from each row, which a
# single Series cannot provide.
star_columns = ["One Star", "Two Star", "Three Star", "Four Star", "Five Star"]
for col_index, star_column in enumerate(star_columns):
    check_occurrences_stars(data, star_patterns[star_column], col_index, col_index + 1, bow_occurences_array)

In [None]:
# Convert the dense indicator array to CSR sparse format.
bow_occurrences_matrix = sparse.csr_matrix(bow_occurences_array)

In [None]:
# Append the pattern-indicator columns to the right of the bag-of-words
# features (horizontal stack: both inputs need the same number of rows).
bow_stars_array_opt = sparse.hstack([bow_stars_array, bow_occurrences_matrix])