In [1]:
import nltk
import ssl

# Presumably a workaround for certificate errors during the NLTK downloads below: when this
# interpreter exposes `ssl._create_unverified_context`, install it as the default HTTPS
# context factory. NOTE(review): this switches off certificate verification for every HTTPS
# request in this process that relies on the default context, not just for NLTK.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK resources used later in this notebook: the POS-tagger model (for
# `nltk.pos_tag`) and the Punkt tokenizer data (for `nltk.word_tokenize`).
# The last call is left as a bare expression so the cell displays its return value.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rajeshkc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajeshkc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from typing import List, Tuple

import nltk


class Review:
    """One review record: identifiers, age, title/body text, rating data and product categories."""

    def __init__(self, review_id: str, product_id: str, age: int, title: str, review_text: str, rating: int,
                 recommends: bool, upvotes: int, division_name: str, department_name: str, class_name: str):
        """Store every constructor argument unchanged on the instance under the same name."""
        # Identifiers
        self.review_id, self.product_id = review_id, product_id
        # Age and the free-text parts of the review
        self.age, self.title, self.review_text = age, title, review_text
        # Rating data
        self.rating, self.recommends, self.upvotes = rating, recommends, upvotes
        # Product categorisation
        self.division_name, self.department_name, self.class_name = division_name, department_name, class_name

    def full_text(self) -> str:
        """Return the title and the review body joined by a single space."""
        return "{} {}".format(self.title, self.review_text)

    def words(self) -> List[str]:
        """Return the tokens produced by `nltk.word_tokenize` for `full_text()`."""
        text = self.full_text()
        return nltk.word_tokenize(text)

    def tagged_words(self) -> List[Tuple[str, str]]:
        """ Pair every token of `full_text()` with the part-of-speech tag predicted by `nltk.pos_tag`.

            Example: for the text "I love this red shirt" the result is
            [('I', 'PRP'), ('love', 'VBP'), ('this', 'DT'), ('red', 'JJ'), ('shirt', 'NN')]
            Each tag names a grammatical class; a list of the possible tags is available at
            https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/"""
        tokens = self.words()
        return nltk.pos_tag(tokens)


In [None]:
import pandas as pd
from typing import List
import os

from model.Review import Review


# Absolute path of the sibling "data" directory (one level up from this code, then "data").
# `__file__` only exists when this code runs as a module/script; a notebook kernel does not
# define it, so fall back to the current working directory instead of raising NameError.
try:
    _base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _base_dir = os.getcwd()
data_dir = os.path.abspath(os.path.join(_base_dir, os.pardir, "data"))

def load_reviews() -> List[Review]:
    """Read `reviews.csv` from `data_dir` and convert each usable row into a `Review`.

    The CSV is read with a fixed list of column names, so every line of the file (including a
    header line, if the file has one) is treated as data. Rows whose numeric fields (`age`,
    `rating`, `recommends`, `upvotes`) cannot be converted to `int` are skipped rather than
    aborting the load; the number of skipped rows is printed.

    Returns:
        The list of successfully converted `Review` objects, in file order.

    Raises:
        Whatever `pandas.read_csv` raises when the file is missing or unreadable.
    """
    print("Loading dataset ...")
    path = os.path.join(data_dir, "reviews.csv")
    column_names = ["review_id", "product_id", "age", "title", "review_text", "rating", "recommends", "upvotes",
                    "division_name", "department_name", "class_name"]
    data = pd.read_csv(path, delimiter=",", names=column_names)
    # `fillna` returns a new frame; without re-assigning, missing text cells stay NaN and
    # end up as the literal string "nan" after `str(...)`.
    data = data.fillna("")
    dataset = []
    skipped = 0
    for _, row in data.iterrows():
        try:
            entry = Review(str(row["review_id"]), str(row["product_id"]), int(row["age"]), str(row["title"]),
                           str(row["review_text"]), int(row["rating"]), bool(int(row["recommends"])),
                           int(row["upvotes"]), str(row["division_name"]), str(row["department_name"]),
                           str(row["class_name"]))
        except (ValueError, TypeError, OverflowError):
            # Best-effort load: a row with blank or non-numeric values in an integer column
            # is dropped. Only conversion errors are caught, so Ctrl-C and genuine bugs
            # still surface.
            skipped += 1
            continue
        dataset.append(entry)
    if skipped:
        print(f"Skipped {skipped} row(s) with invalid numeric fields.")
    print(f"Dataset loaded ({len(dataset)} rows).")
    return dataset
