In [None]:
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import dataset_utils

from tqdm import tqdm
from wordcloud import WordCloud
from collections import Counter
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK data packages for tokenisation ('punkt') and the stop-word lists.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the NLTK version this notebook is run with.
nltk.download('punkt')
nltk.download('stopwords')

# Constants

In [None]:
# Raw reviews file; read below as line-delimited JSON.
REVIEWS_DATASET_PATH = "./dataset/Grocery_and_Gourmet_Food_5.json"
# Raw products (metadata) file; read below as line-delimited JSON.
PRODUCTS_DATASET_PATH = "./dataset/meta_Grocery_and_Gourmet_Food.json"
# Merged reviews + products dataset, written after preprocessing and reloaded for the plots.
PREPROCESSED_DATASET_PATH = "./dataset/dataset.csv"
# Same dataset after the review-text preprocessing step; reloaded for the word analysis.
PREPROCESSED_DATASET_REVIEWS_PATH = "./dataset/dataset_prep_reviews.csv"
# One-row CSV with the general figures about the preprocessed dataset.
GENERAL_INFO_PATH = "./dataset/general_info.csv"

# Preprocess the dataset

In [None]:
# Load the raw reviews; lines=True makes pandas parse one JSON record per line.
reviews_dataset = pd.read_json(REVIEWS_DATASET_PATH, lines=True)
# Last expression of the cell: the frame is rendered as the cell output.
reviews_dataset

In [None]:
# Load the raw products data; lines=True makes pandas parse one JSON record per line.
products_dataset = pd.read_json(PRODUCTS_DATASET_PATH, lines=True)
# Last expression of the cell: the frame is rendered as the cell output.
products_dataset

In [None]:
# Size of the raw reviews dataset before any preprocessing: number of reviews,
# distinct reviewers and distinct products.
reviewers = np.unique(reviews_dataset['reviewerID'].to_numpy())
products = np.unique(reviews_dataset['asin'].to_numpy())
raw_summary = [
    ("Number of reviews: ", len(reviews_dataset)),
    ("Number of reviewers: ", len(reviewers)),
    ("Number of products: ", len(products)),
]
for label, count in raw_summary:
    print(label, count)

In [None]:
# Run the reviews preprocessing from dataset_utils; the result replaces the raw
# frame under the same name, so re-running this cell feeds it already-preprocessed data.
reviews_dataset = dataset_utils.preprocess_reviews_dataset(reviews_dataset)
# Last expression of the cell: the preprocessed frame is rendered as the cell output.
reviews_dataset

In [None]:
# Merge the preprocessed reviews with the products data into a single frame
# (the merge logic lives in dataset_utils).
dataset = dataset_utils.merge_reviews_and_products(reviews_dataset, products_dataset)
# Last expression of the cell: the merged frame is rendered as the cell output.
dataset

In [None]:
# Persist the merged dataset as UTF-8 CSV without the index column, so later
# cells can reload it from PREPROCESSED_DATASET_PATH.
dataset.to_csv(PREPROCESSED_DATASET_PATH, encoding='utf-8', index=False)

In [None]:
# Size of the merged dataset after preprocessing: number of reviews, distinct
# reviewers and distinct products.
reviewers = np.unique(dataset['reviewerID'].to_numpy())
products = np.unique(dataset['asin'].to_numpy())
merged_summary = [
    ("Number of reviews: ", len(dataset)),
    ("Number of reviewers: ", len(reviewers)),
    ("Number of products: ", len(products)),
]
for label, count in merged_summary:
    print(label, count)

In [None]:
# Count the missing values per column of the in-memory merged dataset.
dataset.isna().sum()

In [None]:
# Reload the merged dataset from disk and drop the reviews without a summary.
# NOTE(review): the CSV round-trip can surface missing values that were not NaN
# in memory (by default pandas parses empty fields and markers such as "NA" or
# "NaN" as missing) — presumably the reason the file is re-read here.
dataset = pd.read_csv(PREPROCESSED_DATASET_PATH)
# Only the last expression of a cell is rendered, so the per-column NaN counts
# have to be printed explicitly to appear in the output.
print(dataset.isna().sum())
dataset = dataset.dropna(subset=['summary'], how='any')
# Overwrite the saved dataset so that later sections reload the cleaned version.
dataset.to_csv(PREPROCESSED_DATASET_PATH, encoding='utf-8', index=False)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
dataset.describe()

In [None]:
# Run the review preprocessing from dataset_utils and save the result to its own
# file (PREPROCESSED_DATASET_REVIEWS_PATH); the file at PREPROCESSED_DATASET_PATH
# is left untouched.
dataset = dataset_utils.preprocess_reviews(dataset)
dataset.to_csv(PREPROCESSED_DATASET_REVIEWS_PATH, encoding='utf-8', index=False)

# Plot information

In [None]:
# Reload the saved preprocessed dataset so that this section can run without
# repeating the preprocessing cells above.
dataset = pd.read_csv(PREPROCESSED_DATASET_PATH)
# Last expression of the cell: the frame is rendered as the cell output.
dataset

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
# of the reloaded dataset.
dataset.describe()

In [None]:
# Count the missing values per column of the reloaded dataset.
dataset.isna().sum()

In [None]:
# Rows whose summary is the literal text "NaN".
# NOTE(review): with its default NA handling, pd.read_csv turns the text "NaN"
# into a missing value, so this selection is expected to be empty — confirm.
has_literal_nan = dataset['summary'].eq("NaN")
nan_summ = dataset.loc[has_literal_nan]
nan_summ

In [None]:
# Rows whose summary is missing.
summary_is_missing = pd.isna(dataset['summary'])
nan_summ = dataset.loc[summary_is_missing]
nan_summ

In [None]:
# Percentage of missing values per column, relative to the number of rows.
dataset.isna().sum().div(len(dataset)).mul(100)

In [None]:
# Size of the reloaded preprocessed dataset: number of reviews, distinct
# reviewers and distinct products.
reviewers = np.unique(dataset['reviewerID'].to_numpy())
products = np.unique(dataset['asin'].to_numpy())
reloaded_summary = [
    ("Number of reviews: ", len(dataset)),
    ("Number of reviewers: ", len(reviewers)),
    ("Number of products: ", len(products)),
]
for label, count in reloaded_summary:
    print(label, count)

In [None]:
# How many reviews come from verified purchases, in absolute terms and as a
# percentage of all reviews. Both figures are reused by the general-info cell.
verified_rows = dataset.loc[dataset['verified'].eq(True)]
number_reviews = len(dataset)
number_verified_purchases = len(verified_rows)
perc_verified_purchases = number_verified_purchases / number_reviews * 100
print(f"There are {number_verified_purchases} reviews from verified purchases out of {number_reviews} reviews.")
print(f"Percentage of reviews from verified purchases: {perc_verified_purchases}")

In [None]:
# Collect the headline figures of the preprocessed dataset in a one-row frame
# and save them to GENERAL_INFO_PATH.
# NOTE: `number_reviews` and `perc_verified_purchases` come from the
# verified-purchases cell, which has to be run before this one.
number_products = len(np.unique(dataset['asin'].values))
number_reviewers = len(np.unique(dataset['reviewerID'].values))
# Take the mean of the rating column directly instead of building the full
# describe() table (all statistics for every numeric column) to read one value.
average_rating = dataset['rating'].mean()
general_info_data = {
    "Number of reviews": [number_reviews],
    "Number of products": [number_products],
    "Number of reviewers": [number_reviewers],
    "Percentage of verified purchases": [perc_verified_purchases],
    "Average rating": [average_rating],
}
general_info = pd.DataFrame(general_info_data)
general_info.to_csv(GENERAL_INFO_PATH, encoding='utf-8', index=False)
# Last expression of the cell: the one-row frame is rendered as the cell output.
general_info

In [None]:
# Plot how the ratings are distributed over the reviews of the preprocessed
# dataset (drawing is delegated to dataset_utils).
dataset_utils.plot_ratings_distribution(dataset)

In [None]:
# Plot how the opinion labels are distributed over the reviews
# (drawing is delegated to dataset_utils).
dataset_utils.plot_opinions_distribution(dataset)

In [None]:
# Plot the distribution of the review lengths.
# NOTE(review): the unit of length (characters or tokens) is decided inside
# dataset_utils — confirm there.
dataset_utils.plot_reviews_length_distribution(dataset)

In [None]:
# Plot the distribution of the summary lengths.
# NOTE(review): the unit of length (characters or tokens) is decided inside
# dataset_utils — confirm there.
dataset_utils.plot_summaries_length_distribution(dataset)

In [None]:
# Plot how the reviews are distributed over the price of the reviewed product
# (one observation per review).
dataset_utils.plot_reviews_price_distribution(dataset)

In [None]:
# Plot how the products are distributed over their price
# (presumably one observation per product rather than per review — confirm in dataset_utils).
dataset_utils.plot_products_price_distribution(dataset)

In [None]:
# Plot the relation between the rating of a review and the price of the product.
dataset_utils.plot_rating_price_relation(dataset)

In [None]:
# Restrict the dataset to the reviews from verified purchases, then plot the
# rating and opinion distributions of that subset only.
verified_purchases = dataset.loc[dataset['verified'].eq(True)]
dataset_utils.plot_ratings_distribution(verified_purchases, verified=True)
dataset_utils.plot_opinions_distribution(verified_purchases, verified=True)

In [None]:
# Plot how many reviews fall into each year.
dataset_utils.plot_reviews_year_distribution(dataset)

In [None]:
# Plot the reviewers with the most and with the fewest reviews.
dataset_utils.plot_reviewers(dataset)

In [None]:
# Plot the products with the most and with the fewest reviews.
dataset_utils.plot_products(dataset)

In [None]:
# Compute the average number of reviews per product; a non-None return value of
# the helper is rendered as the cell output.
dataset_utils.compute_average_reviews_per_product(dataset)

In [None]:
# Compute the average number of reviews per reviewer; a non-None return value of
# the helper is rendered as the cell output.
dataset_utils.compute_average_reviews_per_reviewer(dataset)

In [None]:
# Plot the distribution of the products' average ratings.
dataset_utils.plot_average_rating_per_product_distribution(dataset)

In [None]:
# Plot the products with the highest average rating
# (k=20 is presumably the number of products shown — confirm in dataset_utils).
dataset_utils.plot_top_rated_products(dataset, k=20)

In [None]:
# Plot the distribution of the reviewers' average ratings.
dataset_utils.plot_average_rating_per_reviewer_distribution(dataset)

In [None]:
# Plot how the dataset evolves over the years; which quantities are tracked is
# decided inside dataset_utils.
dataset_utils.plot_information_over_time(dataset)

In [None]:
# Plot the correlations between the variables of the dataset; which columns are
# included is decided inside dataset_utils.
dataset_utils.plot_correlations(dataset)

# Plot word information

In [None]:
# Load the dataset that carries the preprocessed review text and replace the
# missing values of 'preprocessedReviewText' by empty strings.
# NOTE(review): presumably reviews left empty by the preprocessing come back
# from the CSV as NaN — confirm.
dataset = pd.read_csv(PREPROCESSED_DATASET_REVIEWS_PATH).assign(
    preprocessedReviewText=lambda frame: frame['preprocessedReviewText'].fillna("")
)
dataset

In [None]:
# Count the missing values per column after filling the preprocessed review text.
dataset.isna().sum()

In [None]:
# Print and plot the most common words over all the reviews.
dataset_utils.plot_most_common_words(dataset)

In [None]:
# Print and plot the most common words of the reviews labelled 'negative'.
is_negative = dataset['opinion'].eq('negative')
negative_dataset = dataset.loc[is_negative]
dataset_utils.plot_most_common_words(negative_dataset, opinion="negative")

In [None]:
# Print and plot the most common words of the reviews labelled 'neutral'.
is_neutral = dataset['opinion'].eq('neutral')
neutral_dataset = dataset.loc[is_neutral]
dataset_utils.plot_most_common_words(neutral_dataset, opinion="neutral")

In [None]:
# Print and plot the most common words of the reviews labelled 'positive'.
is_positive = dataset['opinion'].eq('positive')
positive_dataset = dataset.loc[is_positive]
dataset_utils.plot_most_common_words(positive_dataset, opinion="positive")

In [None]:
# Plot the most common words shared by the negative, neutral and positive reviews.
# Requires the three per-opinion subsets built in the cells above
# (k=20 is presumably the number of top words considered per opinion — confirm in dataset_utils).
dataset_utils.plot_intersection_most_common_words(negative_dataset, neutral_dataset, positive_dataset, k=20)

In [None]:
# Print the tokens that occur in more reviews than the threshold allows, using the
# strict setting threshold=0.9
# (presumably a fraction of the reviews, i.e. more than 90% of them — confirm in dataset_utils).
dataset_utils.find_frequent_tokens_in_reviews(dataset, threshold=0.9)

In [None]:
# Same check with the looser setting threshold=0.2
# (presumably tokens occurring in more than 20% of the reviews — confirm in dataset_utils).
dataset_utils.find_frequent_tokens_in_reviews(dataset, threshold=0.2)