In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import re

from reviews.preprocess import preprocess
from reviews.config import processed_data_dir

# Read Dataset
- TODO: fix the remaining empty values (count of empty values per column): {'category': 0, 'description': 1632, 'title': 0, 'brand': 0, 'main_cat': 0, 'date': 0, 'asin': 0, 'overall': 0, 'vote': 0, 'text': 0, 'summary': 1, 'timestamp': 0}

In [None]:
# load the product metadata and the reviews from the processed data directory
meta_path = processed_data_dir / "meta_digital_cameras.json.gz"
reviews_path = processed_data_dir / "reviews_digital_cameras.json.gz"

prod_df = pd.read_json(meta_path)
review_df = pd.read_json(reviews_path)

In [None]:
# join products and reviews on the product id (asin); merge defaults to an inner join
data_df = prod_df.merge(review_df, on='asin')
data_df.info()

In [None]:
# Display the first 10 rows of the merged product/review DataFrame
data_df.head(10)

In [None]:
# remove possible html tags
from reviews.preprocess import strip_html

# Only non-numeric, non-datetime columns are converted to strings and cleaned.
# Casting every column to string would also turn numeric columns into text,
# and the later per-product / per-brand `.mean()` aggregations on `overall`
# and `vote` need real numbers to produce meaningful results.
for attr in data_df.columns:
    column = data_df[attr]
    if pd.api.types.is_numeric_dtype(column) or pd.api.types.is_datetime64_any_dtype(column):
        continue
    data_df[attr] = column.astype("string").apply(strip_html)

In [None]:
rgx = re.compile(r'\'(.*?)\'')


def transform_categories(text):
    """Flatten the string form of a list of strings into a single string.

    Square brackets are removed, every single-quoted item is replaced by its
    unquoted content, the result is split on commas, empty pieces are
    discarded and the remaining pieces are joined back with " ,".

    Args:
        text: string representation of a list, e.g. "['a', 'b']".

    Returns:
        The flattened string, e.g. "a , b"; an empty string when no
        non-empty piece is left.
    """
    without_brackets = text.replace("[", "").replace("]", "")
    unquoted = rgx.sub(r'\g<1>', without_brackets)
    # filter(None, ...) discards the empty strings produced by consecutive commas
    pieces = filter(None, unquoted.split(","))
    return " ,".join(pieces)

# transform category and description from list of string to a string
data_df["category"] = data_df["category"].transform(transform_categories)
data_df["description"] = data_df["description"].transform(transform_categories)

# convert price to float
# Prices that are empty or longer than 20 characters (broken html) are invalid
# and are replaced by the sentinel '-1'.
# The assignment goes through `data_df.loc[mask, "price"]`: assigning through
# `data_df["price"].loc[mask]` is chained assignment, which may write to a
# temporary copy and leave `data_df` unchanged.
price_length = data_df["price"].astype("string").str.len()
data_df.loc[price_length > 20, "price"] = '-1'
data_df.loc[price_length == 0, "price"] = '-1'

# regex=False removes '$' and ',' literally, independent of the default value
# of `regex` in the installed pandas version.
# note: this conversion is not really needed, the price column is dropped later
data_df["price"] = (
    data_df["price"]
    .astype("string")
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype("float")
)

# transform to string
# (the conversion of `date` is not really needed either)
for column in ("title", "date", "asin", "text", "summary"):
    data_df[column] = data_df[column].astype("string")

# transform to category
for column in ("category", "brand", "main_cat"):
    data_df[column] = data_df[column].astype("category")

In [None]:
# discard rows that are exact duplicates of an earlier row
data_df = data_df.drop_duplicates()

In [None]:
# count empty attributes

# rows whose product attribute is empty (price uses the -1 sentinel)
empty_category = data_df.loc[data_df["category"] == '']
empty_brand = data_df.loc[data_df["brand"] == '']
empty_main_cat = data_df.loc[data_df["main_cat"] == '']
empty_description = data_df.loc[data_df["description"].str.len() == 0]
empty_title = data_df.loc[data_df["title"].str.len() == 0]
empty_date = data_df.loc[data_df["date"].str.len() == 0]
empty_price = data_df.loc[data_df["price"] == -1]
empty_asin = data_df.loc[data_df["asin"].str.len() == 0]

# rows whose review attribute is empty
empty_overall = data_df.loc[data_df["overall"] == '']
empty_vote = data_df.loc[data_df["vote"] == '']
empty_timestamps = data_df.loc[data_df["timestamp"] == '']
empty_text = data_df.loc[data_df["text"].str.len() == 0]
empty_summary = data_df.loc[data_df["summary"].str.len() == 0]

# Every selection is keyed by the column it was computed from. Pairing a
# hand-ordered list of counts positionally with `data_df.columns` attaches
# counts to the wrong column as soon as the two orders differ.
empty_rows_by_column = {
    "category": empty_category,
    "description": empty_description,
    "title": empty_title,
    "brand": empty_brand,
    "main_cat": empty_main_cat,
    "date": empty_date,
    "price": empty_price,
    "asin": empty_asin,
    "overall": empty_overall,
    "vote": empty_vote,
    "timestamp": empty_timestamps,
    "text": empty_text,
    "summary": empty_summary,
}

# report the counts in the DataFrame's own column order
counted_columns = [col for col in data_df.columns if col in empty_rows_by_column]
counts_empty_fields = [len(empty_rows_by_column[col]) for col in counted_columns]

print(dict(zip(counted_columns, counts_empty_fields)))

In [None]:
# the price column is not used any further: remove it
data_df = data_df.drop(columns='price')

# keep only the rows where brand, main_cat and date are all non-empty
for required_column in ("brand", "main_cat", "date"):
    data_df = data_df.loc[data_df[required_column] != '']

In [None]:
# count empty attributes

# rows whose product attribute is empty
empty_category = data_df.loc[data_df["category"] == '']
empty_brand = data_df.loc[data_df["brand"] == '']
empty_main_cat = data_df.loc[data_df["main_cat"] == '']
empty_description = data_df.loc[data_df["description"].str.len() == 0]
empty_title = data_df.loc[data_df["title"].str.len() == 0]
empty_date = data_df.loc[data_df["date"].str.len() == 0]
empty_asin = data_df.loc[data_df["asin"].str.len() == 0]

# rows whose review attribute is empty
empty_overall = data_df.loc[data_df["overall"] == '']
empty_vote = data_df.loc[data_df["vote"] == '']
empty_timestamps = data_df.loc[data_df["timestamp"] == '']
empty_text = data_df.loc[data_df["text"].str.len() == 0]
empty_summary = data_df.loc[data_df["summary"].str.len() == 0]

# Every selection is keyed by the column it was computed from. Pairing a
# hand-ordered list of counts positionally with `data_df.columns` attaches
# counts to the wrong column as soon as the two orders differ.
empty_rows_by_column = {
    "category": empty_category,
    "description": empty_description,
    "title": empty_title,
    "brand": empty_brand,
    "main_cat": empty_main_cat,
    "date": empty_date,
    "asin": empty_asin,
    "overall": empty_overall,
    "vote": empty_vote,
    "timestamp": empty_timestamps,
    "text": empty_text,
    "summary": empty_summary,
}

# report the counts in the DataFrame's own column order
counted_columns = [col for col in data_df.columns if col in empty_rows_by_column]
counts_empty_fields = [len(empty_rows_by_column[col]) for col in counted_columns]

print(dict(zip(counted_columns, counts_empty_fields)))

In [None]:
data_df.head(10)

In [None]:
# number of reviews per product(ASIN): the 50 most reviewed products
# column 0 holds the asin, column 1 the number of reviews
data = pd.DataFrame(Counter(data_df['asin'].tolist()).most_common(50))
_, ax = plt.subplots(figsize=(40, 7))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.barplot(x=data[0], y=data[1], ax=ax)
# rotate the tick labels once the bars (and therefore the ticks) exist
plt.xticks(rotation=45)

# average number of reviews
# The mean is taken directly on the counts Series: wrapping it in a DataFrame
# and selecting the 'asin' column depends on how the pandas version names the
# value_counts result.
reviews_per_product = data_df['asin'].value_counts()
print(reviews_per_product.mean())

# Overall

In [None]:
# share of reviews for each number of stars
overall_share = data_df['overall'].value_counts(normalize=True)
overall_share.plot.bar()

pass

In [None]:
# mean of overall for every product (asin), computed once and reused below
overall_mean_by_asin = data_df.groupby("asin")["overall"].mean()

# smallest and largest per-product mean
print(overall_mean_by_asin.min())
print(overall_mean_by_asin.max())

# all per-product means
print(overall_mean_by_asin)

# Vote

In [None]:
# count of helpful vote number: the 40 most frequent vote values
# (Counter, sns and plt are imported in the first cell of the notebook)
# NOTE(review): `normalize` is not used in this cell and looks like an
# accidental auto-import; confirm nothing else needs it, then delete it.
from locale import normalize

# column 0 holds the vote value, column 1 the number of reviews with that value
data = pd.DataFrame(Counter(data_df['vote'].tolist()).most_common(40))
_, ax = plt.subplots(figsize=(25, 5))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.barplot(x=data[0], y=data[1], ax=ax)

pass

In [None]:
# mean of vote for every product (asin), computed once and reused below
vote_mean_by_asin = data_df.groupby("asin")["vote"].mean()

# smallest and largest per-product mean
print(vote_mean_by_asin.min())
print(vote_mean_by_asin.max())

# all per-product means
print(vote_mean_by_asin)

# Brand Reviewed

### Most Reviewed Brand

In [None]:
# the 20 brands with the most reviews
# column 0 holds the brand, column 1 the number of reviews
data = pd.DataFrame(Counter(data_df['brand'].tolist()).most_common(20))
_, ax = plt.subplots(figsize=(25, 5))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.barplot(x=data[0], y=data[1], ax=ax)

pass

In [None]:
# word cloud of the brands, weighted by the number of products of each brand
c = Counter(prod_df['brand'])
w = WordCloud(background_color="white", width=800, height=400)
w.fit_words(c)
w.to_image()

### Mean Overall per Brand

In [None]:
# mean of overall for every brand, printed in ascending order
overall_mean_by_brand = data_df.groupby("brand")["overall"].mean()
print(overall_mean_by_brand.sort_values())

### Mean Vote per Brand

In [None]:
# mean number of helpful votes for every brand, printed in ascending order
vote_mean_by_brand = data_df.groupby("brand")["vote"].mean()
print(vote_mean_by_brand.sort_values())

# Review Time Series

In [None]:
# histogram of the timestamp values of the rows whose brand is Canon
canon_timestamps = data_df.loc[data_df['brand'] == 'Canon', 'timestamp']
_, ax = plt.subplots(figsize=(25, 5))
sns.histplot(canon_timestamps, ax=ax)

pass

In [None]:
# histogram of the timestamp values of the rows whose brand is Nikon
nikon_timestamps = data_df.loc[data_df['brand'] == 'Nikon', 'timestamp']
_, ax = plt.subplots(figsize=(25, 5))
sns.histplot(nikon_timestamps, ax=ax)
pass

In [None]:
# histogram of the timestamp values of the rows whose brand is Sony
sony_timestamps = data_df.loc[data_df['brand'] == 'Sony', 'timestamp']
_, ax = plt.subplots(figsize=(25, 5))
sns.histplot(sony_timestamps, ax=ax)
pass

# Review Text and Summary

### Review Language Detection

In [None]:
# keep eng only reviews
# NOTE(review): this cell only stores a language prediction for every review;
# no rows are filtered out here.

from reviews.lang_identification import LanguageIdentification
lang_detector = LanguageIdentification()

# One prediction per review summary, stored in the row it belongs to.
# Assigning `data_df['lang'] = prediction` inside a loop overwrites the whole
# column on every iteration, so every row ends up with the prediction of the
# last summary.
data_df['lang'] = [
    lang_detector.predict_lang(x) for x in data_df['summary'].tolist()
]

### Summary

In [None]:
data_df['summary'].head(10)

In [None]:
# most common tokens summary

# flat list of every token that preprocess yields for every review summary
tokens = [
    token
    for x in data_df["summary"].tolist()
    for token in preprocess(x, sentences=False)
]

# the 20 most frequent tokens: column 0 holds the token, column 1 its count
data = pd.DataFrame(Counter(tokens).most_common(20))
_, ax = plt.subplots(figsize=(25, 5))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.barplot(x=data[0], y=data[1], ax=ax)

pass

In [None]:
# word cloud of the summary tokens, weighted by their frequency
c = Counter(tokens)
w = WordCloud(background_color="white", width=800, height=400)
w.fit_words(c)
w.to_image()

### Text

In [None]:
data_df['text'].head(10)

In [None]:
# most common tokens text

# flat list of every token of every sentence that preprocess yields for every
# review text
tokens = [
    token
    for x in data_df["text"].tolist()
    for sentence in preprocess(x, sentences=True)
    for token in sentence
]

# the 20 most frequent tokens: column 0 holds the token, column 1 its count
data = pd.DataFrame(Counter(tokens).most_common(20))
_, ax = plt.subplots(figsize=(25, 5))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.barplot(x=data[0], y=data[1], ax=ax)

pass

In [None]:
# word cloud of the text tokens, weighted by their frequency
c = Counter(tokens)
w = WordCloud(background_color="white", width=800, height=400)
w.fit_words(c)
w.to_image()