# Products Analysis

In [None]:
import pandas as pd
from reviews.config import processed_data_dir

# Load the product metadata from the gzip-compressed JSON file kept in the
# processed-data directory.
meta_path = processed_data_dir / "meta_digital_cameras.json.gz"
prod_df = pd.read_json(meta_path)

In [None]:
# Print the column names, dtypes, non-null counts and memory usage of the raw frame.
prod_df.info()

In [None]:
# Preview the first 10 rows as loaded, before any cleaning.
prod_df.head(10)

In [None]:
# remove possible html tags
from reviews.preprocess import strip_html


def _strip_html_cell(cell):
    """Strip HTML from a single cell of a column that holds lists.

    List/tuple cells are cleaned element by element and returned as a list of
    strings, so that code which later joins the items still receives a list.
    Missing scalars are returned unchanged; any other scalar is converted to
    ``str`` and cleaned.
    """
    if isinstance(cell, (list, tuple)):
        return [strip_html(str(item)) for item in cell]
    if pd.isna(cell):
        return cell
    return strip_html(str(cell))


for attr in prod_df.columns:
    column = prod_df[attr]
    holds_lists = column.map(lambda cell: isinstance(cell, (list, tuple))).any()
    if holds_lists:
        # astype("string") would turn a list into its repr ("['a', 'b']");
        # a later ' '.join(...) over that text would then put a space between
        # every character instead of joining the list items.
        prod_df[attr] = column.map(_strip_html_cell)
    else:
        prod_df[attr] = column.astype("string").apply(strip_html)

In [None]:
# collapse category and description into single space-separated strings
for list_col in ("category", "description"):
    joined = prod_df[list_col].transform(lambda parts: ' '.join(parts))
    prod_df[list_col] = joined.astype("string")

# convert price to float
# prices that are empty or longer than 20 characters (html broken) are invalid
# and are flagged with the sentinel -1
price_len = prod_df["price"].astype("string").str.len()
# a missing price has no length; keep it out of the mask so it stays missing
invalid_price = ((price_len > 20) | (price_len == 0)).fillna(False)
# assign through a single .loc on the frame: the chained form
# prod_df["price"].loc[...] = ... may write to a temporary copy and never
# reach prod_df
prod_df.loc[invalid_price, "price"] = '-1'
# regex=False: '$' has to be removed as a literal character, not interpreted
# as the end-of-string anchor
prod_df["price"] = (
    prod_df["price"]
    .astype("string")
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype("float")
)

# free-text and identifier columns use the nullable string dtype
for text_col in ("title", "date", "asin"):
    prod_df[text_col] = prod_df[text_col].astype("string")

# columns with repeated labels use the category dtype
for label_col in ("category", "brand", "main_cat"):
    prod_df[label_col] = prod_df[label_col].astype("category")

In [None]:
# drop rows that repeat an earlier row in every column, keeping the first one
prod_df.drop_duplicates(keep="first", inplace=True)

In [None]:
# count empty attributes

# categorical columns: empty means the empty-string label
empty_category = prod_df.loc[prod_df["category"] == '']
empty_brand = prod_df.loc[prod_df["brand"] == '']
empty_main_cat = prod_df.loc[prod_df["main_cat"] == '']

# string columns: empty means zero length; price uses the -1 sentinel
empty_description = prod_df.loc[prod_df["description"].str.len() == 0]
empty_title = prod_df.loc[prod_df["title"].str.len() == 0]
empty_date = prod_df.loc[prod_df["date"].str.len() == 0]
empty_price = prod_df.loc[prod_df["price"] == -1]
empty_asin = prod_df.loc[prod_df["asin"].str.len() == 0]

# Pair every count with an explicit field name. Labelling by position through
# prod_df.columns would silently mislabel the counts if the frame's column
# order differed from this list or if it held additional columns.
empty_by_field = {
    "category": empty_category,
    "description": empty_description,
    "title": empty_title,
    "brand": empty_brand,
    "main_cat": empty_main_cat,
    "date": empty_date,
    "price": empty_price,
    "asin": empty_asin,
}

counts_empty_fields = [len(rows) for rows in empty_by_field.values()]

print(dict(zip(empty_by_field, counts_empty_fields)))

In [None]:
# discard the price column
prod_df.drop(columns=['price'], inplace=True)

# keep only the rows whose brand, main_cat and date are all non-empty
for required_col in ("brand", "main_cat", "date"):
    prod_df = prod_df.loc[prod_df[required_col] != '']

In [None]:
# count empty attributes
# categorical columns: empty means the empty-string label
empty_category = prod_df.loc[prod_df["category"] == '']
empty_brand = prod_df.loc[prod_df["brand"] == '']
empty_main_cat = prod_df.loc[prod_df["main_cat"] == '']

# string columns: empty means zero length
empty_description = prod_df.loc[prod_df["description"].str.len() == 0]
empty_title = prod_df.loc[prod_df["title"].str.len() == 0]
empty_date = prod_df.loc[prod_df["date"].str.len() == 0]
empty_asin = prod_df.loc[prod_df["asin"].str.len() == 0]

# Pair every count with an explicit field name. Labelling by position through
# prod_df.columns would silently mislabel the counts if the frame's column
# order differed from this list or if it held additional columns.
empty_by_field = {
    "category": empty_category,
    "description": empty_description,
    "title": empty_title,
    "brand": empty_brand,
    "main_cat": empty_main_cat,
    "date": empty_date,
    "asin": empty_asin,
}

counts_empty_fields = [len(rows) for rows in empty_by_field.values()]

print(dict(zip(empty_by_field, counts_empty_fields)))

In [None]:
# Preview the first 20 rows of the frame at this point in the notebook.
prod_df.head(20)

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

# Word cloud of brands, sized by how many rows carry each brand.
brand_frequencies = Counter(prod_df['brand'].tolist())
brand_cloud = WordCloud(width=800, height=400, background_color="white")
brand_cloud.fit_words(brand_frequencies)
brand_cloud.to_image()

In [None]:
# Number of products per main category.
# NOTE(review): the cast to "string" presumably keeps categories that no longer
# occur after the row filtering from showing up with a count of 0 — confirm.
prod_df['main_cat'].astype('string').value_counts()

In [None]:
# Number of products per (joined) category string.
# NOTE(review): the cast to "string" presumably keeps categories that no longer
# occur after the row filtering from showing up with a count of 0 — confirm.
prod_df['category'].astype('string').value_counts()