In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import collections
from scipy import stats

%matplotlib inline

In [None]:
# Examining Product Metadata

In [None]:
PRODUCTS_FILEPATH = 'data/products.csv'

# Maps product id -> product record (the decoded JSON object).
id_to_product = {}

# Despite the .csv extension, every line is parsed as a JSON array of product
# objects. The encoding is pinned to UTF-8 so that decoding does not depend on
# the platform's default locale.
with open(PRODUCTS_FILEPATH, 'r', encoding='utf-8') as f:
    for i, row in enumerate(f):
        if i % 500 == 0:
            print(i)  # coarse progress indicator, one line per 500 rows
        if not row.strip():
            # Blank lines (e.g. a trailing newline at the end of the file) are
            # not valid JSON; skip them instead of raising JSONDecodeError.
            continue
        for product in json.loads(row):
            # A later occurrence of the same id overwrites the earlier record.
            id_to_product[product['id']] = product

In [None]:
# Every product id that was loaded.
ids = list(id_to_product)

# Category numId -> category record, gathered from all products. When a numId
# shows up more than once, the record seen last is the one that is kept.
id_to_category = {
    category['numId']: category
    for product in id_to_product.values()
    for category in product['categories']
}

# The categories this analysis focuses on.
category_ids_of_interest = {
    70,  # bracelets
    73,  # necklaces
    75,  # rings
    72,  # earrings
}

# Per category of interest: how many products it has, and which product ids.
category_to_count = collections.Counter()
category_to_products = collections.defaultdict(set)

# `i` tallies the products that are left out below because they belong to
# none, or to more than one, of the categories of interest.
i = 0
for product in id_to_product.values():
    product_category_ids = {entry['numId'] for entry in product['categories']}
    matches = product_category_ids & category_ids_of_interest
    if len(matches) == 1:
        # Exactly one match: unpack it and attribute the product to it.
        (category_id,) = matches
        category_to_count[category_id] += 1
        category_to_products[category_id].add(product['id'])
    else:
        i += 1
print(category_to_count)


# for category in id_to_category.values():
#     print(category)
        

In [None]:
# Tabular view of the products: one row per key of id_to_product (the product
# id becomes the index).
items = pd.DataFrame.from_dict(
    data=id_to_product,
    orient='index',
)
items.info()

In [None]:
def split_category(item):
    """Return the ``numId`` of the first category in ``item``.

    Args:
        item: An iterable of category mappings, each with a ``'numId'`` key.
            Non-iterable values (for example ``None`` or a float NaN standing
            in for a missing value) are accepted and treated as "no category".

    Returns:
        The ``'numId'`` of the first category, or ``None`` when ``item`` is
        empty or is not iterable. Categories after the first are ignored.
    """
    try:
        categories = iter(item)
    except TypeError:
        # A missing cell must not abort a whole Series.apply() run.
        return None
    first_category = next(categories, None)
    if first_category is None:
        return None
    return first_category['numId']
    
# Derive one category id per product by running split_category over each
# row's `categories` value, and store it as a new column on `items`.
category_id_per_item = items['categories'].apply(split_category)
items['category_numID'] = category_id_per_item
items.head()

In [None]:
# Distribution of prices over all products.
prices = items['price']

fig, ax = plt.subplots()
ax.hist(prices, bins=100)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    title='Distribution of product prices',
    xlabel='price',
    ylabel='number of products',
)

# Series.min()/max() are vectorised and skip missing values. The builtin
# min()/max() iterate element by element and give order-dependent results as
# soon as a NaN is present.
print(f'Minimum Price: {prices.min()}')
print(f'Maximum Price: {prices.max()}')

In [None]:
# Keep only the rows whose category_numID is one of the categories of
# interest, then compare their price distributions side by side.
in_category_of_interest = items['category_numID'].isin(category_ids_of_interest)
items_of_interest = items.loc[in_category_of_interest]
items_of_interest.boxplot(column='price', by='category_numID')

In [None]:
# Price histogram per category (one panel for each category_numID).
items_of_interest['price'].hist(by=items_of_interest['category_numID'], bins=100)

# Per-category summary statistics of price. A notebook cell only renders the
# value of its final expression, so the table is named and placed last;
# as a bare statement in the middle of the cell it was computed and discarded.
price_summary = (
    items_of_interest
    .groupby('category_numID')['price']
    .agg(['median', 'mean', 'min', 'max'])
)
price_summary

In [None]:
# One-way ANOVA on price, with one sample per category id listed here.
anova_category_ids = (70, 72, 73, 75)
category_of_item = items_of_interest['category_numID']
price_samples = [
    items_of_interest['price'][category_of_item == category_id]
    for category_id in anova_category_ids
]
F, p = stats.f_oneway(*price_samples)
print(f'F-stat = {F}; p-val = {p}')