In [1]:
import json
from pathlib import Path
from typing import Dict

# Unzip Data

In [2]:
!unzip ./modcloth_data.json.zip
!unzip ./renttherunway_data_1.json.zip
!unzip ./renttherunway_data_2.json.zip

Archive:  ./modcloth_data.json.zip
  inflating: modcloth_data.json      
  inflating: __MACOSX/._modcloth_data.json  
Archive:  ./renttherunway_data_1.json.zip
  inflating: renttherunway_data_1.json  
  inflating: __MACOSX/._renttherunway_data_1.json  
Archive:  ./renttherunway_data_2.json.zip
  inflating: renttherunway_data_2.json  


In [3]:
RUNTHERUNWAY_1_FILE_PATH = Path('./renttherunway_data_1.json')
RUNTHERUNWAY_2_FILE_PATH = Path('./renttherunway_data_2.json')
MODCLOTH_FILE_PATH = Path('./modcloth_data.json')

# Load Runtherunway

In [4]:
runtherunway = {'data': {}, 'features': set()}
with open(RUNTHERUNWAY_1_FILE_PATH, 'r') as infile:
    for line in infile:
        review = json.loads(line)
        item_id = review['item_id']
        if item_id in runtherunway['data']:
            runtherunway['data'][item_id].append(review)
        else:
            runtherunway['data'][item_id] = [review]

        for key in review.keys():
            runtherunway['features'].add(key)

with open(RUNTHERUNWAY_2_FILE_PATH, 'r') as infile:
    for line in infile:
        review = json.loads(line)
        item_id = review['item_id']
        if item_id in runtherunway['data']:
            runtherunway['data'][item_id].append(review)
        else:
            runtherunway['data'][item_id] = [review]

        for key in review.keys():
            runtherunway['features'].add(key)

# Load Modcloth

In [5]:
modcloth = {'data': {}, 'features': set()}
with open(MODCLOTH_FILE_PATH, 'r') as infile:
    for row in infile:
        review = json.loads(row)
        item_id = review['item_id']
        if item_id in modcloth['data']:
            modcloth['data'][item_id].append(review)
        else:
            modcloth['data'][item_id] = [review]

        for key in review.keys():
            modcloth['features'].add(key)

# Data Features

In [6]:
common_features = runtherunway['features'].intersection(modcloth['features'])
print(f"runtherunway_features = {runtherunway['features']} {len(runtherunway['features'])}\n")
print(f"modcloth_features = {modcloth['features']} {len(modcloth['features'])}\n")
print(f"common_features = {common_features} {len(common_features)}")

runtherunway_features = {'bust size', 'age', 'item_id', 'rented for', 'user_id', 'category', 'review_text', 'review_date', 'review_summary', 'height', 'weight', 'fit', 'size', 'rating', 'body type'} 15

modcloth_features = {'cup size', 'item_id', 'bust', 'quality', 'user_id', 'category', 'review_text', 'fit', 'shoe width', 'waist', 'shoe size', 'review_summary', 'height', 'user_name', 'length', 'bra size', 'size', 'hips'} 18

common_features = {'item_id', 'user_id', 'category', 'review_text', 'review_summary', 'height', 'fit', 'size'} 8


# Data Count - Items and Reviews

In [7]:
runtherunway_reviews_count = 0
for item_id, reviews in runtherunway['data'].items():
    runtherunway_reviews_count += len(reviews)

modcloth_reviews_count = 0
for item_id, reviews in modcloth['data'].items():
    modcloth_reviews_count += len(reviews)

modcloth_items_count = len(modcloth['data'].keys())
runtherunway_items_count = len(runtherunway['data'].keys())
print(f'total items = {runtherunway_items_count} + {modcloth_items_count} = {modcloth_items_count + runtherunway_items_count}')
print(f'total reviews = {runtherunway_reviews_count} + {modcloth_reviews_count} = {runtherunway_reviews_count} + {modcloth_reviews_count}')

total items = 5850 + 1378 = 7228
total reviews = 192544 + 82790 = 192544 + 82790


# TODO - Types of 'fit' (Classifications)

# TODO - Does user's 'height' influence 'fit'?

# TODO - Does item's 'size' influence 'fit'?

# TODO - Does user's 'review_text' + 'review_summary' influence 'fit'?

# TODO - (Runtherunway) Does user's 'age' or 'height' affect 'fit'?

# TODO - (Modcloth) Does user's 'waist' or 'hips' affect 'fit'?