In [39]:
# global imports and helpers
import re
import gzip
import json
import pandas as pd
import numpy as np
from collections import defaultdict

### Load JSON Datasets into Pandas Dataframe

In [25]:
# data loading helper functions
def load_json_gz(path):
    rows = []
    with gzip.open(path, "rt") as f:
        for line in f:
            rows.append(json.loads(line))
    return pd.DataFrame(rows)

In [30]:
# ModCloth dataset
modcloth = load_json_gz("data/modcloth_final_data.json.gz")

# RentTheRunway dataset
rtr = load_json_gz("data/renttherunway_final_data.json.gz")

In [35]:
# add dataset column to both df
rtr["dataset"] = "renttherunway"
modcloth["dataset"] = "modcloth"

In [33]:
rtr.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date,dataset
0,fit,420272,34d,2260466,137lbs,10,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28,"April 20, 2016",renttherunway
1,fit,273551,34b,153475,132lbs,10,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36,"June 18, 2013",renttherunway
2,fit,360448,,1063761,,10,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116,"December 14, 2015",renttherunway
3,fit,909926,34c,126335,135lbs,8,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34,"February 12, 2014",renttherunway
4,fit,151944,34b,616682,145lbs,10,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27,"September 26, 2016",renttherunway


In [59]:
len(rtr)

192544

In [34]:
modcloth.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text,dataset
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,Emily,just right,small,991571,,,,,modcloth
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,,modcloth
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,,modcloth
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,,modcloth
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,,modcloth


In [60]:
len(modcloth)

82790

### Data Cleaning

In [40]:
# standardize height
def parse_height(h):
    if not isinstance(h, str):
        return None
    match = re.match(r"(\d+)'?\s*(\d+)", h)
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2))
        return feet * 12 + inches
    return None

rtr["height_inches"] = rtr["height"].apply(parse_height)
modcloth["height_inches"] = modcloth["height"].apply(parse_height)

In [43]:
# convert weight: "137lbs" -> 137
def parse_weight(w):
    if not isinstance(w, str):
        return None
    return int(w.replace("lbs", "").strip())

rtr["weight_lbs"] = rtr["weight"].apply(parse_weight)

In [48]:
#Convert bust size: "34d" -> band=34, cup=d
def parse_bust(b):
    if not isinstance(b, str):
        return None
    match = re.match(r"(\d+)([a-zA-Z]+)", b)
    if match:
        return int(match.group(1)), match.group(2).lower()
    return None

rtr["bust_band"] = rtr["bust size"].apply(lambda x: parse_bust(x)[0] if parse_bust(x) else None)
rtr["bust_cup"] = rtr["bust size"].apply(lambda x: parse_bust(x)[1] if parse_bust(x) else None)

# rename Modcloth columns to match
modcloth = modcloth.rename(columns={"cup size": "bust_cup", "bra size":"bust_band"})

In [50]:
# convert fit labels to a binary
def label_fit(x):
    if x == "fit":
        return 1
    else:
        return 0

rtr["fit_label"] = rtr["fit"].apply(label_fit)
modcloth["fit_label"] = modcloth["fit"].apply(label_fit)

#### Select shared & cleaned fields

In [51]:
modcloth.columns

Index(['item_id', 'waist', 'size', 'quality', 'bust_cup', 'hips', 'bust_band',
       'category', 'bust', 'height', 'user_name', 'length', 'fit', 'user_id',
       'shoe size', 'shoe width', 'review_summary', 'review_text', 'dataset',
       'height_inches', 'fit_label'],
      dtype='object')

In [52]:
rtr.columns

Index(['fit', 'user_id', 'bust size', 'item_id', 'weight', 'rating',
       'rented for', 'review_text', 'body type', 'review_summary', 'category',
       'height', 'size', 'age', 'review_date', 'dataset', 'height_inches',
       'weight_lbs', 'bust_band', 'bust_cup', 'fit_label'],
      dtype='object')

In [57]:
# combine the cleaned datasets

shared_columns = [
    "height_inches",
    "bust_band",
    "bust_cup",
    "fit_label",
    "review_text",
    "dataset" 
]

rtr_clean = rtr[shared_columns]
mod_clean = modcloth[shared_columns]

combined = pd.concat([rtr_clean, mod_clean], ignore_index=True)

  combined = pd.concat([rtr_clean, mod_clean], ignore_index=True)


In [58]:
combined

Unnamed: 0,height_inches,bust_band,bust_cup,fit_label,review_text,dataset
0,68.0,34.0,d,1,An adorable romper! Belt and zipper were a lit...,renttherunway
1,66.0,34.0,b,1,I rented this dress for a photo shoot. The the...,renttherunway
2,64.0,,,1,This hugged in all the right places! It was a ...,renttherunway
3,65.0,34.0,c,1,I rented this for my company's black tie award...,renttherunway
4,69.0,34.0,b,1,I have always been petite in my upper body and...,renttherunway
...,...,...,...,...,...,...
275329,,36,b,1,Cute jacket!,modcloth
275330,,34,ddd/f,0,It's a beautiful jacket. I love how it's knit ...,modcloth
275331,,32,dddd/g,1,I love this blazer. It is a great office piece...,modcloth
275332,,,,1,I love this blazer!! I wore it yesterday and g...,modcloth
