In [13]:
##################################################
# Imports & utilities                            #
##################################################

import gzip
import json
import random
from collections import defaultdict
import math
import string

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [14]:
##################################################
# Load dataset                                   #
##################################################


def read_rtr(path):
    with gzip.open(path, "rt") as f:
        for line in f:
            yield json.loads(line)


DATA_PATH = "renttherunway_final_data.json.gz"

data = list(read_rtr(DATA_PATH))
print("Total records:", len(data))

Total records: 192544


In [15]:
##################################################
# Train / validation split                       #
##################################################

indices = list(range(len(data)))
random.shuffle(indices)
split = int(0.8 * len(indices))
train_idx = set(indices[:split])
valid_idx = set(indices[split:])

train_data = [data[i] for i in train_idx]
valid_data = [data[i] for i in valid_idx]

print("Train size:", len(train_data))
print("Valid size:", len(valid_data))

Train size: 154035
Valid size: 38509


In [16]:
##################################################
# Shared helpers                                 #
##################################################


def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer / denom
    return 0.0

In [17]:
##################################################
# Shared helpers                                 #
##################################################


def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer / denom
    return 0.0

Rating Prediction  
* we must regularize a (destabilizes β updates)

In [18]:
#########################
# 1. Rating prediction  #
#########################
# r ≈ alpha + beta_u + beta_i  (user/item bias model)


# extract ratings from dataset and build rating triples (u, i, r) from data
def extract_ratings(dataset):
    ratings = []
    for d in dataset:
        if "rating" in d and d["rating"] not in (None, "", "nan"):
            try:
                r = float(d["rating"])
            except ValueError:
                continue
            u = d["user_id"]
            i = d["item_id"]
            ratings.append((u, i, r))
    return ratings


ratingsTrain = extract_ratings(train_data)
ratingsValid = extract_ratings(valid_data)

print("Train ratings:", len(ratingsTrain))
print("Valid ratings:", len(ratingsValid))

# Build per-user / per-item maps
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u, b, r in ratingsTrain:
    ratingsPerUser[u].append((b, r))
    ratingsPerItem[b].append((u, r))

Train ratings: 153976
Valid ratings: 38486


In [19]:
# getting the global average rating
def getGlobalAverage(trainRatings):
    return sum(r for (_, _, r) in trainRatings) / len(trainRatings)


######################
# Bias Model Updates #
######################


# improving alpha update with regularization
def alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb):
    newAlpha = 0.0
    for u, b, r in ratingsTrain:
        newAlpha += r - (betaU.get(u, 0.0) + betaI.get(b, 0.0))
    return newAlpha / (len(ratingsTrain) + lamb)


def betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb):
    newBetaU = {}
    for u in ratingsPerUser:
        num = 0.0
        for b, r in ratingsPerUser[u]:
            num += r - (alpha + betaI.get(b, 0.0))
        newBetaU[u] = num / (lamb + len(ratingsPerUser[u]))
    return newBetaU


def betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb):
    newBetaI = {}
    for b in ratingsPerItem:
        num = 0.0
        for u, r in ratingsPerItem[b]:
            num += r - (alpha + betaU.get(u, 0.0))
        newBetaI[b] = num / (lamb + len(ratingsPerItem[b]))
    return newBetaI


###################
# MSE COMPUTATION #
###################


def msePlusReg(ratingsTrain, alpha, betaU, betaI, lamb):
    mse = 0.0
    for u, b, r in ratingsTrain:
        pred = alpha + betaU.get(u, 0.0) + betaI.get(b, 0.0)
        mse += (r - pred) ** 2
    mse /= len(ratingsTrain)

    # regularization penalty
    reg = sum(b**2 for b in betaU.values()) + sum(b**2 for b in betaI.values())
    return mse, mse + lamb * reg


def validMSE(ratingsValid, alpha, betaU, betaI):
    mse = 0.0
    for u, b, r in ratingsValid:
        pred = alpha + betaU.get(u, 0.0) + betaI.get(b, 0.0)
        mse += (r - pred) ** 2
    mse /= len(ratingsValid)
    return mse


##################
# TRAINING LOOP  #
##################


def train_bias_model(ratingsTrain, ratingsPerUser, ratingsPerItem, lamb=1.0, iters=30):
    alpha = getGlobalAverage(ratingsTrain)
    betaU = defaultdict(float)
    betaI = defaultdict(float)

    for _ in range(iters):
        alpha = alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb)

        newBetaU = betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb)
        newBetaI = betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb)

        betaU.update(newBetaU)  # refine existing estimates, not reset
        betaI.update(newBetaI)
    return alpha, betaU, betaI

In [20]:
# Hyperparameter search for best lambda
# tuning lambda:
for lamb in [0.1, 0.3, 1, 3, 10]:
    alpha_tmp, bu_tmp, bi_tmp = train_bias_model(
        ratingsTrain, ratingsPerUser, ratingsPerItem, lamb=lamb, iters=30
    )
    print(
        f"λ={lamb}  Valid MSE={validMSE(ratingsValid, alpha_tmp, bu_tmp, bi_tmp):.4f}"
    )

# --- Pick λ manually after seeing results ---
best_lambda = 10.0

alpha, betaU, betaI = train_bias_model(
    ratingsTrain, ratingsPerUser, ratingsPerItem, lamb=best_lambda, iters=30
)

# final metrics
train_mse, train_obj = msePlusReg(ratingsTrain, alpha, betaU, betaI, lamb=best_lambda)
valid_mse = validMSE(ratingsValid, alpha, betaU, betaI)

print("Rating prediction:")
print("  Train MSE:", train_mse)
print("  Valid MSE:", valid_mse)

λ=0.1  Valid MSE=2.4250
λ=0.3  Valid MSE=2.2754
λ=1  Valid MSE=2.0741
λ=3  Valid MSE=1.9540
λ=10  Valid MSE=1.9187
Rating prediction:
  Train MSE: 1.5613052446946623
  Valid MSE: 1.9186764817339022


In [21]:
# ITS DEFINITELY OPTIONAL


def writePredictionsRating(alpha, betaU, betaI, in_pairs_path, out_path):
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            u, b = l.strip().split(",")
            pred = alpha + betaU.get(u, 0.0) + betaI.get(b, 0.0)
            predictions.write(u + "," + b + "," + str(pred) + "\n")


In [23]:
import string

punctuation = set(string.punctuation)


def clean_text(s):
    if s is None:
        return ""
    s = s.lower()
    return "".join(c for c in s if c not in punctuation)


def get_full_text(d):
    """
    Combine review_text and review_summary into a single cleaned string.
    """
    parts = []
    txt = d.get("review_text")
    summ = d.get("review_summary")  # may be missing

    if txt:
        parts.append(clean_text(txt))
    if summ:
        parts.append(clean_text(summ))

    return " ".join(parts)


In [24]:
##################################################
# Task 2 – Category / Event Prediction       #
#            (text → rented for)                #
##################################################

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import numpy as np

# IMPORTANT: use the actual key in the data
label_field = "rented for"  # key in the JSON
top_k = 10  # keep only the top-k most common labels


def collect_labels(dataset):
    labels = []
    for d in dataset:
        lab = d.get(label_field)
        if lab in (None, "", "nan"):
            continue
        labels.append(lab)
    return labels


# --- 1. Find top-k most common "rented for" labels on train set ---
all_train_labels = collect_labels(train_data)
label_counts = Counter(all_train_labels)

print("Rented-for prediction:")
print("  Total labeled train examples:", len(all_train_labels))
print("  Label counts (top 10):", label_counts.most_common(10))

top_k_labels = [lab for lab, _ in label_counts.most_common(top_k)]
print("  Top", top_k, "labels:", top_k_labels)


def extract_texts_and_rented_for(dataset, allowed_labels):
    texts = []
    labels = []
    for d in dataset:
        lab = d.get(label_field)
        if lab in (None, "", "nan"):
            continue
        if lab not in allowed_labels:  # drop rare labels
            continue
        texts.append(get_full_text(d))  # reuse review_text + summary
        labels.append(lab)
    return texts, np.array(labels)


# --- 2. Build train/valid sets restricted to top-k labels ---

train_texts_rf, y_rf_train = extract_texts_and_rented_for(train_data, top_k_labels)
valid_texts_rf, y_rf_valid = extract_texts_and_rented_for(valid_data, top_k_labels)

print("  Train samples (top-k only):", len(train_texts_rf))
print("  Valid samples (top-k only):", len(valid_texts_rf))

if len(train_texts_rf) == 0:
    raise ValueError(
        "No training samples found for 'rented for'. "
        "Check that label_field matches the key in your data."
    )

print("  Unique rented-for labels (train):", len(np.unique(y_rf_train)))

# --- 3. TF-IDF features (separate vectorizer for this task) ---

rf_vectorizer = TfidfVectorizer(
    max_features=12000,  # a bit richer than 8000
    ngram_range=(1, 2),  # unigrams + bigrams
    min_df=3,  # drop extremely rare tokens
    sublinear_tf=True,
)

X_rf_train = rf_vectorizer.fit_transform(train_texts_rf)
X_rf_valid = rf_vectorizer.transform(valid_texts_rf)

print("  X_rf_train shape:", X_rf_train.shape)

# --- 4. Encode rented-for labels ---

rf_le = LabelEncoder()
all_rf_labels = np.concatenate([y_rf_train, y_rf_valid])
rf_le.fit(all_rf_labels)

y_rf_train_enc = rf_le.transform(y_rf_train)
y_rf_valid_enc = rf_le.transform(y_rf_valid)

# --- 5. Hyperparameter search for best C ---

Cs = [0.5, 1.0, 2.0, 3.0, 5.0]
best_C = None
best_valid_acc = -1.0
best_clf = None

print("\nTuning C:")
for C in Cs:
    clf = LogisticRegression(
        max_iter=300,
        solver="saga",
        n_jobs=-1,
        C=C,
        random_state=0,
    )
    clf.fit(X_rf_train, y_rf_train_enc)
    train_pred = clf.predict(X_rf_train)
    valid_pred = clf.predict(X_rf_valid)

    train_acc = (train_pred == y_rf_train_enc).mean()
    valid_acc = (valid_pred == y_rf_valid_enc).mean()

    print(f"  C={C:<4}  Train={train_acc:.4f}  Valid={valid_acc:.4f}")

    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_C = C
        best_clf = clf

print(f"\nBest C: {best_C}  (valid accuracy = {best_valid_acc:.4f})")

# --- 6. Evaluate best model: accuracy + per-class metrics ---

rf_train_pred = best_clf.predict(X_rf_train)
rf_valid_pred = best_clf.predict(X_rf_valid)

rf_train_acc = (rf_train_pred == y_rf_train_enc).mean()
rf_valid_acc = (rf_valid_pred == y_rf_valid_enc).mean()

print("\nFinal model (rented-for prediction):")
print("  Train accuracy:", rf_train_acc)
print("  Valid accuracy:", rf_valid_acc)

# Decode predictions back to label names for nice reports
y_rf_valid_pred_labels = rf_le.inverse_transform(rf_valid_pred)

print("\nClassification report (valid):")
print(classification_report(y_rf_valid, y_rf_valid_pred_labels, digits=3))


Rented-for prediction:
  Total labeled train examples: 154029
  Label counts (top 10): [('wedding', 46303), ('formal affair', 32269), ('party', 28465), ('everyday', 13535), ('other', 12273), ('work', 12035), ('date', 5906), ('vacation', 3242), ('party: cocktail', 1)]
  Top 10 labels: ['wedding', 'formal affair', 'party', 'everyday', 'other', 'work', 'date', 'vacation', 'party: cocktail']
  Train samples (top-k only): 154029
  Valid samples (top-k only): 38505
  Unique rented-for labels (train): 9
  X_rf_train shape: (154029, 12000)

Tuning C:
  C=0.5   Train=0.6509  Valid=0.5986
  C=1.0   Train=0.6672  Valid=0.5972
  C=2.0   Train=0.6835  Valid=0.5927
  C=3.0   Train=0.6914  Valid=0.5883
  C=5.0   Train=0.7013  Valid=0.5808

Best C: 0.5  (valid accuracy = 0.5986)

Final model (rented-for prediction):
  Train accuracy: 0.6509423550110693
  Valid accuracy: 0.5986235553824178

Classification report (valid):
               precision    recall  f1-score   support

         date      0.677  

In [25]:
def writePredictionsCategory(
    model, words, wordId, wordSet, label_encoder, in_pairs_path, out_path
):
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        pos = 0
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            u, b = l.strip().split(",")
            # You would need to look up the review_text for (u, b) here.
            # Placeholder: empty text → all zeros except bias.
            feat = [0] * len(words) + [1]
            pred_label = model.predict(np.array(feat).reshape(1, -1))[0]
            pred_cat = label_encoder.inverse_transform([pred_label])[0]
            predictions.write(u + "," + b + "," + str(pred_cat) + "\n")
            pos += 1

In [26]:
def get_text_for_category(d):
    """
    Build the text used for both category and fit models.
    Uses review_text and (optionally) review_summary if it exists.
    """
    parts = []
    txt = d.get("review_text")
    summ = d.get("review_summary")  # it's okay if this field doesn't exist

    if txt:
        parts.append(clean_text(txt))
    if summ:
        parts.append(clean_text(summ))

    return " ".join(parts)


In [29]:
##################################################
# 5. Task 3 – Fit Prediction (hybrid features)   #
#    height + weight + size + bodytype           #
#    + bust size + category  → fit               #
##################################################

import re
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, f1_score

# --------- helper parsers for numeric fields ---------


def parse_height_to_inches(s):
    """Convert '5\' 4\"' to inches, or return NaN."""
    if s is None or s in ("", "nan", "Missing value"):
        return np.nan
    try:
        m = re.match(r"(\d+)\s*'\s*(\d*)", s)
        if not m:
            return np.nan
        feet = int(m.group(1))
        inches = int(m.group(2) or 0)
        return feet * 12 + inches
    except Exception:
        return np.nan


def parse_weight_to_lbs(s):
    """Convert '137lbs' → 137."""
    if s is None or s in ("", "nan", "Missing value"):
        return np.nan
    try:
        digits = "".join(ch for ch in s if ch.isdigit())
        return float(digits) if digits else np.nan
    except Exception:
        return np.nan


def safe_float(s):
    if s is None or s in ("", "nan", "Missing value"):
        return np.nan
    try:
        return float(s)
    except Exception:
        return np.nan


# --------- extract hybrid feature set ---------


def extract_fit_features_hybrid(dataset):
    """
    Features:
      numeric: height (in), weight (lbs), size
      categorical: body type, bust size, category
    Label:
      fit ∈ {fit, small, large}
    """
    rows = []
    labels = []

    for d in dataset:
        fit = d.get("fit")
        if fit in (None, "", "nan"):
            continue  # need label

        height_in = parse_height_to_inches(d.get("height"))
        weight_lb = parse_weight_to_lbs(d.get("weight"))
        size_num = safe_float(d.get("size"))

        body_type = d.get("body type") or "Unknown"
        bust_size = d.get("bust size") or "Unknown"
        category = d.get("category") or "Unknown"

        rows.append([height_in, weight_lb, size_num, body_type, bust_size, category])
        labels.append(fit)

    return np.array(rows, dtype=object), np.array(labels)


# Build train/valid sets
X_fit_train, y_fit_train = extract_fit_features_hybrid(train_data)
X_fit_valid, y_fit_valid = extract_fit_features_hybrid(valid_data)

print("Hybrid fit prediction (h, w, size, body type, bust size, category):")
print("  Train samples:", len(X_fit_train))
print("  Valid samples:", len(X_fit_valid))

# column indices
numeric_cols = [0, 1, 2]  # height_in, weight_lb, size_num
categorical_cols = [3, 4, 5]  # body_type, bust_size, category

# Preprocessor: impute missing + one-hot encode categorical
preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numeric_cols),
        (
            "cat",
            Pipeline(
                [
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical_cols,
        ),
    ]
)

# Encode fit labels
fit_le = LabelEncoder()
y_fit_train_enc = fit_le.fit_transform(y_fit_train)
y_fit_valid_enc = fit_le.transform(y_fit_valid)

print("  Fit classes:", fit_le.classes_)

# Baseline: always predict majority class
(unique, counts) = np.unique(y_fit_train, return_counts=True)
majority_label = unique[np.argmax(counts)]
baseline_acc = (y_fit_valid == majority_label).mean()
print(f"  Baseline (always '{majority_label}') valid accuracy: {baseline_acc:.4f}")

# --------- Hyperparameter search for best C (accuracy-focused) ---------

Cs = [0.5, 1.0, 2.0, 3.0, 5.0]
best_C = None
best_valid_acc = -1.0
best_model = None
best_macro_f1 = None

print("\nTuning C (maximize accuracy, watch macro F1):")
for C in Cs:
    model = Pipeline(
        [
            ("prep", preprocessor),
            (
                "clf",
                LogisticRegression(
                    max_iter=400,
                    solver="lbfgs",  # good for small/medium feature spaces
                    C=C,
                    n_jobs=-1,
                    random_state=0,
                    multi_class="auto",
                ),
            ),
        ]
    )

    model.fit(X_fit_train, y_fit_train_enc)
    train_pred = model.predict(X_fit_train)
    valid_pred = model.predict(X_fit_valid)

    train_acc = (train_pred == y_fit_train_enc).mean()
    valid_acc = (valid_pred == y_fit_valid_enc).mean()
    macro_f1 = f1_score(y_fit_valid_enc, valid_pred, average="macro")

    print(
        f"  C={C:<4}  Train={train_acc:.4f}  Valid={valid_acc:.4f}  MacroF1={macro_f1:.4f}"
    )

    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_C = C
        best_model = model
        best_macro_f1 = macro_f1

print(
    f"\nBest C: {best_C}  (valid accuracy = {best_valid_acc:.4f}, macro F1 = {best_macro_f1:.4f})"
)

# --------- Final evaluation with best model ---------

train_pred = best_model.predict(X_fit_train)
valid_pred = best_model.predict(X_fit_valid)

train_acc = (train_pred == y_fit_train_enc).mean()
valid_acc = (valid_pred == y_fit_valid_enc).mean()

print("\nFinal hybrid model (→ fit):")
print("  Train accuracy:", train_acc)
print("  Valid accuracy:", valid_acc)

print("\nClassification report (valid set):")
print(
    classification_report(
        y_fit_valid_enc, valid_pred, target_names=fit_le.classes_, digits=3
    )
)


Hybrid fit prediction (h, w, size, body type, bust size, category):
  Train samples: 154035
  Valid samples: 38509
  Fit classes: ['fit' 'large' 'small']
  Baseline (always 'fit') valid accuracy: 0.7356

Tuning C (maximize accuracy, watch macro F1):




  C=0.5   Train=0.7372  Valid=0.7344  MacroF1=0.2836




  C=1.0   Train=0.7370  Valid=0.7342  MacroF1=0.2838




  C=2.0   Train=0.7370  Valid=0.7344  MacroF1=0.2842




  C=3.0   Train=0.7371  Valid=0.7344  MacroF1=0.2839




  C=5.0   Train=0.7369  Valid=0.7343  MacroF1=0.2842

Best C: 2.0  (valid accuracy = 0.7344, macro F1 = 0.2842)

Final hybrid model (→ fit):
  Train accuracy: 0.7370208069594573
  Valid accuracy: 0.7344257186631696

Classification report (valid set):
              precision    recall  f1-score   support

         fit      0.736     0.998     0.847     28329
       large      0.000     0.000     0.000      4978
       small      0.172     0.003     0.006      5202

    accuracy                          0.734     38509
   macro avg      0.303     0.334     0.284     38509
weighted avg      0.565     0.734     0.624     38509



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
