In [73]:
##################################################
# Imports & utilities                            #
##################################################

import gzip
import json
import random
from collections import defaultdict
import math
import string

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [74]:
##################################################
# Load dataset                                   #
##################################################


def read_rtr(path):
    with gzip.open(path, "rt") as f:
        for line in f:
            yield json.loads(line)


DATA_PATH = "renttherunway_final_data.json.gz"

data = list(read_rtr(DATA_PATH))
print("Total records:", len(data))

Total records: 192544


In [44]:
##################################################
# Train / validation split                       #
##################################################

indices = list(range(len(data)))
random.shuffle(indices)
split = int(0.8 * len(indices))
train_idx = set(indices[:split])
valid_idx = set(indices[split:])

train_data = [data[i] for i in train_idx]
valid_data = [data[i] for i in valid_idx]

print("Train size:", len(train_data))
print("Valid size:", len(valid_data))

Train size: 154035
Valid size: 38509


In [45]:
##################################################
# Shared helpers                                 #
##################################################


def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer / denom
    return 0.0

In [46]:
##################################################
# Shared helpers                                 #
##################################################


def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer / denom
    return 0.0

Rating Prediction  
* we must regularize a (destabilizes β updates)

In [47]:
#########################
# 1. Rating prediction  #
#########################
# r ≈ alpha + beta_u + beta_i  (user/item bias model)


# extract ratings from dataset and build rating triples (u, i, r) from data
def extract_ratings(dataset):
    ratings = []
    for d in dataset:
        if "rating" in d and d["rating"] not in (None, "", "nan"):
            try:
                r = float(d["rating"])
            except ValueError:
                continue
            u = d["user_id"]
            i = d["item_id"]
            ratings.append((u, i, r))
    return ratings


ratingsTrain = extract_ratings(train_data)
ratingsValid = extract_ratings(valid_data)

print("Train ratings:", len(ratingsTrain))
print("Valid ratings:", len(ratingsValid))

# Build per-user / per-item maps
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u, b, r in ratingsTrain:
    ratingsPerUser[u].append((b, r))
    ratingsPerItem[b].append((u, r))

Train ratings: 153971
Valid ratings: 38491


In [48]:
# getting the global average rating
def getGlobalAverage(trainRatings):
    return sum(r for (_, _, r) in trainRatings) / len(trainRatings)


######################
# Bias Model Updates #
######################


# improving alpha update with regularization
def alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb):
    newAlpha = 0.0
    for u, b, r in ratingsTrain:
        newAlpha += r - (betaU.get(u, 0.0) + betaI.get(b, 0.0))
    return newAlpha / (len(ratingsTrain) + lamb)


def betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb):
    newBetaU = {}
    for u in ratingsPerUser:
        num = 0.0
        for b, r in ratingsPerUser[u]:
            num += r - (alpha + betaI.get(b, 0.0))
        newBetaU[u] = num / (lamb + len(ratingsPerUser[u]))
    return newBetaU


def betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb):
    newBetaI = {}
    for b in ratingsPerItem:
        num = 0.0
        for u, r in ratingsPerItem[b]:
            num += r - (alpha + betaU.get(u, 0.0))
        newBetaI[b] = num / (lamb + len(ratingsPerItem[b]))
    return newBetaI


###################
# MSE COMPUTATION #
###################


def msePlusReg(ratingsTrain, alpha, betaU, betaI, lamb):
    mse = 0.0
    for u, b, r in ratingsTrain:
        pred = alpha + betaU.get(u, 0.0) + betaI.get(b, 0.0)
        mse += (r - pred) ** 2
    mse /= len(ratingsTrain)

    # regularization penalty
    reg = sum(b**2 for b in betaU.values()) + sum(b**2 for b in betaI.values())
    return mse, mse + lamb * reg


def validMSE(ratingsValid, alpha, betaU, betaI):
    mse = 0.0
    for u, b, r in ratingsValid:
        pred = alpha + betaU.get(u, 0.0) + betaI.get(b, 0.0)
        mse += (r - pred) ** 2
    mse /= len(ratingsValid)
    return mse


##################
# TRAINING LOOP  #
##################


def train_bias_model(ratingsTrain, ratingsPerUser, ratingsPerItem, lamb=1.0, iters=30):
    alpha = getGlobalAverage(ratingsTrain)
    betaU = defaultdict(float)
    betaI = defaultdict(float)

    for _ in range(iters):
        alpha = alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb)

        newBetaU = betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb)
        newBetaI = betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb)

        betaU.update(newBetaU)  # refine existing estimates, not reset
        betaI.update(newBetaI)
    return alpha, betaU, betaI

In [49]:
# Hyperparameter search for best lambda
# tuning lambda:
for lamb in [0.1, 0.3, 1, 3, 10]:
    alpha_tmp, bu_tmp, bi_tmp = train_bias_model(
        ratingsTrain, ratingsPerUser, ratingsPerItem, lamb=lamb, iters=30
    )
    print(
        f"λ={lamb}  Valid MSE={validMSE(ratingsValid, alpha_tmp, bu_tmp, bi_tmp):.4f}"
    )

# --- Pick λ manually after seeing results ---
best_lambda = 10.0

alpha, betaU, betaI = train_bias_model(
    ratingsTrain, ratingsPerUser, ratingsPerItem, lamb=best_lambda, iters=30
)

# final metrics
train_mse, train_obj = msePlusReg(ratingsTrain, alpha, betaU, betaI, lamb=best_lambda)
valid_mse = validMSE(ratingsValid, alpha, betaU, betaI)

print("Rating prediction:")
print("  Train MSE:", train_mse)
print("  Valid MSE:", valid_mse)

λ=0.1  Valid MSE=2.3675
λ=0.3  Valid MSE=2.2266
λ=1  Valid MSE=2.0357
λ=3  Valid MSE=1.9229
λ=10  Valid MSE=1.8926
Rating prediction:
  Train MSE: 1.567066834286779
  Valid MSE: 1.8925909950809328


In [50]:
# ITS DEFINITELY OPTIONAL


def writePredictionsRating(alpha, betaU, betaI, in_pairs_path, out_path):
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            u, b = l.strip().split(",")
            pred = alpha + betaU.get(u, 0.0) + betaI.get(b, 0.0)
            predictions.write(u + "," + b + "," + str(pred) + "\n")


In [51]:
##################################################
# 2. Rental prediction (Read → Rent)             #
##################################################

# Treat every observed (user_id, item_id) as a positive "rented" interaction.

all_rentals = set()
userSet = set()
itemSet = set()

for d in data:
    u = d["user_id"]
    b = d["item_id"]
    userSet.add(u)
    itemSet.add(b)
    all_rentals.add((u, b))

userList = sorted(list(userSet))
itemList = sorted(list(itemSet))

In [52]:
# Build positives from validation ratings-style data or from valid_data directly.
# We'll just use valid_data as the source of positive rentals.
readValid = set()
for d in valid_data:
    readValid.add((d["user_id"], d["item_id"]))

# Generate negative samples: one not-rented item per positive pair
notRead = set()
for u, b in readValid:
    b_neg = random.choice(itemList)
    while (u, b_neg) in all_rentals or (u, b_neg) in notRead:
        b_neg = random.choice(itemList)
    notRead.add((u, b_neg))

print("Rental prediction validation:")
print("  Positives:", len(readValid))
print("  Negatives:", len(notRead))

# Popularity counts (on train_data)
itemCount = defaultdict(int)
for d in train_data:
    itemCount[d["item_id"]] += 1

totalRead = sum(itemCount.values())
mostPopular = sorted([(c, b) for b, c in itemCount.items()], reverse=True)

Rental prediction validation:
  Positives: 38498
  Negatives: 38498


In [53]:
def baseLineStrategy(mostPopular, totalRead):
    chosen = set()
    count = 0
    for c, b in mostPopular:
        count += c
        chosen.add(b)
        if count > totalRead / 2:
            break
    return chosen


def improvedStrategy(mostPopular, totalRead):
    chosen = set()
    count = 0
    for c, b in mostPopular:
        count += c
        chosen.add(b)
        # slightly more aggressive threshold
        if count > 1.5 * totalRead / 2:
            break
    return chosen


def evaluateStrategy(returnSet, readValid, notRead):
    correct = 0
    for label, sample in [(1, readValid), (0, notRead)]:
        for u, b in sample:
            pred = 1 if b in returnSet else 0
            if pred == label:
                correct += 1
    return correct / (len(readValid) + len(notRead))

In [54]:
# Baseline popularity model
baselineSet = baseLineStrategy(mostPopular, totalRead)
baselineAcc = evaluateStrategy(baselineSet, readValid, notRead)
print("  Baseline popularity accuracy:", baselineAcc)

improvedSet = improvedStrategy(mostPopular, totalRead)
improvedAcc = evaluateStrategy(improvedSet, readValid, notRead)
print("  Improved popularity accuracy:", improvedAcc)

# Jaccard-based strategy (user-based "similar items" using co-rentals)

ratingsPerUser_all = defaultdict(list)
ratingsPerItem_all = defaultdict(list)
for d in train_data:
    u = d["user_id"]
    b = d["item_id"]
    # store dummy rating 1 for structure compatibility
    ratingsPerUser_all[u].append((b, 1))
    ratingsPerItem_all[b].append((u, 1))

  Baseline popularity accuracy: 0.7063224063587719
  Improved popularity accuracy: 0.743869811418775


In [55]:
def jaccardThresh(u, b, ratingsPerItem, ratingsPerUser):
    if b not in ratingsPerItem or u not in ratingsPerUser:
        # fallback to popularity threshold
        return 1 if len(ratingsPerItem.get(b, [])) > 40 else 0
    target_users = set([x[0] for x in ratingsPerItem[b]])
    maxSim = 0.0
    for b2, _ in ratingsPerUser[u]:
        users_b2 = set([x[0] for x in ratingsPerItem[b2]])
        sim = Jaccard(target_users, users_b2)
        if sim > maxSim:
            maxSim = sim
    if maxSim > 0.013 or len(ratingsPerItem[b]) > 40:
        return 1
    return 0


def evaluateJaccard(ratingsPerItem, ratingsPerUser, readValid, notRead):
    correct = 0
    for label, sample in [(1, readValid), (0, notRead)]:
        for u, b in sample:
            pred = jaccardThresh(u, b, ratingsPerItem, ratingsPerUser)
            if pred == label:
                correct += 1
    return correct / (len(readValid) + len(notRead))


jaccardAcc = evaluateJaccard(ratingsPerItem_all, ratingsPerUser_all, readValid, notRead)
print("  Jaccard-based accuracy:", jaccardAcc)


  Jaccard-based accuracy: 0.7464673489531923


In [56]:
def writePredictionsRent(ratingsPerItem, ratingsPerUser, in_pairs_path, out_path):
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            u, b = l.strip().split(",")
            pred = jaccardThresh(u, b, ratingsPerItem, ratingsPerUser)
            predictions.write(u + "," + b + "," + str(pred) + "\n")

In [None]:
##################################################
# Task 2 – Category / Event Prediction       #
#            (text → rented for)                #
##################################################

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import numpy as np

# IMPORTANT: use the actual key in the data
label_field = "rented for"  # key in the JSON
top_k = 10  # keep only the top-k most common labels


def collect_labels(dataset):
    labels = []
    for d in dataset:
        lab = d.get(label_field)
        if lab in (None, "", "nan"):
            continue
        labels.append(lab)
    return labels


# --- 1. Find top-k most common "rented for" labels on train set ---
all_train_labels = collect_labels(train_data)
label_counts = Counter(all_train_labels)

print("Rented-for prediction:")
print("  Total labeled train examples:", len(all_train_labels))
print("  Label counts (top 10):", label_counts.most_common(10))

top_k_labels = [lab for lab, _ in label_counts.most_common(top_k)]
print("  Top", top_k, "labels:", top_k_labels)


def extract_texts_and_rented_for(dataset, allowed_labels):
    texts = []
    labels = []
    for d in dataset:
        lab = d.get(label_field)
        if lab in (None, "", "nan"):
            continue
        if lab not in allowed_labels:  # drop rare labels
            continue
        texts.append(get_full_text(d))  # reuse review_text + summary
        labels.append(lab)
    return texts, np.array(labels)


# --- 2. Build train/valid sets restricted to top-k labels ---

train_texts_rf, y_rf_train = extract_texts_and_rented_for(train_data, top_k_labels)
valid_texts_rf, y_rf_valid = extract_texts_and_rented_for(valid_data, top_k_labels)

print("  Train samples (top-k only):", len(train_texts_rf))
print("  Valid samples (top-k only):", len(valid_texts_rf))

if len(train_texts_rf) == 0:
    raise ValueError(
        "No training samples found for 'rented for'. "
        "Check that label_field matches the key in your data."
    )

print("  Unique rented-for labels (train):", len(np.unique(y_rf_train)))

# --- 3. TF-IDF features (separate vectorizer for this task) ---

rf_vectorizer = TfidfVectorizer(
    max_features=12000,  # a bit richer than 8000
    ngram_range=(1, 2),  # unigrams + bigrams
    min_df=3,  # drop extremely rare tokens
    sublinear_tf=True,
)

X_rf_train = rf_vectorizer.fit_transform(train_texts_rf)
X_rf_valid = rf_vectorizer.transform(valid_texts_rf)

print("  X_rf_train shape:", X_rf_train.shape)

# --- 4. Encode rented-for labels ---

rf_le = LabelEncoder()
all_rf_labels = np.concatenate([y_rf_train, y_rf_valid])
rf_le.fit(all_rf_labels)

y_rf_train_enc = rf_le.transform(y_rf_train)
y_rf_valid_enc = rf_le.transform(y_rf_valid)

# --- 5. Hyperparameter search for best C ---

Cs = [0.5, 1.0, 2.0, 3.0, 5.0]
best_C = None
best_valid_acc = -1.0
best_clf = None

print("\nTuning C:")
for C in Cs:
    clf = LogisticRegression(
        max_iter=300,
        solver="saga",
        n_jobs=-1,
        C=C,
        random_state=0,
    )
    clf.fit(X_rf_train, y_rf_train_enc)
    train_pred = clf.predict(X_rf_train)
    valid_pred = clf.predict(X_rf_valid)

    train_acc = (train_pred == y_rf_train_enc).mean()
    valid_acc = (valid_pred == y_rf_valid_enc).mean()

    print(f"  C={C:<4}  Train={train_acc:.4f}  Valid={valid_acc:.4f}")

    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        best_C = C
        best_clf = clf

print(f"\nBest C: {best_C}  (valid accuracy = {best_valid_acc:.4f})")

# --- 6. Evaluate best model: accuracy + per-class metrics ---

rf_train_pred = best_clf.predict(X_rf_train)
rf_valid_pred = best_clf.predict(X_rf_valid)

rf_train_acc = (rf_train_pred == y_rf_train_enc).mean()
rf_valid_acc = (rf_valid_pred == y_rf_valid_enc).mean()

print("\nFinal model (rented-for prediction):")
print("  Train accuracy:", rf_train_acc)
print("  Valid accuracy:", rf_valid_acc)

# Decode predictions back to label names for nice reports
y_rf_valid_pred_labels = rf_le.inverse_transform(rf_valid_pred)

print("\nClassification report (valid):")
print(classification_report(y_rf_valid, y_rf_valid_pred_labels, digits=3))


Rented-for prediction:
  Total labeled train examples: 154026
  Label counts (top 10): [('wedding', 46300), ('formal affair', 32245), ('party', 28364), ('everyday', 13459), ('other', 12353), ('work', 12110), ('date', 5919), ('vacation', 3275), ('party: cocktail', 1)]
  Top 10 labels: ['wedding', 'formal affair', 'party', 'everyday', 'other', 'work', 'date', 'vacation', 'party: cocktail']
  Train samples (top-k only): 154026
  Valid samples (top-k only): 38508
  Unique rented-for labels (train): 9
  X_rf_train shape: (154026, 12000)

Tuning C:
  C=0.5   Train=0.6507  Valid=0.5985
  C=1.0   Train=0.6673  Valid=0.5960
  C=2.0   Train=0.6838  Valid=0.5904
  C=3.0   Train=0.6920  Valid=0.5864
  C=5.0   Train=0.7016  Valid=0.5798

Best C: 0.5  (valid accuracy = 0.5985)

Final model (rented-for prediction):
  Train accuracy: 0.6506628750990092
  Valid accuracy: 0.5984730445621689

Classification report (valid):
               precision    recall  f1-score   support

         date      0.665  

In [59]:
def writePredictionsCategory(
    model, words, wordId, wordSet, label_encoder, in_pairs_path, out_path
):
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        pos = 0
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            u, b = l.strip().split(",")
            # You would need to look up the review_text for (u, b) here.
            # Placeholder: empty text → all zeros except bias.
            feat = [0] * len(words) + [1]
            pred_label = model.predict(np.array(feat).reshape(1, -1))[0]
            pred_cat = label_encoder.inverse_transform([pred_label])[0]
            predictions.write(u + "," + b + "," + str(pred_cat) + "\n")
            pos += 1

In [60]:
def get_text_for_category(d):
    """
    Build the text used for both category and fit models.
    Uses review_text and (optionally) review_summary if it exists.
    """
    parts = []
    txt = d.get("review_text")
    summ = d.get("review_summary")  # it's okay if this field doesn't exist

    if txt:
        parts.append(clean_text(txt))
    if summ:
        parts.append(clean_text(summ))

    return " ".join(parts)


In [None]:
##################################################
# 4. Fit prediction (text → fit label)           #
##################################################


def extract_texts_and_fit_labels(dataset):
    texts = []
    labels = []
    for d in dataset:
        lab = d.get("fit")
        # skip missing / bad
        if lab in (None, "", "nan"):
            continue
        texts.append(
            get_text_for_category(d)
        )  # uses review_text (+ summary if you have it)
        labels.append(lab)
    return texts, np.array(labels)


train_texts_fit, y_fit_train = extract_texts_and_fit_labels(train_data)
valid_texts_fit, y_fit_valid = extract_texts_and_fit_labels(valid_data)

print("Fit prediction (classification):")
print("  Train samples:", len(train_texts_fit))
print("  Valid samples:", len(train_texts_fit))

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Text features for fit prediction
fit_vectorizer = TfidfVectorizer(
    max_features=15000, ngram_range=(1, 2), min_df=3, sublinear_tf=True
)

X_fit_train = fit_vectorizer.fit_transform(train_texts_fit)
X_fit_valid = fit_vectorizer.transform(valid_texts_fit)

# Encode fit labels (e.g. "Small", "True to Size", "Large")
fit_le = LabelEncoder()
all_fit_labels = np.concatenate([y_fit_train, y_fit_valid])
fit_le.fit(all_fit_labels)


y_fit_train_enc = fit_le.transform(y_fit_train)
y_fit_valid_enc = fit_le.transform(y_fit_valid)

fit_clf = LogisticRegression(
    max_iter=200, solver="saga", n_jobs=-1, C=2.0, class_weight="balanced"
)

fit_clf.fit(X_fit_train, y_fit_train_enc)

fit_train_acc = (fit_clf.predict(X_fit_train) == y_fit_train_enc).mean()
fit_valid_acc = (fit_clf.predict(X_fit_valid) == y_fit_valid_enc).mean()

print("  Train accuracy:", fit_train_acc)
print("  Valid accuracy:", fit_valid_acc)


Fit prediction (classification):
  Train samples: 154035
  Valid samples: 154035
  Train accuracy: 0.7952088810984517
  Valid accuracy: 0.7428912721701421
