In [3]:
##################################################
# Imports & utilities                            #
##################################################

import gzip
import json
import random
from collections import defaultdict
import math
import string

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [5]:
##################################################
# Load dataset                                   #
##################################################


def read_rtr(path):
    """Lazily parse a JSON-lines file, yielding one decoded record per line.

    Args:
        path: Location of a text file holding one JSON document per line.

    Yields:
        The object returned by ``json.loads`` for each non-blank line, in
        file order.

    Raises:
        OSError: If ``path`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding so decoding does not depend on the platform's locale
    # default.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (for example a trailing newline at the end of the
            # file) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)


# Relative path of the JSON-lines dataset (resolved against the working directory).
DATA_PATH = "renttherunway_final_data.json"

# Materialize the generator into a list: later cells index records by position
# and iterate over the full dataset more than once.
data = list(read_rtr(DATA_PATH))
print("Total records:", len(data))

Total records: 192544


In [6]:
##################################################
# Train / validation split                       #
##################################################

# Seed the global RNG before shuffling so the 80/20 split (and every metric
# derived from it) is the same after Restart Kernel -> Run All.
SPLIT_SEED = 0
random.seed(SPLIT_SEED)

# Shuffle record positions, then cut at 80% to get disjoint index sets.
indices = list(range(len(data)))
random.shuffle(indices)
split = int(0.8 * len(indices))
train_idx = set(indices[:split])
valid_idx = set(indices[split:])

# Materialize the two splits as lists of records.
train_data = [data[i] for i in train_idx]
valid_data = [data[i] for i in valid_idx]

print("Train size:", len(train_data))
print("Valid size:", len(valid_data))

Train size: 154035
Valid size: 38509


In [7]:
##################################################
# Shared helpers                                 #
##################################################


def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets: |s1 ∩ s2| / |s1 ∪ s2|.

    When the union is empty (both inputs empty) the similarity is defined
    here as 0.0 rather than raising a division error.
    """
    union_size = len(s1.union(s2))
    if union_size <= 0:
        return 0.0
    return len(s1.intersection(s2)) / union_size

In [8]:
##################################################
# Shared helpers                                 #
##################################################


def Jaccard(s1, s2):
    """Jaccard similarity: shared elements divided by distinct elements overall.

    Returns 0.0 when both sets are empty.
    """
    # NOTE(review): this cell repeats the Jaccard definition of the preceding
    # cell; the redefinition is harmless, but one of the two cells can be dropped.
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    return len(shared) / len(combined) if len(combined) > 0 else 0.0

In [9]:
##################################################
# 1. Rating prediction                           #
##################################################
# r ≈ alpha + beta_u + beta_i  (user/item bias model)


# Build rating triples (u, i, r) from data
def extract_ratings(dataset):
    """Collect (user_id, item_id, rating) triples from records with a usable rating.

    A record is skipped when its "rating" entry is missing, is one of
    ``None`` / ``""`` / ``"nan"``, cannot be converted by ``float``, or converts
    to a non-finite value (NaN or +/-inf).

    Args:
        dataset: Iterable of dict records; records with a usable rating must
            also provide "user_id" and "item_id".

    Returns:
        List of ``(user_id, item_id, rating)`` tuples in input order, with the
        rating as a finite ``float``.

    Raises:
        KeyError: If a record with a usable rating lacks "user_id" or "item_id".
    """
    ratings = []
    for d in dataset:
        if "rating" not in d or d["rating"] in (None, "", "nan"):
            continue
        try:
            r = float(d["rating"])
        except (TypeError, ValueError):
            # ValueError: unparseable string; TypeError: non-scalar value
            # such as a list or dict.
            continue
        if not math.isfinite(r):
            # float() also accepts spellings such as "NaN" or "inf"; a single
            # such value would turn every mean computed from the ratings into NaN/inf.
            continue
        ratings.append((d["user_id"], d["item_id"], r))
    return ratings


# Rating triples (user, item, rating) for each split; records without a
# usable rating are dropped by extract_ratings.
ratingsTrain = extract_ratings(train_data)
ratingsValid = extract_ratings(valid_data)

print("Train ratings:", len(ratingsTrain))
print("Valid ratings:", len(ratingsValid))

# Index the TRAINING triples both ways:
#   ratingsPerUser[user] -> list of (item, rating)
#   ratingsPerItem[item] -> list of (user, rating)
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u, b, r in ratingsTrain:
    ratingsPerUser[u].append((b, r))
    ratingsPerItem[b].append((u, r))

Train ratings: 153973
Valid ratings: 38489


In [10]:
def getGlobalAverage(trainRatings):
    """Return the mean rating over a sequence of (user, item, rating) triples."""
    rating_values = [rating for (_user, _item, rating) in trainRatings]
    return sum(rating_values) / len(trainRatings)


def alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb):
    """Recompute the global offset as the mean residual after removing both biases.

    ``alpha`` and ``lamb`` are accepted for signature symmetry with the other
    update functions but are not used in the computation. Users or items with
    no entry in ``betaU`` / ``betaI`` contribute a bias of 0.0.
    """
    residual_total = 0.0
    for user, item, rating in ratingsTrain:
        bias = betaU.get(user, 0.0) + betaI.get(item, 0.0)
        residual_total += rating - bias
    return residual_total / len(ratingsTrain)


def betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb):
    """Return fresh user biases given the current offset and item biases.

    For each user the new bias is the sum of residuals
    ``rating - (alpha + item bias)`` over that user's ratings, divided by
    ``lamb + number of ratings``. The incoming ``betaU`` is not read.
    """
    updated = {}
    for user, history in ratingsPerUser.items():
        residual = 0.0
        for item, rating in history:
            residual += rating - (alpha + betaI.get(item, 0.0))
        updated[user] = residual / (lamb + len(history))
    return updated


def betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb):
    """Return fresh item biases given the current offset and user biases.

    For each item the new bias is the sum of residuals
    ``rating - (alpha + user bias)`` over that item's ratings, divided by
    ``lamb + number of ratings``. The incoming ``betaI`` is not read.
    """
    updated = {}
    for item, raters in ratingsPerItem.items():
        residual = 0.0
        for user, rating in raters:
            residual += rating - (alpha + betaU.get(user, 0.0))
        updated[item] = residual / (lamb + len(raters))
    return updated


def msePlusReg(ratingsTrain, alpha, betaU, betaI, lamb):
    """Return ``(mse, mse + lamb * penalty)`` for the bias model on ``ratingsTrain``.

    ``mse`` is the mean squared prediction error; ``penalty`` is the sum of
    squared user biases plus the sum of squared item biases.
    """
    squared_error = 0.0
    for user, item, rating in ratingsTrain:
        estimate = alpha + betaU.get(user, 0.0) + betaI.get(item, 0.0)
        squared_error += (rating - estimate) ** 2
    squared_error /= len(ratingsTrain)

    # Accumulate user biases first, then item biases, into a single penalty.
    penalty = 0.0
    for table in (betaU, betaI):
        for bias in table.values():
            penalty += bias ** 2

    return squared_error, squared_error + lamb * penalty


def validMSE(ratingsValid, alpha, betaU, betaI):
    """Mean squared error of the bias model on held-out (user, item, rating) triples.

    Users or items missing from ``betaU`` / ``betaI`` get a bias of 0.0, so
    unseen pairs are predicted as ``alpha`` alone.
    """
    total_sq_err = 0.0
    for user, item, rating in ratingsValid:
        diff = rating - (alpha + betaU.get(user, 0.0) + betaI.get(item, 0.0))
        total_sq_err += diff ** 2
    return total_sq_err / len(ratingsValid)


def train_bias_model(ratingsTrain, ratingsPerUser, ratingsPerItem, lamb=1.0, iters=5):
    """Fit ``rating ~ alpha + betaU[user] + betaI[item]`` by alternating updates.

    Starts from the global mean rating with all biases at zero, then runs
    ``iters`` sweeps; each sweep refreshes alpha, then the user biases, then
    the item biases, every step using the most recent values of the others.

    Returns:
        Tuple ``(alpha, betaU, betaI)``.
    """
    state = (getGlobalAverage(ratingsTrain), defaultdict(float), defaultdict(float))
    for _sweep in range(iters):
        alpha, betaU, betaI = state
        alpha = alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb)
        betaU = betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb)
        betaI = betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb)
        state = (alpha, betaU, betaI)
    return state

In [11]:
# Fit the bias model with the default settings of train_bias_model
# (lamb=1.0, iters=5).
alpha, betaU, betaI = train_bias_model(ratingsTrain, ratingsPerUser, ratingsPerItem)

# msePlusReg returns (mse, regularized objective); only the MSE is reported
# below, train_obj is kept for inspection.
train_mse, train_obj = msePlusReg(ratingsTrain, alpha, betaU, betaI, lamb=1.0)
valid_mse = validMSE(ratingsValid, alpha, betaU, betaI)

print("Rating prediction:")
print("  Train MSE:", train_mse)
print("  Valid MSE:", valid_mse)

Rating prediction:
  Train MSE: 0.9031465439185892
  Valid MSE: 2.058030190402456


In [12]:
# Optional: helper for exporting rating predictions to a CSV file.


def writePredictionsRating(alpha, betaU, betaI, in_pairs_path, out_path):
    """Write a bias-model rating prediction for each "user,item" row of a pairs file.

    Args:
        alpha: Global offset of the bias model.
        betaU: Mapping user -> bias; missing users count as 0.0.
        betaI: Mapping item -> bias; missing items count as 0.0.
        in_pairs_path: CSV file with one "user,item" pair per line. A line
            starting with "userID" is treated as a header.
        out_path: Destination file; overwritten if it exists.

    The header line is copied through unchanged; every other non-blank line
    is written as "user,item,prediction".

    Raises:
        ValueError: If a non-blank data line does not split into exactly two
            comma-separated fields.
    """
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            row = l.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no pair and
                # would otherwise break the two-field unpacking below.
                continue
            u, b = row.split(",")
            pred = alpha + betaU.get(u, 0.0) + betaI.get(b, 0.0)
            predictions.write(u + "," + b + "," + str(pred) + "\n")


In [13]:
##################################################
# 2. Rental prediction (Read → Rent)             #
##################################################

# Treat every observed (user_id, item_id) as a positive "rented" interaction.

# Built from ALL records (train and validation), so that negative sampling
# can exclude every pair that was actually observed.
all_rentals = set()  # every observed (user_id, item_id) pair
userSet = set()  # distinct user ids
itemSet = set()  # distinct item ids

for d in data:
    u = d["user_id"]
    b = d["item_id"]
    userSet.add(u)
    itemSet.add(b)
    all_rentals.add((u, b))

# Sorted lists give a stable order for indexing / random.choice.
userList = sorted(list(userSet))
itemList = sorted(list(itemSet))

In [14]:
# Positive examples: every distinct (user, item) pair observed in valid_data.
readValid = set()
for d in valid_data:
    readValid.add((d["user_id"], d["item_id"]))

# Negative examples: for each positive pair, draw one item that the same user
# never rented anywhere in the data and that has not already been drawn for
# that user.
#
# Reproducibility: the draws come from a dedicated seeded generator, and the
# positives are visited in sorted order. Iterating the set directly would
# follow hash order, which for string ids can change between interpreter
# sessions and would change the sampled negatives even with a fixed seed.
NEGATIVE_SAMPLING_SEED = 0
neg_rng = random.Random(NEGATIVE_SAMPLING_SEED)

notRead = set()
for u, b in sorted(readValid):
    b_neg = neg_rng.choice(itemList)
    while (u, b_neg) in all_rentals or (u, b_neg) in notRead:
        b_neg = neg_rng.choice(itemList)
    notRead.add((u, b_neg))

print("Rental prediction validation:")
print("  Positives:", len(readValid))
print("  Negatives:", len(notRead))

# Popularity counts (on train_data): number of training records per item.
itemCount = defaultdict(int)
for d in train_data:
    itemCount[d["item_id"]] += 1

# Total number of training interactions, and (count, item) pairs ordered from
# most to least popular (ties broken by item id, descending).
totalRead = sum(itemCount.values())
mostPopular = sorted([(c, b) for b, c in itemCount.items()], reverse=True)

Rental prediction validation:
  Positives: 38497
  Negatives: 38497


In [15]:
def baseLineStrategy(mostPopular, totalRead):
    """Return the most popular items that together cover over half of all interactions.

    Args:
        mostPopular: Iterable of ``(count, item)`` pairs, most popular first.
        totalRead: Total number of interactions the counts sum to.

    Returns:
        Set of items taken from the front of ``mostPopular`` until their
        cumulative count first exceeds ``totalRead / 2``.
    """
    half = totalRead / 2
    picked = set()
    running = 0
    for popularity, item in mostPopular:
        picked.add(item)
        running += popularity
        if running > half:
            break
    return picked


def improvedStrategy(mostPopular, totalRead, fraction=0.75):
    """Return the most popular items covering more than ``fraction`` of interactions.

    Same procedure as the baseline popularity strategy, but with a tunable
    coverage threshold instead of a fixed one.

    Args:
        mostPopular: Iterable of ``(count, item)`` pairs, most popular first.
        totalRead: Total number of interactions the counts sum to.
        fraction: Share of ``totalRead`` the chosen items must exceed. The
            default 0.75 equals the previously hard-coded ``1.5 / 2``.

    Returns:
        Set of items taken from the front of ``mostPopular`` until their
        cumulative count first exceeds ``fraction * totalRead`` (all items if
        the threshold is never exceeded).
    """
    threshold = fraction * totalRead
    chosen = set()
    count = 0
    for c, b in mostPopular:
        count += c
        chosen.add(b)
        if count > threshold:
            break
    return chosen


def evaluateStrategy(returnSet, readValid, notRead):
    """Accuracy of the rule "predict positive iff the item is in ``returnSet``".

    Args:
        returnSet: Items for which the rule predicts a positive.
        readValid: Collection of positive ``(user, item)`` pairs.
        notRead: Collection of negative ``(user, item)`` pairs.

    Returns:
        Fraction of pairs, over both collections, classified correctly.
    """
    # Positives are right when the item is selected, negatives when it is not.
    hits = sum(1 for _user, item in readValid if item in returnSet)
    hits += sum(1 for _user, item in notRead if item not in returnSet)
    return hits / (len(readValid) + len(notRead))

In [None]:
# Baseline popularity model: predict "rented" for the items that cover just
# over half of the training interactions.
baselineSet = baseLineStrategy(mostPopular, totalRead)
baselineAcc = evaluateStrategy(baselineSet, readValid, notRead)
print("  Baseline popularity accuracy:", baselineAcc)

# Same rule with the more aggressive coverage threshold of improvedStrategy.
improvedSet = improvedStrategy(mostPopular, totalRead)
improvedAcc = evaluateStrategy(improvedSet, readValid, notRead)
print("  Improved popularity accuracy:", improvedAcc)

# Jaccard-based strategy (user-based "similar items" using co-rentals)

# Interaction maps over ALL training records (not only those with a rating):
#   ratingsPerUser_all[user] -> list of (item, 1)
#   ratingsPerItem_all[item] -> list of (user, 1)
ratingsPerUser_all = defaultdict(list)
ratingsPerItem_all = defaultdict(list)
for d in train_data:
    u = d["user_id"]
    b = d["item_id"]
    # store dummy rating 1 for structure compatibility
    ratingsPerUser_all[u].append((b, 1))
    ratingsPerItem_all[b].append((u, 1))

  Baseline popularity accuracy: 0.7070940592773463
  Improved popularity accuracy: 0.7417590981115412


In [17]:
def jaccardThresh(
    u, b, ratingsPerItem, ratingsPerUser, simThreshold=0.013, popThreshold=40
):
    """Predict (1/0) whether user ``u`` rents item ``b``.

    The prediction is 1 when either
      * ``b`` has more than ``popThreshold`` training interactions, or
      * some item in ``u``'s history has a user-set Jaccard similarity with
        ``b`` greater than ``simThreshold``.
    When ``u`` or ``b`` is unseen in training, only the popularity rule applies.

    Args:
        u: User id.
        b: Candidate item id.
        ratingsPerItem: Mapping item -> list of (user, rating) tuples.
        ratingsPerUser: Mapping user -> list of (item, rating) tuples.
        simThreshold: Similarity above which a single history item triggers a
            positive. Default matches the previously hard-coded 0.013.
        popThreshold: Interaction count above which an item is predicted
            positive outright. Default matches the previously hard-coded 40.

    Returns:
        1 for "will rent", 0 otherwise.
    """
    # Popularity decides on its own, so check it before the (much more
    # expensive) similarity scan over the user's history.
    if len(ratingsPerItem.get(b, [])) > popThreshold:
        return 1
    if b not in ratingsPerItem or u not in ratingsPerUser:
        return 0

    target_users = {x[0] for x in ratingsPerItem[b]}
    for b2, _ in ratingsPerUser[u]:
        users_b2 = {x[0] for x in ratingsPerItem[b2]}
        # Only whether the maximum similarity exceeds the threshold matters,
        # so stop at the first history item that does.
        if Jaccard(target_users, users_b2) > simThreshold:
            return 1
    return 0


def evaluateJaccard(ratingsPerItem, ratingsPerUser, readValid, notRead):
    """Accuracy of ``jaccardThresh`` over positive and negative (user, item) pairs.

    Pairs in ``readValid`` are expected to be predicted 1, pairs in
    ``notRead`` 0; the result is the fraction predicted as expected.
    """
    hits = 0
    for user, item in readValid:
        if jaccardThresh(user, item, ratingsPerItem, ratingsPerUser) == 1:
            hits += 1
    for user, item in notRead:
        if jaccardThresh(user, item, ratingsPerItem, ratingsPerUser) == 0:
            hits += 1
    return hits / (len(readValid) + len(notRead))


# Score the Jaccard/popularity rule on the validation positives and sampled
# negatives, using the interaction maps built from all training records.
jaccardAcc = evaluateJaccard(ratingsPerItem_all, ratingsPerUser_all, readValid, notRead)
print("  Jaccard-based accuracy:", jaccardAcc)


  Jaccard-based accuracy: 0.7467205236771697


In [18]:
def writePredictionsRent(ratingsPerItem, ratingsPerUser, in_pairs_path, out_path):
    """Write a 0/1 rental prediction for each "user,item" row of a pairs file.

    Args:
        ratingsPerItem: Mapping item -> list of (user, rating) tuples.
        ratingsPerUser: Mapping user -> list of (item, rating) tuples.
        in_pairs_path: CSV file with one "user,item" pair per line. A line
            starting with "userID" is treated as a header.
        out_path: Destination file; overwritten if it exists.

    The header line is copied through unchanged; every other non-blank line
    is written as "user,item,prediction", the prediction coming from
    ``jaccardThresh``.

    Raises:
        ValueError: If a non-blank data line does not split into exactly two
            comma-separated fields.
    """
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            row = l.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no pair and
                # would otherwise break the two-field unpacking below.
                continue
            u, b = row.split(",")
            pred = jaccardThresh(u, b, ratingsPerItem, ratingsPerUser)
            predictions.write(u + "," + b + "," + str(pred) + "\n")

In [23]:
##################################################
# 3. Category prediction (text → category)       #
##################################################

punctuation = set(string.punctuation)


def clean_text(s):
    """Lower-case ``s`` and drop every character found in ``punctuation``.

    ``None`` is mapped to the empty string; whitespace is left untouched.
    """
    if s is None:
        return ""
    kept = [ch for ch in s.lower() if ch not in punctuation]
    return "".join(kept)


def build_vocabulary(dataset, field="review_text", vocab_size=1000):
    """Select the ``vocab_size`` most frequent words of ``field`` across ``dataset``.

    Text is normalized with ``clean_text`` and split on whitespace. Words are
    ranked by descending count, ties broken by descending word.

    Returns:
        Tuple ``(words, wordId, wordSet)``: the ranked word list, a mapping
        word -> position in that list, and the same words as a set.
    """
    tally = defaultdict(int)
    for record in dataset:
        for token in clean_text(record.get(field, "")).split():
            tally[token] += 1

    ranked = sorted(tally.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    words = [token for token, _count in ranked[:vocab_size]]
    wordId = dict(zip(words, range(len(words))))
    return words, wordId, set(words)


def text_features(datum, words, wordId, wordSet, field="review_text"):
    """Bag-of-words count vector for ``datum[field]`` with a trailing constant 1.

    Position ``wordId[w]`` holds the number of occurrences of vocabulary word
    ``w`` in the cleaned text; words outside ``wordSet`` are ignored. The
    final element is always 1 (bias term), so the length is ``len(words) + 1``.
    """
    counts = [0 for _ in words]
    for token in clean_text(datum.get(field, "")).split():
        if token not in wordSet:
            continue
        counts[wordId[token]] += 1
    return counts + [1]  # bias term

In [25]:
# Build vocabulary on training data only (the 1000 most frequent review
# words), so no validation text influences the feature space.
cat_words, cat_wordId, cat_wordSet = build_vocabulary(
    train_data, field="review_text", vocab_size=1000
)


# Build X, y for category prediction
def build_category_data(dataset):
    """Build the feature matrix and label vector for category prediction.

    Records whose "category" is missing or one of ``None`` / ``""`` / ``"nan"``
    are skipped. Features are the bag-of-words counts of "review_text" over
    the module-level vocabulary (``cat_words`` / ``cat_wordId`` / ``cat_wordSet``).

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row and one category
        label per retained record.
    """
    labelled = [
        d
        for d in dataset
        if "category" in d and d["category"] not in (None, "", "nan")
    ]
    X = [
        text_features(d, cat_words, cat_wordId, cat_wordSet, field="review_text")
        for d in labelled
    ]
    y = [d["category"] for d in labelled]
    return np.array(X), np.array(y)


X_cat_train, y_cat_train = build_category_data(train_data)
X_cat_valid, y_cat_valid = build_category_data(valid_data)

print("Category prediction:")
print("  Train samples:", X_cat_train.shape[0])
print("  Valid samples:", X_cat_valid.shape[0])

cat_le = LabelEncoder()

# Fit on all labels (train + valid) so that transforming the validation
# labels cannot fail on a category that never occurs in the training split.
all_cats = np.concatenate([y_cat_train, y_cat_valid])
cat_le.fit(all_cats)

# Now transform each split
y_cat_train_enc = cat_le.transform(y_cat_train)
y_cat_valid_enc = cat_le.transform(y_cat_valid)

# `multi_class` is deprecated in recent scikit-learn releases; "auto" was
# already the default, so omitting it keeps the same model without the warning.
# NOTE(review): the recorded run stopped at the max_iter=1000 limit without
# converging; scaling the count features or raising max_iter would address
# that, but would also change the reported accuracies.
cat_clf = LogisticRegression(max_iter=1000)
cat_clf.fit(X_cat_train, y_cat_train_enc)

cat_train_acc = (cat_clf.predict(X_cat_train) == y_cat_train_enc).mean()
cat_valid_acc = (cat_clf.predict(X_cat_valid) == y_cat_valid_enc).mean()

print("  Train accuracy:", cat_train_acc)
print("  Valid accuracy:", cat_valid_acc)

Category prediction:
  Train samples: 154035
  Valid samples: 38509


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Train accuracy: 0.6721459408575973
  Valid accuracy: 0.6426289958191591


In [26]:
def writePredictionsCategory(
    model, words, wordId, wordSet, label_encoder, in_pairs_path, out_path
):
    """Write a category prediction for each "user,item" row of a pairs file.

    NOTE: the review text for a pair is not available here, so every pair is
    scored with the same placeholder features (all-zero word counts plus the
    bias term) and therefore receives the same predicted category.

    Args:
        model: Fitted classifier exposing ``predict`` on a 2-D array.
        words: Vocabulary list; only its length is used (feature width).
        wordId: Unused by the placeholder; kept for interface compatibility.
        wordSet: Unused by the placeholder; kept for interface compatibility.
        label_encoder: Object whose ``inverse_transform`` maps encoded labels
            back to category names.
        in_pairs_path: CSV file with one "user,item" pair per line. A line
            starting with "userID" is treated as a header.
        out_path: Destination file; overwritten if it exists.

    Raises:
        ValueError: If a non-blank data line does not split into exactly two
            comma-separated fields.
    """
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        # TODO: look up the review_text for each (u, b) and featurize it.
        # Until then the features are identical for every row, so the model
        # is queried once up front instead of once per line.
        feat = [0] * len(words) + [1]
        pred_label = model.predict(np.array(feat).reshape(1, -1))[0]
        pred_cat = label_encoder.inverse_transform([pred_label])[0]

        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            row = l.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no pair and
                # would otherwise break the two-field unpacking below.
                continue
            u, b = row.split(",")
            predictions.write(u + "," + b + "," + str(pred_cat) + "\n")

In [27]:
##################################################
# 4. Fit prediction (bonus)                      #
##################################################
# Predict datum["fit"] from review_text (same features as category).


def build_fit_data(dataset):
    """Build the feature matrix and label vector for fit prediction.

    Records whose "fit" is missing or one of ``None`` / ``""`` / ``"nan"`` are
    skipped. Features are the bag-of-words counts of "review_text" over the
    module-level vocabulary (``cat_words`` / ``cat_wordId`` / ``cat_wordSet``),
    i.e. the same features used for category prediction.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row and one fit label
        per retained record.
    """
    labelled = [
        d for d in dataset if "fit" in d and d["fit"] not in (None, "", "nan")
    ]
    X = [
        text_features(d, cat_words, cat_wordId, cat_wordSet, field="review_text")
        for d in labelled
    ]
    y = [d["fit"] for d in labelled]
    return np.array(X), np.array(y)


X_fit_train, y_fit_train = build_fit_data(train_data)
X_fit_valid, y_fit_valid = build_fit_data(valid_data)

print("Fit prediction:")
print("  Train samples:", X_fit_train.shape[0])
print("  Valid samples:", X_fit_valid.shape[0])

# The encoder is fitted on the training labels only; transforming the
# validation labels raises if they contain a value unseen in training.
fit_le = LabelEncoder()
y_fit_train_enc = fit_le.fit_transform(y_fit_train)
y_fit_valid_enc = fit_le.transform(y_fit_valid)

# `multi_class` is deprecated in recent scikit-learn releases; "auto" was
# already the default, so omitting it keeps the same model without the warning.
fit_clf = LogisticRegression(max_iter=1000)
fit_clf.fit(X_fit_train, y_fit_train_enc)

fit_train_acc = (fit_clf.predict(X_fit_train) == y_fit_train_enc).mean()
fit_valid_acc = (fit_clf.predict(X_fit_valid) == y_fit_valid_enc).mean()

print("  Train accuracy:", fit_train_acc)
print("  Valid accuracy:", fit_valid_acc)


def writePredictionsFit(
    model, words, wordId, wordSet, label_encoder, in_pairs_path, out_path
):
    """Write a fit prediction for each "user,item" row of a pairs file.

    NOTE: the review text for a pair is not available here, so every pair is
    scored with the same placeholder features (all-zero word counts plus the
    bias term) and therefore receives the same predicted fit label.

    Args:
        model: Fitted classifier exposing ``predict`` on a 2-D array.
        words: Vocabulary list; only its length is used (feature width).
        wordId: Unused by the placeholder; kept for interface compatibility.
        wordSet: Unused by the placeholder; kept for interface compatibility.
        label_encoder: Object whose ``inverse_transform`` maps encoded labels
            back to fit names.
        in_pairs_path: CSV file with one "user,item" pair per line. A line
            starting with "userID" is treated as a header.
        out_path: Destination file; overwritten if it exists.

    Raises:
        ValueError: If a non-blank data line does not split into exactly two
            comma-separated fields.
    """
    with open(out_path, "w") as predictions, open(in_pairs_path) as pairs:
        # TODO: a (u, b) -> review_text mapping is needed for real features.
        # Until then the features are identical for every row, so the model
        # is queried once up front instead of once per line.
        feat = [0] * len(words) + [1]
        pred_label = model.predict(np.array(feat).reshape(1, -1))[0]
        pred_fit = label_encoder.inverse_transform([pred_label])[0]

        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue
            row = l.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no pair and
                # would otherwise break the two-field unpacking below.
                continue
            u, b = row.split(",")
            predictions.write(u + "," + b + "," + str(pred_fit) + "\n")


Fit prediction:
  Train samples: 154035
  Valid samples: 38509




  Train accuracy: 0.8058687960528451
  Valid accuracy: 0.8045651665844348
