In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler


In [2]:
def readGz(path):
  for l in gzip.open(path, 'rt'):
    yield eval(l)

def readCSV(path):
  f = gzip.open(path, 'rt')
  f.readline()
  for l in f:
    yield l.strip().split(',')

In [16]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))

In [3]:
### Would-read baseline: just rank which books are popular and which are not, and return '1' if a book is among the top-ranked

bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
  bookCount[book] += 1
  totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()


In [4]:
def generateValidation(allRatings, ratingsValid):
    # Using ratingsValid, generate two sets:
    # readValid: set of (u,b) pairs in the validation set
    # notRead: set of (u,b') pairs, containing one negative (not read) for each row (u) in readValid  
    # Both should have the same size as ratingsValid

    allRead = set((d[0], d[1]) for d in allRatings)
    readValid = set((d[0], d[1]) for d in ratingsValid)
    bookperuser = defaultdict(set)
    for u, b in allRead:
        bookperuser[u].add(b)
    
    notRead = set()
    allBooks = list({b for _, b in allRead})  # all unique book IDs

    for u, b in readValid:
        unread = random.choice(allBooks)
        while unread in bookperuser[u]:
            unread = random.choice(allBooks)
        notRead.add((u, unread))

    return readValid, notRead

In [5]:
def cosine_sim_binary(b1, b2, ratingsPerItem):
    users1 = set(u for u,_ in ratingsPerItem.get(b1, []))
    users2 = set(u for u,_ in ratingsPerItem.get(b2, []))
    if not users1 or not users2: return 0.0
    inter = len(users1 & users2)
    denom = math.sqrt(len(users1) * len(users2))
    return inter / denom if denom else 0.0

def cosine_weighted(b1, b2, ratingsPerItem):
    users1 = set(u for u,_ in ratingsPerItem.get(b1, []))
    users2 = set(u for u,_ in ratingsPerItem.get(b2, []))
    if not users1 or not users2: return 0.0
    inter = len(users1 & users2)
    denom = (len(users1)**0.25)*(len(users2)**0.25)
    return inter / denom if denom else 0.0

def jaccard_sim_binary_items(b1, b2, ratingsPerItem):
    users1 = set(u for u,_ in ratingsPerItem.get(b1, []))
    users2 = set(u for u,_ in ratingsPerItem.get(b2, []))
    if not users1 or not users2: return 0.0
    inter = len(users1 & users2)
    union = len(users1 | users2)
    return inter / union if union else 0.0

def jaccard_sim_binary_users(u1, u2, ratingsPerUser):
    books1 = set(b for b,_ in ratingsPerUser.get(u1, []))
    books2 = set(b for b,_ in ratingsPerUser.get(u2, []))
    if not books1 or not books2: return 0.0
    inter = len(books1 & books2)
    union = len(books1 | books2)
    return inter / union if union else 0.0


In [6]:
def build_feature_df(allRatings, ratingsValid, ratingsPerUser, ratingsPerItem, bookCount, totalRead):
    readValid, notRead = generateValidation(allRatings, ratingsValid)
    data = []
    for (u,b,label) in [(u,b,1) for (u,b) in readValid] + [(u,b,0) for (u,b) in notRead]:

        # user’s read books
        user_books = [bk for bk,_ in ratingsPerUser.get(u, [])]
        # item’s readers
        book_users = [us for us,_ in ratingsPerItem.get(b, [])]

        if not user_books or not book_users:
            data.append([u,b,0,0,0,0,bookCount.get(b,0)/totalRead,label])
            continue

        # max similarities
        sim_cosine = max(cosine_sim_binary(b, other, ratingsPerItem) for other in user_books)
        sim_weighted = max(cosine_weighted(b, other, ratingsPerItem) for other in user_books)
        sim_jaccard_item = max(jaccard_sim_binary_items(b, other, ratingsPerItem) for other in user_books)
        sim_jaccard_user = max(jaccard_sim_binary_users(u, other, ratingsPerUser) for other in book_users)
        popScore = bookCount.get(b,0) / totalRead

        data.append([u,b,sim_cosine,sim_weighted,sim_jaccard_item,sim_jaccard_user,popScore,label])

    df = pd.DataFrame(data, columns=[
        "user","book","cosine","cosine_weighted","item_jaccard","user_jaccard","pop","label"
    ])
    return df


In [7]:
def train_logistic_model(df):
    X = df[["cosine","cosine_weighted","item_jaccard","user_jaccard","pop"]]
    y = df["label"]
    model = linear_model.LogisticRegression(max_iter=500)
    model.fit(X, y)
    return model

In [8]:
def predict_balanced_per_user(df, model):
    X = df[["cosine","cosine_weighted","item_jaccard","user_jaccard","pop"]]
    df["prob"] = model.predict_proba(X)[:,1]

    preds = []
    for u, grp in df.groupby("user"):
        # sort by predicted probability descending
        sorted_grp = grp.sort_values("prob", ascending=False)
        n = len(sorted_grp)
        cutoff = n//2
        sorted_grp["pred"] = 0
        sorted_grp.iloc[:cutoff, sorted_grp.columns.get_loc("pred")] = 1
        preds.append(sorted_grp)
    df_pred = pd.concat(preds, ignore_index=True)
    return df_pred

In [None]:
def evaluate_accuracy(df_pred):
    return accuracy_score(df_pred["label"], df_pred["pred"])

In [13]:
df = build_feature_df(allRatings, ratingsValid, ratingsPerUser, ratingsPerItem, bookCount, totalRead)
print("Feature table built:", df.shape)

model = train_logistic_model(df)
df_pred = predict_balanced_per_user(df, model)
acc = evaluate_accuracy(df_pred)
print(f"Balanced-per-user logistic model accuracy: {acc:.4f}")


Feature table built: (19999, 8)
Balanced-per-user logistic model accuracy: 0.7799


In [14]:
def build_feature_df_for_pairs(pairs_path, ratingsPerUser, ratingsPerItem, bookCount, totalRead):
    """Reuses the same feature computation logic as training but for unlabeled test pairs."""
    test_data = []
    with open(pairs_path) as f:
        next(f)  # skip header line
        for line in f:
            u,b = line.strip().split(',')
            user_books = [bk for bk,_ in ratingsPerUser.get(u, [])]
            book_users = [us for us,_ in ratingsPerItem.get(b, [])]

            if not user_books or not book_users:
                test_data.append([u,b,0,0,0,0,bookCount.get(b,0)/totalRead])
                continue

            sim_cosine = max(cosine_sim_binary(b, other, ratingsPerItem) for other in user_books)
            sim_weighted = max(cosine_weighted(b, other, ratingsPerItem) for other in user_books)
            sim_jaccard_item = max(jaccard_sim_binary_items(b, other, ratingsPerItem) for other in user_books)
            sim_jaccard_user = max(jaccard_sim_binary_users(u, other, ratingsPerUser) for other in book_users)
            popScore = bookCount.get(b,0) / totalRead

            test_data.append([u,b,sim_cosine,sim_weighted,sim_jaccard_item,sim_jaccard_user,popScore])

    df_test = pd.DataFrame(test_data, columns=[
        "user","book","cosine","cosine_weighted","item_jaccard","user_jaccard","pop"
    ])
    return df_test


In [15]:
df_test = build_feature_df_for_pairs("pairs_Read.csv", ratingsPerUser, ratingsPerItem, bookCount, totalRead)
df_pred = predict_balanced_per_user(df_test, model)

predictions = open("predictions_Read.csv", 'w')
predictions.write("userID,bookID,Prediction\n")

for _, row in df_pred.iterrows():
    predictions.write(f"{row.user},{row.book},{row.pred}\n")

predictions.close()
print("✅ Done. Predictions written to predictions_Read.csv")

✅ Done. Predictions written to predictions_Read.csv
