In [2]:
import gzip
from collections import UserDict, defaultdict
import pandas as pd
import csv
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV, cross_validate, KFold
import random

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        parts = l.strip().split(',')
        if len(parts) != 3:
            print(f"Skipping malformed line: {l.strip()}")
            continue
        u, b, r = parts
        r = int(r)  # Convert rank to integer
        yield u, b, r

In [4]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [5]:
len(allRatings)

200000

**predict rating**

In [6]:
ratingsTrain = allRatings[:int(len(allRatings)*0.95)]
ratingsValid = allRatings[int(len(allRatings)*0.95):]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))

In [7]:
allRatings_df = pd.DataFrame(allRatings, columns=["userID", "bookID", "rating"])
reader = Reader(rating_scale=(allRatings_df['rating'].min(), allRatings_df['rating'].max()))
data = Dataset.load_from_df(allRatings_df[['userID', 'bookID', 'rating']], reader)

param_grid = {
    'n_factors': [10, 20, 30],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.05, 0.1, 0.2],
    'n_epochs': [50, 100]
}
kf = KFold(n_splits=5, random_state=42, shuffle=True)
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=kf, n_jobs=-1)
gs.fit(data)
best_params = gs.best_params['rmse']
print(f"Best Parameters: {best_params}")

algo = gs.best_estimator['rmse']
cv_results = cross_validate(algo, data, measures=['rmse'], cv=kf, verbose=True)

trainset = data.build_full_trainset()
algo.fit(trainset)

mu = allRatings_df['rating'].mean()
user_avg_rating = allRatings_df.groupby('userID')['rating'].mean().to_dict()
book_avg_rating = allRatings_df.groupby('bookID')['rating'].mean().to_dict()

def predict_rating(user, book):
    if user in user_avg_rating and book in book_avg_rating:
        return algo.predict(user, book).est
    elif user in user_avg_rating:
        return user_avg_rating[user]
    elif book in book_avg_rating:
        return book_avg_rating[book]
    else:
        return mu

Best Parameters: {'n_factors': 10, 'lr_all': 0.002, 'reg_all': 0.2, 'n_epochs': 100}
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2057  1.2091  1.2159  1.2022  1.1953  1.2056  0.0069  
Fit time          3.51    3.51    3.46    3.53    3.61    3.52    0.05    
Test time         0.29    0.21    0.14    0.12    0.11    0.17    0.07    


In [8]:
test_data = pd.read_csv('pairs_Rating.csv')

predictions_path = "predictions_Rating.csv"
with open(predictions_path, 'w') as predictions:
    predictions.write("userID,bookID,prediction\n")
    for _, row in test_data.iterrows():
        u, b = row['userID'], row['bookID']
        pred = predict_rating(u, b)
        predictions.write(f"{u},{b},{pred:.4f}\n")

print("Predictions saved to predictions_Rating.csv")

Predictions saved to predictions_Rating.csv


**predict reading**

In [9]:
ratingsTrain = allRatings[:int(len(allRatings)*0.95)]
ratingsValid = allRatings[int(len(allRatings)*0.95):]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))

In [10]:
booksPerUser = defaultdict(set)
usersPerbook = defaultdict(set)
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

for user, book, _ in ratingsTrain:
    booksPerUser[user].add(book)
    usersPerbook[book].add(user)

positive_samples = [(user, book) for user, book, _ in ratingsValid]
negative_samples = []

for user, book in positive_samples:
    while True:
        negative_book = random.choice(list(bookCount.keys()))
        if negative_book not in booksPerUser[user]:  # Ensure this book wasn't read by the user
            negative_samples.append((user, negative_book))
            break

validation_set = positive_samples + negative_samples

In [11]:
class my_large_dict(UserDict):
    def __init__(self, input_data=None):
        super().__init__()
        if input_data:
            self.data = self.transform_data(input_data)
        else:
            self.data = {}

    def transform_data(self, input_data):
        book_record = defaultdict(list)
        for user_id, book_id, _ in input_data:
            book_record[book_id].append(user_id)
        
        return book_record

    def add_item(self, user_id, book_id, _):
        if book_id in self.data:
            self.data[book_id].append(user_id)
        else:
            self.data[book_id] = [user_id]
        return self.data

In [12]:
allBooks = set(book for _, book, _ in ratingsTrain)
negativeSamples = []
for user, book, _ in ratingsValid:
    while True:
        negativeBook = random.choice(list(allBooks))
        if negativeBook not in [b for b, _ in ratingsPerUser[user]]:
            negativeSamples.append((user, negativeBook, -1))
            break

validationSet = ratingsValid + negativeSamples
random.shuffle(validationSet)
labels = [1] * len(positive_samples) + [0] * len(negative_samples)  # 1 for positive, 0 for negative

In [13]:
read_test = pd.read_csv('pairs_Read.csv')
read_test = list(zip(read_test['userID'], read_test['bookID']))
read_test[:2]

[('u95048695', 'b80407575'), ('u64624839', 'b22251874')]

In [14]:
usersPerBook = defaultdict(set)
bookPopularity = defaultdict(int)
for user, book, _ in ratingsTrain:
    usersPerBook[book].add(user)
    bookPopularity[book] += 1

class ImprovedPredictor:
    def __init__(self, jaccard_threshold, popularity_threshold):
        self.jaccard_threshold = jaccard_threshold
        self.popularity_threshold = popularity_threshold

    def jaccard_similarity(self, book1, book2):
        users1 = usersPerBook[book1]
        users2 = usersPerBook[book2]
        intersection = len(users1.intersection(users2))
        union = len(users1.union(users2))
        return intersection / union if union != 0 else 0

    def predict(self, user, book):
        books_read_by_user = [b for b, _ in ratingsPerUser[user]]
        max_similarity = 0
        for b_prime in books_read_by_user:
            similarity = self.jaccard_similarity(book, b_prime)
            max_similarity = max(max_similarity, similarity)
        book_popularity = bookPopularity[book]
        
        return (max_similarity >= self.jaccard_threshold) or (book_popularity >= self.popularity_threshold)

In [None]:
best_accuracy = 0
best_jaccard_threshold = 0
best_popularity_threshold = 0

jaccard_thresholds = [i / 100 for i in range(1, 101)]
popularity_thresholds = [i / 10 for i in range(100, 1001, 10)]

for jt in jaccard_thresholds:
    for pt in popularity_thresholds:
        predictor = ImprovedPredictor(jaccard_threshold=jt, popularity_threshold=pt)
        predictions = [predictor.predict(user, book) for user, book in validation_set]
        correct = sum([pred == label for pred, label in zip(predictions, labels)])
        accuracy = correct / len(validation_set)
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_jaccard_threshold = jt
            best_popularity_threshold = pt

print("Best Jaccard threshold:", best_jaccard_threshold)
print("Best Popularity threshold:", best_popularity_threshold)
print("Best accuracy:", best_accuracy)

In [None]:
usersPerBook = defaultdict(set)
bookPopularity = defaultdict(int)
for user, book, _ in ratingsTrain:
    usersPerBook[book].add(user)
    bookPopularity[book] += 1

class ImprovedPredictor:
    def __init__(self, jaccard_threshold, popularity_threshold):
        self.jaccard_threshold = jaccard_threshold
        self.popularity_threshold = popularity_threshold

    def jaccard_similarity(self, book1, book2):
        users1 = usersPerBook[book1]
        users2 = usersPerBook[book2]
        intersection = len(users1.intersection(users2))
        union = len(users1.union(users2))
        return intersection / union if union != 0 else 0

    def predict(self, user, book):
        books_read_by_user = [b for b, _ in ratingsPerUser[user]]
        max_similarity = 0
        for b_prime in books_read_by_user:
            similarity = self.jaccard_similarity(book, b_prime)
            max_similarity = max(max_similarity, similarity)

        book_popularity = bookPopularity[book]

        return (max_similarity >= self.jaccard_threshold) or (book_popularity >= self.popularity_threshold)

predictor = ImprovedPredictor(jaccard_threshold=0.05, popularity_threshold=30)
        
predictions = [(user, book, predictor.predict(user, book)) for user, book in read_test]
predictions

[('u95048695', 'b80407575', True),
 ('u64624839', 'b22251874', True),
 ('u45364671', 'b59334959', False),
 ('u89964247', 'b96807645', False),
 ('u27746462', 'b93777449', False),
 ('u45033856', 'b02250808', True),
 ('u27598419', 'b12715496', True),
 ('u20059243', 'b74622707', False),
 ('u52760220', 'b33229445', True),
 ('u95557022', 'b55417152', False),
 ('u43877046', 'b06804790', False),
 ('u89426875', 'b43975237', True),
 ('u40203344', 'b56379602', False),
 ('u18008976', 'b51752826', True),
 ('u62617835', 'b35509876', False),
 ('u91244617', 'b25350833', True),
 ('u98269724', 'b32262230', False),
 ('u27067409', 'b12138527', False),
 ('u62105013', 'b85897452', True),
 ('u10645797', 'b21827216', False),
 ('u77630454', 'b39380938', False),
 ('u85624886', 'b62181041', False),
 ('u11629036', 'b65888554', True),
 ('u45734444', 'b72571590', True),
 ('u47367123', 'b93239672', True),
 ('u25577220', 'b33892919', True),
 ('u46256540', 'b18929421', True),
 ('u98893115', 'b34817039', False),
 ('u98

In [None]:
usersPerBook = defaultdict(set)
bookPopularity = defaultdict(int)

for user, book, _ in ratingsTrain:
    usersPerBook[book].add(user)
    bookPopularity[book] += 1

class ImprovedPredictor:
    def __init__(self, jaccard_threshold, popularity_threshold):
        self.jaccard_threshold = jaccard_threshold
        self.popularity_threshold = popularity_threshold

    def jaccard_similarity(self, book1, book2):
        users1 = usersPerBook[book1]
        users2 = usersPerBook[book2]
        intersection = len(users1.intersection(users2))
        union = len(users1.union(users2))
        return intersection / union if union != 0 else 0

    def predict(self, user, book):
        books_read_by_user = [b for b, _ in ratingsPerUser[user]]
        max_similarity = 0
        for b_prime in books_read_by_user:
            similarity = self.jaccard_similarity(book, b_prime)
            max_similarity = max(max_similarity, similarity)
        
        book_popularity = bookPopularity[book]
        
        return (max_similarity >= self.jaccard_threshold) or (book_popularity >= self.popularity_threshold)

Fold 1: Accuracy = 0.9463
Fold 2: Accuracy = 0.9452
Fold 3: Accuracy = 0.9445
Fold 4: Accuracy = 0.9462
Fold 5: Accuracy = 0.9479
Average Accuracy across all folds: 0.9460
Predictions completed.


In [None]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)
jaccard_threshold = 0.05
popularity_threshold = 30

fold_results = []
for fold, (train_index, valid_index) in enumerate(kf.split(ratingsTrain)):
    train_data = [ratingsTrain[i] for i in train_index]
    valid_data = [ratingsTrain[i] for i in valid_index]

    usersPerBook_fold = defaultdict(set)
    bookPopularity_fold = defaultdict(int)
    ratingsPerUser_fold = defaultdict(list)
    
    for user, book, rating in train_data:
        usersPerBook_fold[book].add(user)
        bookPopularity_fold[book] += 1
        ratingsPerUser_fold[user].append((book, rating))

    predictor = ImprovedPredictor(jaccard_threshold, popularity_threshold)

    predictions = [(user, book, predictor.predict(user, book)) for user, book, _ in valid_data]
    
    accuracy = sum((pred == (rating > 0.5)) for (_, _, rating), (_, _, pred) in zip(valid_data, predictions)) / len(valid_data)
    fold_results.append(accuracy)
    print(f"Fold {fold + 1}: Accuracy = {accuracy:.4f}")

average_accuracy = sum(fold_results) / len(fold_results)
print(f"Average Accuracy across all folds: {average_accuracy:.4f}")

predictor = ImprovedPredictor(jaccard_threshold, popularity_threshold)
predictions = [(user, book, predictor.predict(user, book)) for user, book in read_test]

print("Predictions completed.")

NameError: name 'KFold' is not defined

In [None]:
with open("predictions_Read.csv", "w", newline="") as predictions_file:
    writer = csv.writer(predictions_file)
    writer.writerow(["userID", "bookID", "prediction"])
    for user, book, prediction in predictions:
        writer.writerow([user, book, int(prediction)])

print("Predictions have been written to 'predictions_Read.csv'.")

Predictions have been written to 'predictions_Read.csv'.
