## Data Preparation

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [5]:
df = pd.read_csv('/workspaces/book-recommendation-system/dataset/Ratings.csv', sep=';', dtype={'User-ID': str, 'ISBN': str})
print("First few rows of the original data:")
print(df.head())
print("\nData types of columns:")
print(df.dtypes)

First few rows of the original data:
  User-ID        ISBN  Rating
0  276725  034545104X       0
1  276726  0155061224       5
2  276727  0446520802       0
3  276729  052165615X       3
4  276729  0521795028       6

Data types of columns:
User-ID    object
ISBN       object
Rating      int64
dtype: object


In [7]:
duplicates = df[df.duplicated(subset=['User-ID', 'ISBN'], keep=False)]
if not duplicates.empty:
    print("\nWarning: Duplicate entries found:")
    print(duplicates)
else:
    print("\nNo duplicate entries found.")


No duplicate entries found.


In [8]:
sorted_users = df['User-ID'].unique()
sorted_books = df['ISBN'].unique()

In [9]:
user_to_index = {user: index for index, user in enumerate(sorted_users)}
book_to_index = {book: index for index, book in enumerate(sorted_books)}

In [10]:
df['user_num'] = df['User-ID'].map(user_to_index)
df['book_num'] = df['ISBN'].map(book_to_index)

In [13]:
ratings_matrix = csr_matrix((df['Rating'], (df['user_num'], df['book_num'])))

In [14]:
print("\nFirst few user ID mappings:")
for i, user_id in enumerate(sorted_users[:5]):
    print(f"User-{i+1}: Original User-ID {user_id}")
print("\nFirst few book ISBN mappings:")
for i, isbn in enumerate(sorted_books[:5]):
    print(f"Book-{i+1}: Original ISBN {isbn}")


First few user ID mappings:
User-1: Original User-ID 276725
User-2: Original User-ID 276726
User-3: Original User-ID 276727
User-4: Original User-ID 276729
User-5: Original User-ID 276733

First few book ISBN mappings:
Book-1: Original ISBN 034545104X
Book-2: Original ISBN 0155061224
Book-3: Original ISBN 0446520802
Book-4: Original ISBN 052165615X
Book-5: Original ISBN 0521795028


In [15]:
print(f"\nMatrix shape: {ratings_matrix.shape}")
print(f"Non-zero elements: {ratings_matrix.nnz}")
print(f"Sparsity: {1 - (ratings_matrix.nnz / (ratings_matrix.shape[0] * ratings_matrix.shape[1])):.6f}")


Matrix shape: (105283, 340556)
Non-zero elements: 1149780
Sparsity: 0.999968


In [16]:
def print_submatrix(matrix, rows, cols):
    submatrix = matrix[:rows, :cols].toarray()
    print(f"\nFirst {rows}x{cols} submatrix:")
    print("")
    for i in range(rows):
        row = [f"{int(submatrix[i, j])}" if submatrix[i, j] != 0 else "0" for j in range(cols)]
        print(f"User {i+1}: [{' '.join(row)}]") 
print_submatrix(ratings_matrix, 10, 10)


First 10x10 submatrix:

User 1: [0 0 0 0 0 0 0 0 0 0]
User 2: [0 5 0 0 0 0 0 0 0 0]
User 3: [0 0 0 0 0 0 0 0 0 0]
User 4: [0 0 0 3 6 0 0 0 0 0]
User 5: [0 0 0 0 0 0 0 0 0 0]
User 6: [0 0 0 0 0 0 8 0 0 0]
User 7: [0 0 0 0 0 0 0 6 0 0]
User 8: [0 0 0 0 0 0 0 0 7 0]
User 9: [0 0 0 0 0 0 0 0 0 10]
User 10: [0 0 0 0 0 0 0 0 0 0]


In [17]:
def print_user_ratings(user_id):
    user_index = user_to_index[user_id]
    user_ratings = ratings_matrix[user_index].toarray()[0]
    non_zero_ratings = [(book_to_index[sorted_books[i]], rating) for i, rating in enumerate(user_ratings) if rating != 0]
    print(f"\nRatings for User {user_id}:")
    for book_index, rating in non_zero_ratings:
        print(f"Book {book_index + 1} (ISBN: {sorted_books[book_index]}): Rating {int(rating)}")

In [19]:
for user_id in sorted_users[:5]:
    print_user_ratings(user_id)


Ratings for User 276725:

Ratings for User 276726:
Book 2 (ISBN: 0155061224): Rating 5

Ratings for User 276727:

Ratings for User 276729:
Book 4 (ISBN: 052165615X): Rating 3
Book 5 (ISBN: 0521795028): Rating 6

Ratings for User 276733:


In [23]:
def sparse_to_libsvm(sparse_matrix, output_file):
    with open(output_file, 'w') as f:
        for i in range(sparse_matrix.shape[0]):
            row = sparse_matrix.getrow(i)
            line = f"{i+1}" # User ID (1-indexed)  
            for j, v in zip(row.indices, row.data):
                line += f" {j+1}:{int(v)}" # Book ID (1-indexed) : Rating 
            f.write(line + '\n') 

In [24]:
sparse_to_libsvm(ratings_matrix, 'user_book_ratings.libsvm')
print("\nSaved user-book ratings in libsvm format to 'user_book_ratings.libsvm'")


Saved user-book ratings in libsvm format to 'user_book_ratings.libsvm'


In the Libsvm file the first column represents the users in ascending order follwed by the ratings given by them to
different books.