In [None]:
import os

import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.datasets import dump_svmlight_file

# --- Configuration ---
# Input/output locations, relative to the notebook's working directory.
RAW_DATA_DIR = "../data/raw/"
PROCESSED_DATA_DIR = "../data/processed/"
OUTPUT_FILE = "user_item_rating_matrix.libsvm"

# --- Load Data ---
print("Loading raw data...\n")

# All three raw CSVs are parsed with the same settings: semicolon-delimited,
# latin-1 encoded, and malformed rows are skipped instead of raising.
read_options = {
    "delimiter": ";",
    "low_memory": False,
    "on_bad_lines": "skip",
    "encoding": "latin-1",
}
books_df = pd.read_csv(f"{RAW_DATA_DIR}Books.csv", **read_options)
users_df = pd.read_csv(f"{RAW_DATA_DIR}Users.csv", **read_options)
ratings_df = pd.read_csv(f"{RAW_DATA_DIR}Ratings.csv", **read_options)

# Show the head of each frame as a quick sanity check of the parse.
previews = (
    ("First 5 rows of Books", books_df),
    ("First 5 rows of Users", users_df),
    ("First 5 rows of Ratings", ratings_df),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())

Loading raw data...

First 5 rows of Books


Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


First 5 rows of Users


Unnamed: 0,User-ID,Age
0,1,
1,2,18.0
2,3,
3,4,17.0
4,5,


First 5 rows of Ratings


Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [20]:
# --- Preprocessing ---
print("Starting preprocessing...")

# Rename by label rather than overwriting the whole column list by position.
# A positional `ratings_df.columns = [...]` raises a length-mismatch error as
# soon as the frame has gained extra columns (the index-mapping cell adds two),
# so the cell could not be re-run. `rename` ignores labels that are absent,
# which makes this cell idempotent and independent of column order.
# 'ISBN' and 'Rating' already carry the desired names in the raw file.
ratings_df = ratings_df.rename(columns={'User-ID': 'UserID'})

ratings_df.head()

Starting preprocessing...


Unnamed: 0,UserID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
# Create unique, zero-based index mappings for users and books that appear in the ratings.
# Indexes follow first-appearance order in ratings_df.
user_to_index = {user_id: idx for idx, user_id in enumerate(ratings_df['UserID'].unique())}
book_to_index = {isbn: idx for idx, isbn in enumerate(ratings_df['ISBN'].unique())}

# Apply the new indexes to the ratings dataframe
ratings_df['UserIndex'] = ratings_df['UserID'].map(user_to_index)
ratings_df['BookIndex'] = ratings_df['ISBN'].map(book_to_index)

# scipy sums duplicate (row, col) entries when a COO matrix is converted to
# CSR/CSC, so a user/book pair that occurs more than once would end up with an
# inflated rating. Build the matrix from a de-duplicated view instead;
# ratings_df itself is left untouched.
# NOTE(review): keeping the last occurrence is a policy choice - confirm it is
# the desired one (alternatives: first occurrence, mean).
unique_ratings = ratings_df.drop_duplicates(subset=['UserIndex', 'BookIndex'], keep='last')

# NOTE(review): the raw data contains Rating == 0 rows; they are stored here as
# explicit zero entries. Confirm whether 0 is a real rating or a placeholder.

# Construct the sparse matrix from the indexes and ratings.
# This is a memory-efficient way to store the user-item interactions.
# The shape is passed explicitly (one row per user, one column per book) rather
# than being inferred from the largest index present.
sparse_matrix = coo_matrix(
    (unique_ratings['Rating'], (unique_ratings['UserIndex'], unique_ratings['BookIndex'])),
    shape=(len(user_to_index), len(book_to_index)),
)

print(f"Sparse matrix created with shape: {sparse_matrix.shape}")


Sparse matrix created with shape: (105283, 340556)


In [23]:
# --- Save Processed Data ---
# Build the destination once so the logged path and the written path cannot drift apart.
output_path = f"{PROCESSED_DATA_DIR}{OUTPUT_FILE}"
print(f"Saving processed data to {output_path}...")

# The processed-data directory may not exist yet (e.g. on a fresh checkout);
# create it so the write below does not fail with FileNotFoundError.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# The svmlight format requires one label per row; the matrix has no target
# variable, so a placeholder label of 0 is written for every user row.
dump_svmlight_file(sparse_matrix, [0] * sparse_matrix.shape[0], output_path)

print("\nPreprocessing complete!")

Saving processed data to ../data/processed/user_item_rating_matrix.libsvm...

Preprocessing complete!
