In [None]:
import pandas as pd
import csv
from scipy.sparse import coo_matrix


In [None]:
# Step 1: read the three semicolon-separated source tables into DataFrames.
books_df = pd.read_csv("Books.csv", sep=";")
# low_memory=False turns off chunked parsing for the users file.
users_df = pd.read_csv("Users.csv", low_memory=False, sep=";")
ratings_df = pd.read_csv("Ratings.csv", sep=";")
# Show the ratings table as this cell's output.
ratings_df


In [None]:
# Display the users table as this cell's output.
users_df

In [None]:
# Display the books table as this cell's output.
books_df

In [None]:
# Data cleaning: count the missing values in each column of every table.
# Any gaps found are filled in by later cells. Start with the ratings table.
ratings_df.isnull().sum()

In [None]:
# Missing-value count per column of the users table.
users_df.isnull().sum()

In [None]:
# Convert Age to a numeric dtype; values that cannot be parsed become NaN
# (errors="coerce"). pd.to_numeric is given the whole column at once instead of
# being applied element by element.
users_df["Age"] = pd.to_numeric(users_df["Age"], errors="coerce")
# Check the NaN counts again, now including any coerced Age values.
users_df.isna().sum()

In [None]:
# Replace missing ages with the mean age.
# The result is assigned back to the column rather than using
# `users_df["Age"].fillna(..., inplace=True)`: that form works on an intermediate
# Series, raises a FutureWarning on pandas 2.x, and leaves users_df unchanged
# when Copy-on-Write is enabled.
users_df["Age"] = users_df["Age"].fillna(users_df["Age"].mean())
# Confirm that Age no longer has missing values.
users_df.isna().sum()

In [None]:
# Store Age as an integer column (the cast truncates any fractional part).
users_df = users_df.astype({"Age": int})
# Next table to inspect: missing values per column in the books table.
books_df.isnull().sum()

In [None]:
# Fill missing authors and publishers with placeholder labels.
# The results are assigned back to the columns: calling fillna(..., inplace=True)
# on `books_df[col]` acts on an intermediate Series and does not update books_df
# when Copy-on-Write is enabled (and warns on pandas 2.x).
books_df["Author"] = books_df["Author"].fillna("Unknown Author")
books_df["Publisher"] = books_df["Publisher"].fillna("Unknown Publisher")
# Confirm the remaining missing-value counts for the books table.
books_df.isna().sum()

In [None]:
# Check whether each DataFrame contains fully duplicated rows.
# A notebook cell only displays its last expression, so the three checks are
# collected into one Series; otherwise only the ratings result would be shown.
pd.Series(
    {
        "users_df": users_df.duplicated().any(),
        "books_df": books_df.duplicated().any(),
        "ratings_df": ratings_df.duplicated().any(),
    },
    name="has_duplicates",
)

In [None]:
# Remove fully duplicated rows from the books table (first occurrence is kept).
books_df = books_df.drop_duplicates()

In [None]:
# Give every distinct User-ID in the ratings a consecutive integer index
# (0, 1, 2, ... in order of first appearance) and store it per rating row.
user_map = {uid: pos for pos, uid in enumerate(pd.unique(ratings_df["User-ID"]))}
ratings_df["user_indx"] = ratings_df["User-ID"].map(user_map)

# Same for the ISBNs: distinct ISBN -> consecutive integer index.
book_map = {code: pos for pos, code in enumerate(pd.unique(ratings_df["ISBN"]))}
ratings_df["book_indx"] = ratings_df["ISBN"].map(book_map)

In [None]:
ratings_df['user_indx']

In [None]:
ratings_df['book_indx']

In [None]:
# Assemble the user x book rating matrix in COO format: each rating row becomes
# one stored entry at position (user_indx, book_indx).
sparse_matrix = coo_matrix(
    (
        ratings_df["Rating"].to_numpy(),
        (ratings_df["user_indx"].to_numpy(), ratings_df["book_indx"].to_numpy()),
    ),
    shape=(len(user_map), len(book_map)),
)
# Print the stored (row, col) -> value entries.
print(sparse_matrix)

In [None]:
# Write the rating matrix to disk in libsvm format, one line per user:
#   "<user_indx> <book_indx+1>:<rating> <book_indx+1>:<rating> ..."
#
# The matrix is converted to CSR once before the loop. COO has no efficient row
# access, so calling getrow() on it for every user repeats a whole-matrix
# conversion per iteration. The earlier pd.DataFrame.sparse.from_spmatrix(...)
# call was removed: its result was discarded, so it only cost time and memory.
ratings_csr = sparse_matrix.tocsr()
ratings_csr.sort_indices()  # keep each user's book indices in ascending order
# NOTE(review): entries stored with an explicit rating of 0 are written as-is;
# confirm that this is the intended output for zero ratings.
with open("output.libsvm", "w") as file:
    for user_indx in range(ratings_csr.shape[0]):
        # indptr[i]:indptr[i + 1] delimits the stored entries of row i
        start = ratings_csr.indptr[user_indx]
        end = ratings_csr.indptr[user_indx + 1]
        # Feature indices are written 1-based (book_indx + 1)
        ratings = [
            f"{book_indx+1}:{rating}"
            for book_indx, rating in zip(
                ratings_csr.indices[start:end], ratings_csr.data[start:end]
            )
        ]
        # Write the line (e.g., "0 1:5 3:10")
        file.write(f"{user_indx} " + " ".join(ratings) + "\n")

print("Libsvm file has been created successfully: output.libsvm")