In [1]:
import pandas as pd
import csv
from scipy.sparse import coo_matrix


In [None]:
# Step 1: Load the CSV files
ratings_df = pd.read_csv("Ratings.csv", sep=";")
users_df = pd.read_csv("Users.csv", sep=";", low_memory=False)
books_df = pd.read_csv("Books.csv", delimiter=";")
ratings_df


Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [3]:
users_df

Unnamed: 0,User-ID,Age
0,1,
1,2,18
2,3,
3,4,17
4,5,
...,...,...
278854,278854,
278855,278855,50
278856,278856,
278857,278857,


In [4]:
books_df

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company
...,...,...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [5]:
#Data Cleaning. We'll need to check if there are any missing values for all of them. If yes, we'll fill them with some values
ratings_df.isna().sum()

User-ID    0
ISBN       0
Rating     0
dtype: int64

In [6]:
users_df.isna().sum()

User-ID         0
Age        110232
dtype: int64

In [7]:
users_df["Age"] = users_df["Age"].apply(pd.to_numeric, errors='coerce')
#Now again check the count of the NaN values
users_df.isna().sum()

User-ID         0
Age        111708
dtype: int64

In [8]:
users_df["Age"].fillna(users_df["Age"].mean(), inplace=True)
users_df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  users_df["Age"].fillna(users_df["Age"].mean(), inplace=True)


User-ID    0
Age        0
dtype: int64

In [9]:
#Ensuring that the Age is Numeric
users_df["Age"] = users_df["Age"].astype('int')
books_df.isna().sum()

ISBN         0
Title        0
Author       2
Year         0
Publisher    2
dtype: int64

In [10]:
books_df["Author"].fillna("Unknown Author",inplace=True)
books_df["Publisher"].fillna("Unknown Publisher",inplace=True)
books_df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books_df["Author"].fillna("Unknown Author",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  books_df["Publisher"].fillna("Unknown Publisher",inplace=True)


ISBN         0
Title        0
Author       0
Year         0
Publisher    0
dtype: int64

In [11]:
#Checking if there are any duplicate values in the dataframes
users_df.duplicated().any()
books_df.duplicated().any()
ratings_df.duplicated().any()

False

In [12]:
#Dropping the duplicate value from the books_df
books_df.drop_duplicates(inplace=True)

In [13]:
#Creating a mapping for each userids and isbnsto numeric indices
user_map = {user_id:idx for idx,user_id in enumerate(ratings_df["User-ID"].unique())}
book_map = {isbn:idx for idx, isbn in enumerate(ratings_df["ISBN"].unique())}
#Now mapping the Userids and isbns to indices
ratings_df["user_indx"] = ratings_df["User-ID"].map(user_map)
ratings_df["book_indx"] = ratings_df["ISBN"].map(book_map)

In [14]:
ratings_df['user_indx']

0               0
1               1
2               2
3               3
4               3
            ...  
1149775    105278
1149776    105279
1149777    105280
1149778    105281
1149779    105282
Name: user_indx, Length: 1149780, dtype: int64

In [15]:
ratings_df['book_indx']

0               0
1               1
2               2
3               3
4               4
            ...  
1149775    226347
1149776      7295
1149777     12065
1149778     78598
1149779    340555
Name: book_indx, Length: 1149780, dtype: int64

In [16]:
sparse_matrix = coo_matrix(
    (ratings_df["Rating"], (ratings_df["user_indx"], ratings_df["book_indx"])),
    shape=(len(user_map), len(book_map)),
)
print(sparse_matrix)

<COOrdinate sparse matrix of dtype 'int64'
	with 1149780 stored elements and shape (105283, 340556)>
  Coords	Values
  (0, 0)	0
  (1, 1)	5
  (2, 2)	0
  (3, 3)	3
  (3, 4)	6
  (4, 5)	0
  (5, 6)	8
  (6, 7)	6
  (7, 8)	7
  (8, 9)	10
  (9, 10)	0
  (9, 11)	0
  (9, 12)	0
  (9, 13)	0
  (9, 14)	0
  (9, 15)	0
  (10, 16)	9
  (10, 17)	0
  (10, 18)	0
  (10, 19)	9
  (10, 20)	8
  (10, 21)	7
  (10, 22)	0
  (10, 23)	7
  (11, 24)	6
  :	:
  (105276, 109199)	0
  (105276, 255395)	0
  (105276, 340552)	0
  (105277, 146362)	0
  (105278, 50527)	0
  (105278, 34599)	0
  (105278, 3111)	6
  (105278, 50534)	5
  (105278, 56824)	0
  (105278, 12047)	0
  (105278, 26864)	0
  (105278, 20167)	0
  (105278, 46617)	0
  (105278, 8545)	0
  (105278, 195285)	0
  (105278, 340553)	0
  (105278, 8851)	7
  (105278, 2104)	0
  (105278, 340554)	5
  (105278, 284681)	0
  (105278, 226347)	9
  (105279, 7295)	0
  (105280, 12065)	10
  (105281, 78598)	10
  (105282, 340555)	8


In [None]:
# Writing the sparse matrix in libsvm format
pd.DataFrame.sparse.from_spmatrix(sparse_matrix)
with open("output.libsvm", "w") as file:
    for user_indx in range(sparse_matrix.shape[0]):
        # Get all non-zero ratings for the user
        row = sparse_matrix.getrow(user_indx).tocoo()
        ratings = [f"{book_indx+1}:{rating}" for book_indx, rating in zip(row.col, row.data)]
        # Write the line (e.g., "0 1:5 3:10")
        file.write(f"{user_indx} " + " ".join(ratings) + "\n")

print("Libsvm file has been created successfully: output.libsvm")