In [1]:
import polars as ps
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw interactions table from CSV (path is relative to the notebook's
# working directory).
df = ps.read_csv('data/goodreads_interactions.csv')

In [3]:
# Preview the first rows to check the column names and dtypes.
df.head()

user_id,book_id,is_read,rating,is_reviewed
i64,i64,i64,i64,i64
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0


In [4]:
# Sum the is_read flags per user: one output row per user_id.
users = df.group_by('user_id').agg(ps.col('is_read').sum())

In [5]:
# Sum the is_read flags per book: one output row per book_id.
books = df.group_by('book_id').agg(ps.col('is_read').sum())

In [6]:
# Summary statistics of the per-user is_read totals.
users.describe()

statistic,user_id,is_read
str,f64,f64
"""count""",876145.0,876145.0
"""null_count""",0.0,0.0
"""mean""",438072.0,127.982472
"""std""",252921.420137,241.612867
"""min""",0.0,0.0
"""25%""",219036.0,19.0
"""50%""",438072.0,54.0
"""75%""",657108.0,142.0
"""max""",876144.0,38895.0


In [7]:
# Preview a few per-user totals.
users.head()

user_id,is_read
i64,i64
445813,65
123882,38
565217,20
4070,510
167523,64


75% of users have read 19 or more books. In order not to skew the data toward very heavy readers, users who have read fewer than 25 books will be dropped and a random subset of the readers who have read 25 or more will be selected.

In [8]:
# Keep only users whose is_read total is at least 25.
users = users.filter(ps.col('is_read').ge(25))

In [9]:
# Re-check the distribution after dropping users below the threshold.
users.describe()

statistic,user_id,is_read
str,f64,f64
"""count""",610196.0,610196.0
"""null_count""",0.0,0.0
"""mean""",348066.765087,179.893926
"""std""",234447.416438,273.704438
"""min""",0.0,25.0
"""25%""",156351.0,50.0
"""50%""",313556.0,95.0
"""75%""",501091.0,202.0
"""max""",876144.0,38895.0


In [45]:
# Draw a reproducible random subset of 50,000 qualifying users.
# group_by does not guarantee row order, so sort first; otherwise the same seed
# would pick the same row positions but different users on each run.
user_ids = (
    users
    .select(ps.col('user_id'))
    .sort('user_id')
    .sample(n=50000, seed=42)
)

In [49]:
# Persist the sampled user ids to CSV in the current working directory.
user_ids.write_csv('user_subset.csv')

## Books

In [10]:
# Preview a few per-book is_read totals.
books.head()

book_id,is_read
i64,i64
1730387,4
2282891,1
751482,8
639286,13
950204,2


In [11]:
# Summary statistics of the per-book is_read totals.
books.describe()

statistic,book_id,is_read
str,f64,f64
"""count""",2360650.0,2360650.0
"""null_count""",0.0,0.0
"""mean""",1180324.5,47.500139
"""std""",681461.100819,848.946093
"""min""",0.0,0.0
"""25%""",590162.0,2.0
"""50%""",1180325.0,5.0
"""75%""",1770487.0,14.0
"""max""",2360649.0,285698.0


In [12]:
# Keep only books whose is_read total is at least 100, then summarize what is left.
books = books.filter(ps.col('is_read').ge(100))
books.describe()

statistic,book_id,is_read
str,f64,f64
"""count""",125672.0,125672.0
"""null_count""",0.0,0.0
"""mean""",168269.505888,703.613987
"""std""",160942.264963,3616.45257
"""min""",3.0,100.0
"""25%""",50370.0,138.0
"""50%""",118719.0,214.0
"""75%""",240830.0,438.0
"""max""",1510246.0,285698.0


In [53]:
# Draw a reproducible random subset of 1,000 qualifying books.
# group_by does not guarantee row order, so sort first; otherwise the same seed
# would pick the same row positions but different books on each run.
book_ids = (
    books
    .select(ps.col('book_id'))
    .sort('book_id')
    .sample(n=1000, seed=42)
)

In [54]:
# Display the sampled book ids.
book_ids

book_id
i64
323461
170046
101
164649
162702
…
742248
415406
223497
6204


In [55]:
# Persist the sampled book ids to CSV in the current working directory.
book_ids.write_csv('books_subset.csv')

In [13]:
# Keep only interactions whose book and user are both still present in the
# filtered `books` and `users` tables.
kept_book_ids = books.get_column('book_id')
kept_user_ids = users.get_column('user_id')
df = df.filter(
    ps.col('book_id').is_in(kept_book_ids)
    & ps.col('user_id').is_in(kept_user_ids)
)

In [60]:
# Save the filtered interactions to CSV; a later cell reads this file back in.
df.write_csv('interactions_subset_large.csv')

In [2]:
# Reload the filtered interactions from the CSV written earlier.
# NOTE(review): the execution counter restarts here, which suggests a fresh
# kernel; this cell relies on that file already existing on disk.
df = ps.read_csv('interactions_subset_large.csv')

In [3]:
# Keep only the three columns needed for the ratings table.
df = df.select('user_id', 'book_id', 'rating')

In [4]:
# Preview the trimmed interactions table.
df.head()

user_id,book_id,rating
i64,i64,i64
0,948,5
0,947,5
0,946,5
0,945,5
0,944,5


In [5]:
# Number of distinct books remaining.
df.get_column('book_id').n_unique()

125672

In [6]:
# Number of distinct users remaining.
df.get_column('user_id').n_unique()

610076

In [7]:
# Store ratings as 8-bit integers to shrink the table.
df = df.with_columns(rating=ps.col('rating').cast(ps.Int8))

In [8]:
# Display the frame (polars shows a truncated head/tail view).
df

user_id,book_id,rating
i64,i64,i8
0,948,5
0,947,5
0,946,5
0,945,5
0,944,5
…,…,…
876144,38802,0
876144,38968,0
876144,23847,4
876144,23950,3


In [9]:
# Summary statistics; used to check the id and rating value ranges.
df.describe()

statistic,user_id,book_id,rating
str,f64,f64,f64
"""count""",161666559.0,161666559.0,161666559.0
"""null_count""",0.0,0.0,0.0
"""mean""",245188.905232,62067.768136,1.991295
"""std""",165654.703148,97845.859061,2.09733
"""min""",0.0,3.0,0.0
"""25%""",112520.0,7161.0,0.0
"""50%""",231378.0,21266.0,1.0
"""75%""",354887.0,73911.0,4.0
"""max""",876144.0,1510246.0,5.0


In [10]:
# Downcast both id columns to 32-bit integers in one expression.
df = df.with_columns(ps.col(['user_id', 'book_id']).cast(ps.Int32))

In [11]:
# Confirm the new dtypes after the casts.
df.head()

user_id,book_id,rating
i32,i32,i8
0,948,5
0,947,5
0,946,5
0,945,5
0,944,5


In [12]:
# Reshape long -> wide: one row per user_id, one column per book_id, and each
# cell holds that user's rating for that book.
df = df.pivot(
    values='rating',
    index='user_id',
    columns='book_id',
)

In [13]:
import pathlib

# Write the wide ratings table to Parquet.
# Build the destination as a real Path and pass that same object to
# write_parquet, so the variable and the file written cannot drift apart.
path = pathlib.Path("ratings_subset_large.parquet")
df.write_parquet(path)

In [14]:
# Preview the wide table.
df.head()

user_id,948,947,946,945,944,943,942,941,940,939,938,937,936,935,933,932,931,930,929,928,927,924,923,921,920,919,918,916,915,914,912,909,908,907,906,905,…,1168157,927527,1181800,1183906,1184277,1184258,1185901,1185862,1188021,1190981,1204475,1215785,1218934,1221111,1223627,1223611,1225459,1226479,1227128,1229072,1234489,1238445,1238830,1240358,1243608,1248448,1274292,1295869,1303636,1307598,1333095,1340190,1352127,1380905,1415581,1451636,1510246
i32,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,…,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8
0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0,2.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,0.0,0.0,4.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,4.0,5.0,,5.0,,4.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,5.0,,,,3.0,,3.0,,3.0,3.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
# Drop the user_id key so only the per-book rating columns remain.
# NOTE(review): despite the name, this is still an ordinary polars DataFrame,
# not a sparse structure.
df_sparse = df.drop('user_id')

In [17]:
from scipy import sparse

In [18]:
# Preview the rating columns after user_id was dropped.
df_sparse.head()

948,947,946,945,944,943,942,941,940,939,938,937,936,935,933,932,931,930,929,928,927,924,923,921,920,919,918,916,915,914,912,909,908,907,906,905,904,…,1168157,927527,1181800,1183906,1184277,1184258,1185901,1185862,1188021,1190981,1204475,1215785,1218934,1221111,1223627,1223611,1225459,1226479,1227128,1229072,1234489,1238445,1238830,1240358,1243608,1248448,1274292,1295869,1303636,1307598,1333095,1340190,1352127,1380905,1415581,1451636,1510246
i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,…,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8,i8
5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0,2.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,0.0,0.0,4.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,4.0,5.0,,5.0,,4.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,5.0,,,,3.0,,3.0,,3.0,3.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Convert the wide ratings frame to a NumPy array.
# NOTE(review): to_numpy() returns a dense ndarray, not a SciPy sparse matrix,
# so the `df_sparse` name is misleading. With roughly 610k users and 126k books
# counted earlier, a dense array is unlikely to fit in memory. Consider building
# a scipy.sparse matrix from the long-format (user_id, book_id, rating) table
# instead. TODO confirm the intent.
df_sparse = df_sparse.to_numpy()

In [None]:
# Display the converted array (NumPy abbreviates large arrays in its repr).
df_sparse