In [1]:
import numpy as np
import pandas as pd

# Book Crossing
## Read dataset

In [2]:
books_df = pd.read_csv("data/BX-CSV-Dump/BX-Books.csv", sep=";", escapechar='\\', encoding="CP1252", usecols=["ISBN", "Book-Title"])
books_df.index.name = "book_id"
books_df.reset_index(drop=False, inplace=True)
books_df.head()

Unnamed: 0,book_id,ISBN,Book-Title
0,0,195153448,Classical Mythology
1,1,2005018,Clara Callan
2,2,60973129,Decision in Normandy
3,3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,4,393045218,The Mummies of Urumchi


In [3]:
ratings_df = pd.read_csv("data/BX-CSV-Dump/BX-Book-Ratings.csv", sep=";", encoding="CP1252")
ratings_df = ratings_df.merge(books_df, left_on="ISBN", right_on="ISBN")
ratings_df.columns = ["user_id", "isbn", "rating", "book_id", "title"]
ratings_df.drop(columns=["isbn", "title"], inplace=True)
#ratings_df[ratings_df["rating"] == 0] = 5
ratings_df.head()

Unnamed: 0,user_id,rating,book_id
0,276725,0,2966
1,2313,5,2966
2,6543,0,2966
3,8680,5,2966
4,10314,9,2966


In [4]:
print("Nr. of users: %d" % ratings_df["user_id"].nunique())
print("Nr. of books: %d" % ratings_df["book_id"].nunique())
print("Nr. of ratings: %d" % len(ratings_df))
print("Avg. nr. of ratings per user: %.4f" % ratings_df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per book: %.4f" % ratings_df.groupby("book_id").size().mean())
print("Sparsity: %f%%" % (100 * (1 - len(ratings_df) / (ratings_df["user_id"].nunique() * ratings_df["book_id"].nunique()))))

Nr. of users: 92107
Nr. of books: 270170
Nr. of ratings: 1031175
Avg. nr. of ratings per user: 11.1954
Avg. nr. of ratings per book: 3.8168
Sparsity: 99.995856%


# Sampling

In [10]:
#book_popularity_df = ratings_df.groupby("book_id").size().sort_values(ascending=False).head(15000)
#books_sample = book_popularity_df[book_popularity_df > 20].index.tolist()
#sample_df = ratings_df[ratings_df["book_id"].isin(books_sample)]
book_popularity_df = ratings_df.groupby("book_id").size().sort_values(ascending=False)
sample_df = ratings_df[ratings_df["book_id"].isin(book_popularity_df.head(20000).index.tolist())]
print("Nr. of users: %d" % sample_df["user_id"].nunique())
print("Nr. of books: %d" % sample_df["book_id"].nunique())
print("Nr. of ratings: %d" % len(sample_df))
print("Avg. nr. of ratings per user: %.4f" % sample_df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per book: %.4f" % sample_df.groupby("book_id").size().mean())
print("Sparsity: %f%%" % (100 * (1 - len(sample_df) / (sample_df["user_id"].nunique() * sample_df["book_id"].nunique()))))

Nr. of users: 66538
Nr. of books: 20000
Nr. of ratings: 543087
Avg. nr. of ratings per user: 8.1621
Avg. nr. of ratings per book: 27.1544
Sparsity: 99.959190%


In [6]:
sample_df[["user_id", "book_id", "rating"]].to_csv("data/bx_sample.csv", index=False, sep=";")

In [7]:
np.random.seed(0)
user_sample = np.random.choice(ratings_df["user_id"].unique(), size=5000, replace=False)
sample_df = ratings_df[ratings_df["user_id"].isin(user_sample)]

In [8]:
print("Nr. of users: %d" % sample_df["user_id"].nunique())
print("Nr. of books: %d" % sample_df["book_id"].nunique())
print("Nr. of ratings: %d" % len(sample_df))
print("Avg. nr. of ratings per user: %.4f" % sample_df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per book: %.4f" % sample_df.groupby("book_id").size().mean())
print("Sparsity: %f%%" % (100 * (1 - len(sample_df) / (sample_df["user_id"].nunique() * sample_df["book_id"].nunique()))))

Nr. of users: 5000
Nr. of books: 33841
Nr. of ratings: 50857
Avg. nr. of ratings per user: 10.1714
Avg. nr. of ratings per book: 1.5028
Sparsity: 99.969944%


In [None]:
sample_df[["user_id", "book_id", "rating"]].to_csv("data/bx_sample.csv", index=False, sep=";")

In [None]:
pd.read_csv("data/bx_sample.csv", sep=";")

# Anime Small

In [3]:
df = pd.read_csv("data/anime_small/rating.csv")
df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
len(df), df["user_id"].nunique(), df["anime_id"].nunique()

(7813737, 73515, 11200)

In [9]:
len(df[df["rating"] != -1]), df[df["rating"] != -1]["user_id"].nunique(), df[df["rating"] != -1]["anime_id"].nunique()

(6337241, 69600, 9927)

In [11]:
df[df["rating"] != -1].to_csv("data/anime_small.csv", sep=";", header=False, index=False)