In [1]:
import numpy as np
import pandas as pd

# Book Crossing
## Read dataset

In [2]:
books_df = pd.read_csv("data/BX-CSV-Dump/BX-Books.csv", sep=";", escapechar='\\', encoding="CP1252", usecols=["ISBN", "Book-Title"])
books_df.index.name = "book_id"
books_df.reset_index(drop=False, inplace=True)
books_df.head()

Unnamed: 0,book_id,ISBN,Book-Title
0,0,195153448,Classical Mythology
1,1,2005018,Clara Callan
2,2,60973129,Decision in Normandy
3,3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,4,393045218,The Mummies of Urumchi


In [19]:
ratings_df = pd.read_csv("data/BX-CSV-Dump/BX-Book-Ratings.csv", sep=";", encoding="CP1252")
ratings_df = ratings_df.merge(books_df, left_on="ISBN", right_on="ISBN")
ratings_df.columns = ["user_id", "isbn", "rating", "book_id", "title"]
ratings_df.drop(columns=["isbn", "title"], inplace=True)
ratings_df = ratings_df[ratings_df["rating"] > 0]
ratings_df.head()

Unnamed: 0,user_id,rating,book_id
1,2313,5,2966
3,8680,5,2966
4,10314,9,2966
9,50403,9,2966
13,63970,8,2966


In [20]:
print("Nr. of users: %d" % ratings_df["user_id"].nunique())
print("Nr. of books: %d" % ratings_df["book_id"].nunique())
print("Nr. of ratings: %d" % len(ratings_df))
print("Avg. nr. of ratings per user: %.4f" % ratings_df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per book: %.4f" % ratings_df.groupby("book_id").size().mean())
print("Sparsity: %f%%" % (100 * (1 - len(ratings_df) / (ratings_df["user_id"].nunique() * ratings_df["book_id"].nunique()))))

Nr. of users: 68092
Nr. of books: 149842
Nr. of ratings: 383852
Avg. nr. of ratings per user: 5.6373
Avg. nr. of ratings per book: 2.5617
Sparsity: 99.996238%


## Sampling

In [21]:
np.random.seed(0)
user_sample = np.random.choice(ratings_df["user_id"].unique(), size=4100, replace=False)
sample_df = ratings_df[ratings_df["user_id"].isin(user_sample)]

In [22]:
print("Nr. of users: %d" % sample_df["user_id"].nunique())
print("Nr. of books: %d" % sample_df["book_id"].nunique())
print("Nr. of ratings: %d" % len(sample_df))
print("Avg. nr. of ratings per user: %.4f" % sample_df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per book: %.4f" % sample_df.groupby("book_id").size().mean())
print("Sparsity: %f%%" % (100 * (1 - len(sample_df) / (sample_df["user_id"].nunique() * sample_df["book_id"].nunique()))))

Nr. of users: 4100
Nr. of books: 15631
Nr. of ratings: 20554
Avg. nr. of ratings per user: 5.0132
Avg. nr. of ratings per book: 1.3150
Sparsity: 99.967928%


In [23]:
sample_df = sample_df[["user_id", "book_id", "rating"]]
sample_df.to_csv("data/bx_sample.csv", index=False, header=None, sep=";")

# Xing
## Read dataset

In [24]:
items_df = pd.read_csv("data/Xing Dataset/items.csv", sep="\t", usecols=["item_id", "region", "title", "country"])
items_df.head()

Unnamed: 0,item_id,title,country,region
0,30,,de,7
1,70,29943006657629019381276553680343,de,1
2,80,32757011572284179835,de,1
3,90,2343474,de,3
4,100,2003688,at,0


In [26]:
smallest_region = items_df.groupby("region").size().sort_values().head(1).index.values[0]
items_df[items_df["region"] == smallest_region]

Unnamed: 0,item_id,title,country,region
309,5380,370690340851773073497,de,16
1016,17070,"1617655,1240666,800184,2635587,1940275,3572057...",de,16
1984,34500,"1090675,427470,157228,2188313,2398283,2038807,...",de,16
2274,39540,36148104330980271482221681661348641,de,16
2377,41340,12483581753029,de,16
...,...,...,...,...
1304822,2262648,1300462,de,16
1304893,2263948,1915442431305818681891041753,de,16
1304899,2264068,45858221323321,de,16
1305958,2283038,,de,16


In [28]:
largest_region = items_df.groupby("region").size().sort_values().tail(1).index.values[0]
items_df[items_df["region"] == largest_region]

Unnamed: 0,item_id,title,country,region
4,100,2003688,at,0
6,130,13858851196844,de,0
7,140,3739444294560132460514812840981623876666,ch,0
8,150,73256728532443105892304652,de,0
15,250,508564391283,ch,0
...,...,...,...,...
1306009,2283988,3859656210311443479662180632665762,de,0
1306040,2284518,14113093723110,at,0
1306044,2284618,42868738416151626234,de,0
1306050,2284728,4460049,de,0


In [29]:
users_df = pd.read_csv("data/Xing Dataset/users.csv", sep="\t", usecols=["recsyschallenge_v2017_users_final_anonym_export_unique.id", "recsyschallenge_v2017_users_final_anonym_export_unique.region"])
users_df.columns = ["user_id", "region_id"]
users_df.head()

Unnamed: 0,user_id,region_id
0,30,0
1,50,7
2,70,2
3,90,0
4,100,0


In [30]:
U = np.random.choice(users_df["user_id"].unique(), size=1527, replace=False)
U

array([1339913,  428396, 1837816, ..., 2178206, 2149304,  418920],
      dtype=int64)

In [31]:
interactions14_df = pd.read_csv("data/Xing Dataset/interactions_14.csv", sep="\t")
interactions5_df = pd.read_csv("data/Xing Dataset/interactions_5.csv", sep="\t")
interactions_df = interactions5_df.append(interactions14_df)
interactions_df.head()

Unnamed: 0,user_id,item_id,interaction_type,created_at
0,252360,4020,5,1485444245
1,709236,9860,5,1484906324
2,709236,9860,5,1484906324
3,709236,9860,5,1484906324
4,709236,9860,5,1484906324


In [32]:
interactions_df[interactions_df["user_id"].isin(U)]

Unnamed: 0,user_id,item_id,interaction_type,created_at
3120,1766761,859160,5,1485354563
3121,1766761,859160,5,1485354563
3122,1766761,859160,5,1485354563
3123,1766761,859160,5,1485354563
3124,1766761,859160,5,1485354563
...,...,...,...,...
8171640,884568,2278439,4,1483518645
8171901,2283621,2278439,4,1483605645
8172870,2321131,2280599,1,1484675198
8173013,1089403,2280599,4,1484586329
