In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Douban

In [3]:
# Load the three Douban rating splits. Each file is tab-separated with no
# header row, so the column names are supplied explicitly.
db_train_df = pd.read_csv("data/douban/db.train.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
db_val_df = pd.read_csv("data/douban/db.valid.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
db_test_df = pd.read_csv("data/douban/db.test.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])

# Stack train/valid/test into one frame. pd.concat replaces DataFrame.append
# (removed in pandas 2.0) and, like append, keeps each split's own row labels.
douban_df = pd.concat([db_train_df, db_val_df, db_test_df])
douban_df.head()

Unnamed: 0,user_id,item_id,rating
0,0,99,4.0
1,0,16,5.0
2,0,142,4.0
3,0,120,4.0
4,0,135,5.0


In [4]:
# Summary statistics for the combined Douban ratings; each quantity is
# computed once and then reported.
db_n_ratings = len(douban_df)
db_n_users = douban_df["user_id"].nunique()
db_n_items = douban_df["item_id"].nunique()
db_ratings_per_user = douban_df.groupby("user_id").size()
db_ratings_per_item = douban_df.groupby("item_id").size()

print("Nr. of ratings: %d" % db_n_ratings)
print("Nr. of users: %d" % db_n_users)
print("Nr. of items: %d" % db_n_items)
# Density = number of ratings divided by the size of the full user x item grid.
print("Density: %f" % (db_n_ratings / (db_n_users * db_n_items)))
print("Avg. nr. of ratings per user: %f" % db_ratings_per_user.mean())
print("Avg. nr. of ratings per item: %f" % db_ratings_per_item.mean())

Nr. of ratings: 893575
Nr. of users: 2509
Nr. of items: 39576
Density: 0.008999
Avg. nr. of ratings per user: 356.147868
Avg. nr. of ratings per item: 22.578709


In [None]:
# Write the combined Douban ratings as ';'-separated CSV without header or index.
# douban_df is named explicitly: a bare `df` is not assigned until the
# Goodreads section, so it would be undefined here on a top-to-bottom run.
douban_df.to_csv("data/douban/sample.csv", sep=";", header=None, index=False)

# Ciao

In [5]:
# Load the three Ciao rating splits. Each file is tab-separated with no
# header row, so the column names are supplied explicitly.
ciao_train_df = pd.read_csv("data/ciao/ciao.train.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
ciao_val_df = pd.read_csv("data/ciao/ciao.valid.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])
ciao_test_df = pd.read_csv("data/ciao/ciao.test.rating", sep="\t", header=None, names=["user_id", "item_id", "rating"])

# Stack train/valid/test into one frame. pd.concat replaces DataFrame.append
# (removed in pandas 2.0) and, like append, keeps each split's own row labels.
ciao_df = pd.concat([ciao_train_df, ciao_val_df, ciao_test_df])
ciao_df.head()

Unnamed: 0,user_id,item_id,rating
0,0,670,5.0
1,0,159,5.0
2,0,120,4.0
3,0,75,4.0
4,0,427,4.0


In [6]:
# Summary statistics for the combined Ciao ratings; each quantity is
# computed once and then reported.
ciao_n_ratings = len(ciao_df)
ciao_n_users = ciao_df["user_id"].nunique()
ciao_n_items = ciao_df["item_id"].nunique()
ciao_ratings_per_user = ciao_df.groupby("user_id").size()
ciao_ratings_per_item = ciao_df.groupby("item_id").size()

print("Nr. of ratings: %d" % ciao_n_ratings)
print("Nr. of users: %d" % ciao_n_users)
print("Nr. of items: %d" % ciao_n_items)
# Density = number of ratings divided by the size of the full user x item grid.
print("Density: %f" % (ciao_n_ratings / (ciao_n_users * ciao_n_items)))
print("Avg. nr. of ratings per user: %f" % ciao_ratings_per_user.mean())
print("Avg. nr. of ratings per item: %f" % ciao_ratings_per_item.mean())

Nr. of ratings: 282619
Nr. of users: 7375
Nr. of items: 105096
Density: 0.000365
Avg. nr. of ratings per user: 38.321220
Avg. nr. of ratings per item: 2.689151


In [None]:
# Write the combined Ciao ratings as ';'-separated CSV without header or index.
# ciao_df is named explicitly: a bare `df` is not assigned until the
# Goodreads section, so it would be undefined here on a top-to-bottom run.
ciao_df.to_csv("data/ciao/sample.csv", sep=";", header=None, index=False)

# Goodreads

In [19]:
# Read only the three columns needed from the Goodreads interactions file.
df = pd.read_csv("data/goodreads/goodreads_interactions.csv", sep=",", usecols=["user_id", "book_id", "rating"])
# Rename by column name and select the order explicitly. read_csv ignores the
# order given in `usecols` and returns columns in file order, so assigning
# names positionally could silently mislabel columns.
df = df.rename(columns={"book_id": "item_id"})[["user_id", "item_id", "rating"]]
df.head()

Unnamed: 0,user_id,item_id,rating
0,0,948,5
1,0,947,5
2,0,946,5
3,0,945,5
4,0,944,5


In [20]:
# Keep only rows whose rating differs from 0
# (presumably 0 marks an interaction without a rating — TODO confirm).
df = df.loc[df["rating"].ne(0)]

In [21]:
# Draw a random subset of users and keep only their ratings.
# A seeded generator makes the sample (and every statistic below) reproducible
# across kernel restarts; the unseeded global RNG gave a different sample each run.
rng = np.random.default_rng(42)
all_users = df["user_id"].unique()
# Cap the sample size: choice(..., replace=False) raises if asked for more
# users than exist.
user_sample = rng.choice(all_users, size=min(20000, len(all_users)), replace=False)
df = df[df["user_id"].isin(user_sample)]
print("Nr. of ratings: %d" % len(df))
print("Nr. of users: %d" % df["user_id"].nunique())
print("Nr. of items: %d" % df["item_id"].nunique())
# Density = number of ratings divided by the size of the full user x item grid.
print("Density: %f" % (len(df) / (df["user_id"].nunique() * df["item_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % df.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % df.groupby("item_id").size().mean())

Nr. of ratings: 2535555
Nr. of users: 20000
Nr. of items: 509246
Density: 0.000249
Avg. nr. of ratings per user: 126.777750
Avg. nr. of ratings per item: 4.979038


In [6]:
# Write the current Goodreads frame `df` to disk:
# ';'-separated, no header row, no index column.
df.to_csv(
    "data/goodreads/sample.csv",
    sep=";",
    header=None,
    index=False,
)

# LFM User Groups

In [7]:
# Read the tab-separated LFM user-event file. It has no header row, so the
# five column names are supplied here.
df = pd.read_csv(
    "data/lfm/user_events.txt",
    sep="\t",
    header=None,
    names=["user_id", "artist_id", "album_id", "track_id", "timestamp"],
)
df.head()

Unnamed: 0,user_id,artist_id,album_id,track_id,timestamp
0,31435741,2,4,4,1385212958
1,31435741,2,4,4,1385212642
2,31435741,2,4,4,1385212325
3,31435741,2,4,4,1385209508
4,31435741,2,4,4,1385209191


In [9]:
# Count the number of events per (user, artist) pair.
df_ = df.groupby(["user_id", "artist_id"]).size().reset_index(name='count')

# Rescale each user's counts independently to the range [1, 1000].
# The per-user frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# re-copied the whole accumulated frame on every iteration.
scaled_groups = []
for user_id, group in df_.groupby("user_id"):
    scaler = MinMaxScaler(feature_range=(1, 1000))
    scaled_ratings = scaler.fit_transform(group['count'].values.reshape(-1, 1).astype(float))
    # fit_transform returns an (n, 1) array; flatten it before storing it as
    # the new 'count' column of a copy of this user's rows.
    scaled_groups.append(group.assign(count=scaled_ratings.ravel()))
scaled_df_events = pd.concat(scaled_groups)

print("Nr. of ratings: %d" % len(scaled_df_events))
print("Nr. of users: %d" % scaled_df_events["user_id"].nunique())
print("Nr. of items: %d" % scaled_df_events["artist_id"].nunique())
# Density = number of ratings divided by the size of the full user x artist grid.
print("Density: %f" % (len(scaled_df_events) / (scaled_df_events["user_id"].nunique() * scaled_df_events["artist_id"].nunique())))
print("Avg. nr. of ratings per user: %f" % scaled_df_events.groupby("user_id").size().mean())
print("Avg. nr. of ratings per item: %f" % scaled_df_events.groupby("artist_id").size().mean())

# Rename to the (user_id, item_id, rating) layout used by the other datasets
# and export as ';'-separated CSV without header or index.
scaled_df_events.columns = ["user_id", "item_id", "rating"]
scaled_df_events.to_csv("data/lfm/artist_ratings.csv", sep=";", index=False, header=None)

Nr. of ratings: 1755361
Nr. of users: 3000
Nr. of items: 352805
Density: 0.001658
Avg. nr. of ratings per user: 585.120333
Avg. nr. of ratings per item: 4.975443
