In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Read Data

In [6]:
df1 = pd.read_excel("jester-data-1.xls", header=None)
df1.drop(columns=[0], inplace=True)
df1 = df1.stack().reset_index()
df1.columns = ["user_id", "item_id", "rating"]

df2 = pd.read_excel("jester-data-2.xls", header=None)
df2.drop(columns=[0], inplace=True)
df2 = df2.stack().reset_index()
df2.columns = ["user_id", "item_id", "rating"]

df3 = pd.read_excel("jester-data-3.xls", header=None)
df3.drop(columns=[0], inplace=True)
df3 = df3.stack().reset_index()
df3.columns = ["user_id", "item_id", "rating"]

# Construct Rating Data 

In [7]:
n_users_1 = df1["user_id"].nunique()
n_users_2 = df2["user_id"].nunique()
n_users_3 = df3["user_id"].nunique()

user_mapping_1 = {b: a for a, b in enumerate(range(n_users_1))}
user_mapping_2 = {b: a+n_users_1 for a, b in enumerate(range(n_users_2))}
user_mapping_3 = {b: a+n_users_1+n_users_2 for a, b in enumerate(range(n_users_3))}

df1["user_id"] = df1["user_id"].map(user_mapping_1)
df2["user_id"] = df2["user_id"].map(user_mapping_2)
df3["user_id"] = df3["user_id"].map(user_mapping_3)
df = df1.append(df2).append(df3)

df["item_id"] = df["item_id"].apply(lambda v: v-1)
df.astype({"user_id": int, "item_id": int})

Unnamed: 0,user_id,item_id,rating
0,0,0,-7.82
1,0,1,8.79
2,0,2,-9.66
3,0,3,-8.16
4,0,4,-7.52
5,0,5,-8.50
6,0,6,-9.85
7,0,7,4.17
8,0,8,-8.98
9,0,9,-4.76


In [8]:
df["rating"].replace(99, np.nan, inplace=True)
df.dropna(inplace=True)

ratings = []
for _, group_df in df.groupby("user_id"):
    scaled_ratings = MinMaxScaler(feature_range=(1, 5)).fit_transform(group_df["rating"].values.reshape(-1, 1))
    ratings_per_user_df = group_df.copy()
    ratings_per_user_df["rating"] = scaled_ratings
    ratings.extend(ratings_per_user_df.to_records(index=False).tolist())
    
ratings_df = pd.DataFrame.from_records(ratings, columns=["user_id", "item_id", "rating"])
ratings_df.head()

Unnamed: 0,user_id,item_id,rating
0,0,0,1.427819
1,0,1,4.928346
2,0,2,1.040042
3,0,3,1.356164
4,0,4,1.491043


# Save Datasets

In [9]:
n = len(ratings_df)
n_val = int(np.ceil(n * 0.1))
n_test = n_val
n_train = n - n_val - n_test

train_df, valtest_df = train_test_split(ratings_df, test_size=n_val+n_test)
val_df, test_df = train_test_split(valtest_df, test_size=n_test)

train_df.to_csv("../jester.train.rating", sep="\t", index=False, header=False)
val_df.to_csv("../jester.valid.rating", sep="\t", index=False, header=False)
test_df.to_csv("../jester.test.rating", sep="\t", index=False, header=False)

pd.DataFrame(ratings_df["user_id"].unique()).to_csv("../jester.userlist", index=False, header=None)
pd.DataFrame(ratings_df["item_id"].unique()).to_csv("../jester.itemlist", index=False, header=None)

In [16]:
print("[Jester] number of users: %d, avg. number of ratings per user: %f" % (n, ratings_df.groupby("user_id").size().mean()))

[Jester] number of users: 4136360, avg. number of ratings per user: 56.337560
