In [1]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Directories: raw CSVs are read from raw_dir, outputs are written to pp_dir.
raw_dir, pp_dir = './raw/', './preprocessed/'

# Fractions of the filtered interactions assigned to each split.
train_p, val_p, test_p = 0.7, 0.1, 0.2

# Seed used when shuffling the interactions before splitting.
random_state = 0

In [3]:
# The split fractions must cover the whole dataset. Compare with a tolerance:
# sums of decimal fractions are generally not exact in binary floating point,
# so an exact `== 1.0` can fail for perfectly valid settings.
assert np.isclose(train_p + val_p + test_p, 1.0), (
    'train_p + val_p + test_p must sum to 1'
)

## Load raw interactions

In [4]:
# Read the three original interaction splits from raw_dir, in
# train / validation / test order.
int_train, int_validation, int_test = (
    pd.read_csv(os.path.join(raw_dir, csv_name))
    for csv_name in (
        'interactions_train.csv',
        'interactions_validation.csv',
        'interactions_test.csv',
    )
)

In [5]:
# Row counts of the original train / validation / test splits
tuple(map(len, (int_train, int_validation, int_test)))

(698901, 7023, 12455)

In [6]:
# Stack the original train/validation/test splits into one frame so that the
# filtering below is applied to every interaction (row labels are kept as-is).
int_all = pd.concat([int_train, int_validation, int_test], axis=0)

In [7]:
# Peek at the first five rows of the combined interactions
int_all.head(n=5)

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [8]:
# Number of distinct user (u) and item (i) indices in the full dataset
int_all.loc[:, ['u', 'i']].nunique()

u     25076
i    178265
dtype: int64

In [9]:
# Total rows vs. rows left after de-duplicating on the (u, i) pair;
# the two numbers match when no interaction is repeated.
len(int_all), len(int_all.drop_duplicates(subset=['u', 'i']))

(718379, 718379)

## Sample 5-core interaction set

In [10]:
# Count the users that have reviewed 5 or more recipes
int_all.groupby('u')['i'].count().ge(5).sum()

16440

In [11]:
# Count the recipes that have received 5 or more reviews
int_all.groupby('i')['u'].count().ge(5).sum()

34356

In [12]:
# Number of reviews rated above 3 (i.e. a rating of 4 or 5)
len(int_all.loc[int_all['rating'] > 3])

662263

In [13]:
# Build the "5-core" interaction set: keep only interactions rated above 3
# whose user AND recipe each have at least 5 such interactions in int_all.
# Note: the counts are computed once on the unfiltered data (single pass), so
# a retained user/recipe can end up with fewer than 5 rows in int_core.
positive_ints = int_all[int_all['rating'] > 3]
recipes_per_user = positive_ints.groupby('u')['i'].count().rename('num_i')
users_per_recipe = positive_ints.groupby('i')['u'].count().rename('num_u')

# Attach both counts to every interaction; users/recipes with no positive
# interactions get NaN, which fails the >= 5 test below.
int_core = (
    int_all
    .merge(recipes_per_user, how='left', on='u')
    .merge(users_per_recipe, how='left', on='i')
)

keep_row = (
    int_core['num_i'].ge(5)
    & int_core['num_u'].ge(5)
    & int_core['rating'].gt(3)
)
int_core = int_core[keep_row]
int_core.shape[0]

410515

In [14]:
# Number of distinct users (u) and recipes (i) left in the 5-core set
int_core.loc[:, ['u', 'i']].nunique()

u    15134
i    31895
dtype: int64

## Split into train/validation/test sets

In [15]:
# Reindex u and i: re-encode them from user_id / recipe_id with fresh
# LabelEncoders fitted only on the users and recipes kept in int_core.
le_user = LabelEncoder()
le_recipe = LabelEncoder()

# Use .assign() rather than attribute assignment: int_core is a filtered view
# of a larger frame, and writing to it in place triggers
# SettingWithCopyWarning. Drop the helper count columns by keyword
# (`columns=`): passing the axis positionally, as in drop([...], 1), is
# deprecated and no longer accepted by pandas 2.x.
# reset_index() keeps the previous row labels as an `index` column.
int_core = (
    int_core
    .assign(
        u=le_user.fit_transform(int_core['user_id']),
        i=le_recipe.fit_transform(int_core['recipe_id']),
    )
    .drop(columns=['num_i', 'num_u'])
    .reset_index()
)

In [16]:
# Perform train/validate/test split: shuffle all rows reproducibly, then cut
# the shuffled frame at the cumulative train and train+val boundaries.
# Plain positional slicing is used instead of np.split, which relies on
# DataFrame.swapaxes (deprecated in recent pandas) when given a DataFrame.
shuffled = int_core.sample(frac=1, replace=False, random_state=random_state)
train_end = int(train_p * len(int_core))
val_end = int((train_p + val_p) * len(int_core))

train_df = shuffled.iloc[:train_end]
val_df = shuffled.iloc[train_end:val_end]
test_df = shuffled.iloc[val_end:]

# The three splits must partition the filtered interactions exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(int_core)
train_df.shape[0], val_df.shape[0], test_df.shape[0]

(287360, 41052, 82103)

In [17]:
# Make sure the output directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(pp_dir, exist_ok=True)

# Persist the three splits (the DataFrame index is written as the first column).
train_df.to_csv(os.path.join(pp_dir, 'train.csv'))
val_df.to_csv(os.path.join(pp_dir, 'val.csv'))
test_df.to_csv(os.path.join(pp_dir, 'test.csv'))

## Extract recipe names

In [18]:
# Load the raw recipe table and collect the ids of the recipes that
# survived the interaction filtering.
raw_recipes_path = os.path.join(raw_dir, 'RAW_recipes.csv')
raw_recipes = pd.read_csv(raw_recipes_path)
recipe_ids = int_core.loc[:, 'recipe_id'].unique()

In [19]:
# Keep only the id and name of recipes present in the filtered interaction
# set, then write them next to the split files.
in_core = raw_recipes['id'].isin(recipe_ids)
recipe_names = raw_recipes.loc[in_core, ['id', 'name']]
recipe_names.to_csv(os.path.join(pp_dir, 'recipe_names.csv'))