In [12]:
import os
import pandas as pd
import numpy as np
import collections
import json


In [66]:
## Choose the dataset to extract features from.
## Leave exactly one assignment active: the value is compared against these
## same three names by the loading cell below.

# data_to_preprocess = 'amazon_video_games'
# data_to_preprocess = 'amazon_movies'
data_to_preprocess = 'movielens1m'

In [67]:

# Load the raw ratings of the selected dataset.
# Every branch defines the same five names used by the rest of the notebook:
#   ratings_df    - one row per (user, item, rating) interaction
#   data_path     - directory where the preprocessed artefacts are written
#   user_column / item_column / rating_column - column names inside ratings_df

# # # # # Amazon Video Games
if data_to_preprocess == 'amazon_video_games':
    ratings_df = pd.read_json('data/reviews_Video_Games.json', lines=True)
    data_path = 'data/amazon_video_games'
    user_column = 'reviewerID'
    item_column = 'asin'
    rating_column = 'overall'

# # # # # Amazon Tv & Movies
elif data_to_preprocess == 'amazon_movies':
    ratings_df = pd.read_json('data/reviews_Movies_and_TV.json', lines=True)
    data_path = 'data/amazon_movies'
    user_column = 'reviewerID'
    item_column = 'asin'
    rating_column = 'overall'

# # # # # MovieLens 1m
elif data_to_preprocess == 'movielens1m':
    # '::' is a multi-character separator, which only the python parsing engine
    # supports; naming the engine explicitly avoids the ParserWarning fallback.
    ratings_df = pd.read_csv(
        'data/ml-1m/ratings.dat',
        sep='::',
        engine='python',
        names=['user', 'item', 'rating', 'time'],
    )
    data_path = 'data/ml-1m'
    user_column = 'user'
    item_column = 'item'
    rating_column = 'rating'

else:
    # Fail here rather than with a NameError several cells later.
    raise ValueError(
        f"Unknown data_to_preprocess={data_to_preprocess!r}; expected one of "
        "'amazon_video_games', 'amazon_movies', 'movielens1m'"
    )



In [68]:
# Create the output directory (and any missing parents); no error if it already exists.
os.makedirs(data_path, exist_ok=True)

In [69]:
def filter_ratings_df(min_interactions, ratings_df, key_column):
    """Keep only the rows whose key occurs at least ``min_interactions`` times.

    Parameters
    ----------
    min_interactions : int
        Minimum number of rows a key must have for its rows to be kept.
    ratings_df : pandas.DataFrame
        Frame to filter; it is not modified.
    key_column : str
        Column whose values define the groups.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, concatenated group by group in ``groupby`` key
        order and keeping their original index labels. When no group reaches
        the threshold an empty frame with the same columns is returned.
    """
    kept_groups = [
        group_df
        for _, group_df in ratings_df.groupby(key_column)
        if len(group_df) >= min_interactions
    ]
    if not kept_groups:
        # pd.concat raises ValueError when given an empty list
        return ratings_df.iloc[0:0]
    return pd.concat(kept_groups)


# Only the Amazon datasets are filtered: first users with fewer than 10 rows are
# dropped, then items with fewer than 7 rows among what remains.
if data_to_preprocess in ('amazon_movies', 'amazon_video_games'):
    for min_rows, column in ((10, user_column), (7, item_column)):
        ratings_df = filter_ratings_df(min_rows, ratings_df, column)

## Create ids to indices map

In [70]:
# Distinct ids in order of first appearance; the position becomes the row/column index.
unique_user_ids = ratings_df[user_column].unique()
unique_item_ids = ratings_df[item_column].unique()

# Keys are stringified so the maps can be written out as JSON objects.
users_map = {str(user_id): position for position, user_id in enumerate(unique_user_ids)}
items_map = {str(item_id): position for position, item_id in enumerate(unique_item_ids)}

## Split to train-test

In [71]:
def split_train_test(key_column, ratings_df=None, test_frac=0.1, random_state=3):
    """Split ratings into train and test parts, sampling within each key group.

    For every distinct value of ``key_column`` a fraction ``test_frac`` of that
    group's rows is sampled into the test set; the group's remaining rows go
    to the train set. Groups too small for the fraction to round to at least
    one row contribute nothing to the test set.

    Parameters
    ----------
    key_column : str
        Column to group by before sampling.
    ratings_df : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``ratings_df`` so
        existing ``split_train_test(key_column=...)`` calls keep working.
    test_frac : float, default 0.1
        Fraction of each group's rows sampled into the test set.
    random_state : int, default 3
        Seed passed to ``DataFrame.sample`` for every group.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, each concatenated group by group.

    Notes
    -----
    Train rows are selected by index label, so the index of ``ratings_df``
    is assumed to be unique.
    """
    if ratings_df is None:
        # Backward-compatible default: read the notebook-level frame.
        ratings_df = globals()['ratings_df']

    train_parts = []
    test_parts = []
    for _, key_df in ratings_df.groupby(key_column):
        key_test_df = key_df.sample(frac=test_frac, random_state=random_state)
        key_train_df = key_df.loc[~key_df.index.isin(key_test_df.index)]

        train_parts.append(key_train_df)
        test_parts.append(key_test_df)

    if not train_parts:
        # No groups at all: pd.concat would raise on an empty list.
        empty_df = ratings_df.iloc[0:0]
        return empty_df, empty_df.copy()

    return pd.concat(train_parts), pd.concat(test_parts)

# Two independent splits of the same ratings: one sampled per user, one per item.
(u_train_ratings_df,
 u_test_ratings_df) = split_train_test(user_column)
(i_train_ratings_df,
 i_test_ratings_df) = split_train_test(item_column)

### Create Numpy ratings table

In [72]:
# Dimensions of the dense ratings table: one row per user, one column per item.
n_users, n_items = len(unique_user_ids), len(unique_item_ids)

def create_table(ratings_df):
    """Build a dense ``(n_users, n_items)`` ratings table from a ratings frame.

    Reads the notebook-level names ``n_users``, ``n_items``, ``users_map``,
    ``items_map``, ``user_column``, ``item_column`` and ``rating_column``.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Frame holding the user, item and rating columns.

    Returns
    -------
    numpy.ndarray
        Float array in which cell ``[users_map[str(user)], items_map[str(item)]]``
        holds the rating and every other cell is 0. If a (user, item) pair
        occurs more than once, the last occurrence wins.

    Raises
    ------
    KeyError
        If a user or item id of ``ratings_df`` is missing from the id maps.
    """
    ratings_table = np.zeros([n_users, n_items])

    # Fancy-index assignment gives no ordering guarantee for repeated indices,
    # so repeated (user, item) pairs are reduced to their last row first.
    pairs_df = ratings_df.drop_duplicates(subset=[user_column, item_column], keep='last')

    # The id maps are keyed by str(id); ids absent from a map come back as NaN.
    user_idx = pairs_df[user_column].astype(str).map(users_map)
    item_idx = pairs_df[item_column].astype(str).map(items_map)
    if user_idx.isna().any() or item_idx.isna().any():
        raise KeyError('ratings_df contains user or item ids missing from users_map / items_map')

    ratings_table[
        user_idx.to_numpy(dtype=np.intp),
        item_idx.to_numpy(dtype=np.intp),
    ] = pairs_df[rating_column].to_numpy()
    return ratings_table

# User tables are (n_users, n_items); item tables are the transpose, (n_items, n_users).
# The user tables come from the per-user split, the item tables from the per-item split.
users_ratings_table_train = create_table(u_train_ratings_df)
items_ratings_table_train = create_table(i_train_ratings_df).T

users_ratings_table_test = create_table(u_test_ratings_df)
items_ratings_table_test = create_table(i_test_ratings_df).T


### Save Data for model use

In [73]:

# Dense rating tables, user-major and item-major, train and test.
for npy_path, table in (
    (f'{data_path}/users_ratings_table_train.npy', users_ratings_table_train),
    (f'{data_path}/users_ratings_table_test.npy', users_ratings_table_test),
    (f'{data_path}/items_ratings_table_train.npy', items_ratings_table_train),
    (f'{data_path}/items_ratings_table_test.npy', items_ratings_table_test),
):
    np.save(npy_path, table)

# Train splits as CSV (only the train frames are written here).
for csv_path, train_frame in (
    (f'{data_path}/u_train_df.csv', u_train_ratings_df),
    (f'{data_path}/i_train_df.csv', i_train_ratings_df),
):
    train_frame.to_csv(csv_path, index=False)

# id -> table index lookups.
for json_path, id_map in (
    (f'{data_path}/users_map.json', users_map),
    (f'{data_path}/items_map.json', items_map),
):
    with open(json_path, 'w') as f:
        json.dump(id_map, f)

## Create ColdEvaluation Dataset

In [74]:

def crate_coldeval_test(train_table, test_table):
    """Return a copy of ``test_table`` restricted to the "cold" rows.

    A row is "cold" when its number of positive entries in ``train_table`` is
    at most the median of that count over all rows. Rows with strictly more
    training interactions than the median are zeroed in the result. Rows are
    whatever the tables are indexed by (users for the user tables, items for
    the item tables).

    Parameters
    ----------
    train_table : numpy.ndarray
        2-D training ratings table; entries > 0 count as interactions.
    test_table : numpy.ndarray
        2-D test ratings table with the same number of rows. Not modified.

    Returns
    -------
    numpy.ndarray
        New array equal to ``test_table`` with the above-median rows set to 0.
    """
    train_interactions_count = (train_table > 0).sum(axis=1)
    # True on rows with strictly more training interactions than the median row
    rows_mask = train_interactions_count > np.median(train_interactions_count)
    # Work on a copy so the caller's regular test table stays intact
    cold_table = test_table.copy()
    cold_table[rows_mask] = 0
    return cold_table

# Build the cold-evaluation tables under their own names and from copies, so the
# regular test tables are neither rebound nor mutated and this cell (and the
# save cell above) can be re-run safely.
users_ratings_table_cold_test = crate_coldeval_test(
    users_ratings_table_train, users_ratings_table_test.copy()
)
items_ratings_table_cold_test = crate_coldeval_test(
    items_ratings_table_train, items_ratings_table_test.copy()
)

np.save(f'{data_path}/users_ratings_table_cold_test.npy', users_ratings_table_cold_test)
np.save(f'{data_path}/items_ratings_table_cold_test.npy', items_ratings_table_cold_test)


# Popularity-based experiment

Predict every rating as the item's mean rating in the training set (a non-personalized baseline).

In [75]:
def run_experiment(items_table_train, users_table_test, dataset_name=None):
    """Evaluate the non-personalised item-mean baseline and print RMSE / MAE.

    Each test rating is predicted as the mean of the item's non-zero training
    ratings. An item without any training rating falls back to the mean of
    all non-zero training ratings, instead of producing a NaN prediction that
    would turn both metrics into NaN.

    Parameters
    ----------
    items_table_train : numpy.ndarray
        2-D table with one row per item; zeros mean "no rating".
    users_table_test : numpy.ndarray
        2-D table with one row per user and one column per item (column ``j``
        corresponds to row ``j`` of ``items_table_train``); zeros mean
        "no rating".
    dataset_name : str, optional
        Label printed in the report header. Defaults to the notebook-level
        ``data_to_preprocess`` so existing two-argument calls print as before.

    Returns
    -------
    (float, float)
        ``(rmse, mae)`` over all non-zero test entries; both NaN when the
        test table has no non-zero entry.
    """
    if dataset_name is None:
        dataset_name = globals().get('data_to_preprocess', 'unknown')

    items_table_train = np.asarray(items_table_train)
    users_table_test = np.asarray(users_table_test)

    # Per-item mean of the observed (non-zero) training ratings; NaN if none.
    items_baseline_preds = np.array(
        [
            item_row[item_row != 0].mean() if np.any(item_row) else np.nan
            for item_row in items_table_train
        ],
        dtype=float,
    )

    # Items never rated in train fall back to the global mean training rating.
    unseen_items = np.isnan(items_baseline_preds)
    if unseen_items.any():
        observed_ratings = items_table_train[items_table_train != 0]
        if observed_ratings.size:
            items_baseline_preds[unseen_items] = observed_ratings.mean()

    # np.nonzero walks the table row by row, i.e. user by user and, within a
    # user, in increasing item index.
    _, item_idx = test_positions = np.nonzero(users_table_test)
    all_y_true = users_table_test[test_positions]
    all_y_pred = items_baseline_preds[item_idx]

    if all_y_true.size:
        rmse = np.sqrt(((all_y_pred - all_y_true) ** 2).mean())
        mae = np.abs(all_y_pred - all_y_true).mean()
    else:
        # Nothing to evaluate: avoid the mean-of-empty-slice warning.
        rmse = mae = float('nan')

    print(f'{dataset_name} Data Experiment')
    print('RMSE Popularity Baseline Expr: ', rmse)
    print('MAE Popularity Baseline Expr: ', mae)

    return rmse, mae

### Baseline Results

In [81]:
# Evaluate the popularity baseline on the artefacts saved by earlier runs of the
# preprocessing cells. The directories are iterated with a local name so the
# notebook-level `data_path` is not overwritten.
# NOTE: the '<name> Data Experiment' line printed by run_experiment reflects the
# notebook's currently selected dataset, not the dataset being evaluated here;
# rely on the header printed by this cell.

# (artefact directory, display name)
baseline_datasets = (
    ('data/amazon_video_games', 'Amazon_video_games'),
    ('data/ml-1m', 'Movielens 1m'),
)
# (test table file, display name)
baseline_test_sets = (
    ('users_ratings_table_test.npy', 'Regular'),
    ('users_ratings_table_cold_test.npy', 'ColdEvaluation'),
)

print('\nAll Baseline Results:')

# The first header follows the title directly; later headers get an extra blank line.
header_gap = '\n'
for experiment_path, dataset_label in baseline_datasets:
    train_file = f'{experiment_path}/items_ratings_table_train.npy'
    if not os.path.isfile(train_file):
        # This dataset has not been preprocessed yet; report it instead of crashing.
        print(f'{header_gap} {dataset_label}: skipped, {train_file} not found')
        header_gap = '\n\n'
        continue

    # The tables are plain numeric arrays, so pickle support is not needed to load them.
    items_table_train = np.load(train_file)

    for test_file, variant_label in baseline_test_sets:
        users_table_test = np.load(f'{experiment_path}/{test_file}')
        print(f'{header_gap} {dataset_label} Experiment {variant_label}')
        run_experiment(items_table_train, users_table_test)
        header_gap = '\n\n'


All Baseline Results:

 Amazon_video_games Experiment Regular
movielens1m Data Experiment
RMSE Popularity Baseline Expr:  1.0452992816384392
MAE Popularity Baseline Expr:  0.8093617782200861


 Amazon_video_games Experiment ColdEvaluation
movielens1m Data Experiment
RMSE Popularity Baseline Expr:  1.054362034428959
MAE Popularity Baseline Expr:  0.8199518171271164

\n Movielens 1m Experiment Regular
movielens1m Data Experiment
RMSE Popularity Baseline Expr:  0.9744565539385563
MAE Popularity Baseline Expr:  0.7782834928034735


 Movielens 1m Experiment ColdEvaluation
movielens1m Data Experiment
RMSE Popularity Baseline Expr:  1.0046994940246092
MAE Popularity Baseline Expr:  0.8052622831873744
