In [None]:
import os
import json
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


In [None]:
## Choose Model Architecture to Extract Features
# Later cells branch on this value to decide which train split to use and
# which side (users or items) gets the small vs. big TF-IDF vectors.

# architecture = 'U-CFN'
architecture = 'I-CFN'

## Choose Dataset to Extract Features

data_to_preprocess = 'amazon_video_games'
# data_to_preprocess = 'amazon_movies'
# data_to_preprocess = 'movielens1m'

# Fail fast on a mistyped selection: the cells that follow only test for the
# known values, so a typo would otherwise surface later as a NameError.
if architecture not in ('U-CFN', 'I-CFN'):
    raise ValueError(f'Unknown architecture: {architecture!r}')
if data_to_preprocess not in ('amazon_video_games', 'amazon_movies', 'movielens1m'):
    raise ValueError(f'Unknown dataset: {data_to_preprocess!r}')

In [None]:

# Load the raw ratings of the selected dataset and record, for later cells:
#   data_path     - directory holding that dataset's preprocessed artifacts
#   user_column   - name of the user-id column
#   item_column   - name of the item-id column
#   rating_column - name of the rating column
# NOTE(review): for the Amazon datasets `ratings_df` is rebound to a train
# split in a later cell before being read - confirm the raw JSON load is needed.

# # # # # Amazon Video Games
if data_to_preprocess == 'amazon_video_games':
    ratings_df = pd.read_json('data/reviews_Video_Games.json', lines=True)
    data_path = 'data/amazon_video_games'
    user_column = 'reviewerID'
    item_column = 'asin'
    rating_column = 'overall'

# # # # # Amazon Tv & Movies
elif data_to_preprocess == 'amazon_movies':
    ratings_df = pd.read_json('data/reviews_Movies_and_TV.json', lines=True)
    data_path = 'data/amazon_movies'
    user_column = 'reviewerID'
    item_column = 'asin'
    rating_column = 'overall'

# # # # # MovieLens 1m
elif data_to_preprocess == 'movielens1m':
    # '::' is a multi-character separator, which only the python parser
    # supports; naming the engine avoids pandas' fallback ParserWarning.
    ratings_df = pd.read_csv(
        'data/ml-1m/ratings.dat',
        sep='::',
        engine='python',
        names=['user', 'item', 'rating', 'time'],
    )
    data_path = 'data/ml-1m'
    user_column = 'user'
    item_column = 'item'
    rating_column = 'rating'

else:
    # Without this branch a typo leaves data_path & co. undefined and the
    # failure only shows up as a NameError in a later cell.
    raise ValueError(f'Unknown dataset: {data_to_preprocess!r}')

In [None]:
# Load the user and item lookup tables stored as JSON under `data_path`.
# Later cells use their values as row indices into the feature matrices.

with open(f'{data_path}/users_map.json', 'r') as users_map_file:
    users_map = json.loads(users_map_file.read())

with open(f'{data_path}/items_map.json', 'r') as items_map_file:
    items_map = json.loads(items_map_file.read())

In [None]:

# Load the per-dataset inputs used for feature extraction.

# # # # Amazon data
if data_to_preprocess in ('amazon_video_games', 'amazon_movies'):
    # Missing values become empty strings so the review text and title
    # columns can be joined as plain strings in the aggregation cell.
    u_ratings_df = pd.read_csv(f'{data_path}/u_train_df.csv').fillna('')
    i_ratings_df = pd.read_csv(f'{data_path}/i_train_df.csv').fillna('')

# # # # # Movielens data
elif data_to_preprocess == 'movielens1m':
    # `ratings_df` was already read from ratings.dat in the dataset-selection
    # cell, so it is not loaded a second time here.
    # '::' is a multi-character separator; only the python engine supports it.
    # NOTE(review): the stock ml-1m movies.dat is reported not to be valid
    # UTF-8. latin-1 decodes any byte sequence, so the load cannot abort on
    # accented titles; if your copy is UTF-8, non-ASCII titles will be garbled
    # (movie_name is not used by the feature cells in this notebook).
    movies_meta_df = pd.read_csv(
        'data/ml-1m/movies.dat',
        sep='::',
        engine='python',
        encoding='latin-1',
        names=['item', 'movie_name', 'genre'],
    )
    users_meta__df = pd.read_csv(
        'data/ml-1m/users.dat',
        sep='::',
        engine='python',
        names=['user', 'gender', 'age', 'occupation', 'zip-code'],
    )

# 1. Amazon Data Features

### Aggregate reviews together for each user/item

In [None]:
# Point `ratings_df` at the train split that matches the chosen architecture.
# The splits are only loaded for the Amazon datasets, so this cell is skipped
# for movielens1m instead of failing with a NameError.
if data_to_preprocess in ('amazon_video_games', 'amazon_movies'):
    if architecture == 'U-CFN':
        ratings_df = u_ratings_df
    elif architecture == 'I-CFN':
        ratings_df = i_ratings_df
    else:
        raise ValueError(f'Unknown architecture: {architecture!r}')

In [None]:

# Build four dicts mapping each user id / item id to all of its review bodies
# (`reviewText`) or review titles (`summary`) joined with newlines.
# Amazon datasets only: the MovieLens frames have no review columns.
if data_to_preprocess in ('amazon_video_games', 'amazon_movies'):
    # Group by the bare column name rather than a one-element list: with a
    # list key, pandas >= 2.0 yields 1-tuples such as ('A1B2',) as group keys,
    # and those would not match the plain ids looked up in the index maps.
    reviews_by_user = ratings_df.groupby(user_column)
    users_reviews_text = reviews_by_user['reviewText'].agg('\n'.join).to_dict()
    users_reviews_titles = reviews_by_user['summary'].agg('\n'.join).to_dict()

    reviews_by_item = ratings_df.groupby(item_column)
    items_reviews_text = reviews_by_item['reviewText'].agg('\n'.join).to_dict()
    items_reviews_titles = reviews_by_item['summary'].agg('\n'.join).to_dict()

### Create TF-IDF features from reviews

In [None]:
def create_tf_idf_vectors_from_dict(reviews_dict, index_map, dim, random_state=0):
    """Turn per-entity review text into dense, SVD-reduced TF-IDF vectors.

    Parameters
    ----------
    reviews_dict : dict
        Maps an entity id to one string holding all of its text.
    index_map : dict
        Maps each entity id in ``reviews_dict`` to a sortable index.
    dim : int
        Number of SVD components, i.e. the width of the returned matrix.
    random_state : int or None, default 0
        Seed for the randomized SVD so repeated runs produce the same
        features. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(reviews_dict), dim)`` whose rows are ordered by
        ascending ``index_map`` value.

    Raises
    ------
    KeyError
        If an id in ``reviews_dict`` is missing from ``index_map``.

    Notes
    -----
    NOTE(review): row ``i`` only corresponds to index ``i`` when
    ``reviews_dict`` covers every index ``0..n-1`` - confirm against the
    caller's train split.
    """
    entity_ids = list(reviews_dict.keys())
    corpus = [reviews_dict[entity_id] for entity_id in entity_ids]
    row_indices = np.array([index_map[entity_id] for entity_id in entity_ids])

    tf_idf_features = TfidfVectorizer().fit_transform(corpus)
    svd = TruncatedSVD(algorithm='randomized', n_components=dim, random_state=random_state)
    reduced_features = svd.fit_transform(tf_idf_features)

    # Order rows by index with argsort instead of sorting (index, vector)
    # tuples: tuple sorting falls back to comparing the vectors themselves
    # whenever two indices tie, which raises for multi-element arrays.
    row_order = np.argsort(row_indices, kind='stable')
    return reduced_features[row_order]
    

# Build the reduced TF-IDF matrices for review bodies and review titles.
# U-CFN: 100-d user vectors and 600-d item vectors; I-CFN: the reverse.
# Amazon datasets only: the review dicts do not exist for movielens1m.
if data_to_preprocess in ('amazon_video_games', 'amazon_movies'):

    dim = 100  # width of the "small" vectors

    if architecture == 'U-CFN':
        users_text_vectors_small = create_tf_idf_vectors_from_dict(users_reviews_text, users_map, dim)
        users_titles_vectors_small = create_tf_idf_vectors_from_dict(users_reviews_titles, users_map, dim)
    elif architecture == 'I-CFN':
        items_text_vectors_small = create_tf_idf_vectors_from_dict(items_reviews_text, items_map, dim)
        items_titles_vectors_small = create_tf_idf_vectors_from_dict(items_reviews_titles, items_map, dim)
    else:
        # Otherwise nothing is computed and the save cell fails with a NameError.
        raise ValueError(f'Unknown architecture: {architecture!r}')

    dim = 600  # width of the "big" vectors

    if architecture == 'U-CFN':
        items_text_vectors_big = create_tf_idf_vectors_from_dict(items_reviews_text, items_map, dim)
        items_titles_vectors_big = create_tf_idf_vectors_from_dict(items_reviews_titles, items_map, dim)
    else:  # 'I-CFN' - any other value was rejected above
        users_text_vectors_big = create_tf_idf_vectors_from_dict(users_reviews_text, users_map, dim)
        users_titles_vectors_big = create_tf_idf_vectors_from_dict(users_reviews_titles, users_map, dim)


### Save features for model use

In [None]:
# Write the Amazon TF-IDF matrices to `<data_path>/<name>.npy`.
# Skipped for movielens1m, where these matrices are never computed.
if data_to_preprocess in ('amazon_video_games', 'amazon_movies'):

    if architecture == 'U-CFN':
        vectors_to_save = {
            'users_text_vectors_small': users_text_vectors_small,
            'users_titles_vectors_small': users_titles_vectors_small,
            'items_text_vectors_big': items_text_vectors_big,
            'items_titles_vectors_big': items_titles_vectors_big,
        }
    elif architecture == 'I-CFN':
        vectors_to_save = {
            'items_text_vectors_small': items_text_vectors_small,
            'items_titles_vectors_small': items_titles_vectors_small,
            'users_text_vectors_big': users_text_vectors_big,
            'users_titles_vectors_big': users_titles_vectors_big,
        }
    else:
        # Make a mistyped architecture visible instead of saving nothing.
        raise ValueError(f'Unknown architecture: {architecture!r}')

    # The file stem equals the variable name, as in the original save calls.
    for file_stem, vectors in vectors_to_save.items():
        np.save(f'{data_path}/{file_stem}.npy', vectors)


# 2. Movielens Data Features

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer


# Build MovieLens side features: one-hot user attributes and multi-hot genres.
# Skipped for the Amazon datasets, where the metadata frames are never loaded
# and this cell would otherwise stop a full run with a NameError.
if data_to_preprocess == 'movielens1m':

    # Extract user features: one-hot encode gender, age and occupation.
    # The encoder is fitted once and its sparse output densified directly.
    user_feat_encoder = OneHotEncoder(handle_unknown='ignore')
    user_features = user_feat_encoder.fit_transform(
        users_meta__df[['gender', 'age', 'occupation']]
    ).toarray()
    # NOTE(review): rows follow users.dat file order and are not re-indexed
    # through users_map (unlike the items below) - confirm the orders agree.
    # user_feat_encoder.inverse_transform(user_features)

    # Extract item features: one indicator column per genre.
    mlb = MultiLabelBinarizer()

    # Split 'A|B|C' into a list. Values that are already lists are left
    # untouched so re-running this cell does not fail on the second pass.
    movies_meta_df['genre'] = movies_meta_df['genre'].apply(
        lambda s: s.split('|') if isinstance(s, str) else s
    )
    transformed_genres = mlb.fit_transform(movies_meta_df['genre'])

    # items_map keys are strings (loaded from JSON), hence the str() on ids.
    # Movies that are absent from items_map are skipped.
    items_features_dict = {
        items_map[str(item_id)]: genre_row
        for item_id, genre_row in zip(movies_meta_df['item'], transformed_genres)
        if str(item_id) in items_map
    }

    # One row per mapped item, so an item without metadata keeps an all-zero
    # row instead of shrinking the matrix and pushing an index out of bounds.
    # Assumes items_map values are the dense indices 0..len(items_map)-1 -
    # TODO confirm.
    n_items = len(items_map)
    n_features = transformed_genres.shape[1]
    items_features = np.zeros((n_items, n_features))
    for row_idx, fv in items_features_dict.items():
        items_features[row_idx] = fv


### Save features for model use

In [None]:
# Write the MovieLens feature matrices next to the other ml-1m artifacts.
# Skipped for the Amazon datasets, where these matrices are never computed.
if data_to_preprocess == 'movielens1m':
    np.save(f'{data_path}/users_features.npy', user_features)
    np.save(f'{data_path}/items_features.npy', items_features)