In [6]:
import pyspark as ps
import numpy as np
import pyspark.sql.functions as F

In [3]:
# Obtain the Spark session used by every cell below: getOrCreate() hands back
# a session that already exists, or starts a new one registered under the
# "LatentFeatureExploration" application name.
# To pin a local master, chain .master("local[8]") onto the builder before
# appName(...).
spark = ps.sql.SparkSession.builder.appName(
    "LatentFeatureExploration"
).getOrCreate()

In [4]:
# Load restaurant metadata
restaurants_df = spark.read.parquet('../data/restaurants')

# Load restaurant discount factors
discount_factor_df = spark.read.parquet('../data/discount_factor')

# Build the restaurant-id -> integer index mapping. Every line of
# product_labels.txt holds one restaurant id, and its zero-based line number
# becomes the value stored in the `item` column. Lines are NOT filtered
# (not even blank ones) so that the numbering always matches the file.
with open('../data/product_labels.txt') as f:
    restaurant_id_map = [
        (line.strip(), index) for index, line in enumerate(f)
    ]

restaurant_id_map_df = spark.createDataFrame(restaurant_id_map, ['id', 'item'])

# Attach the integer `item` index to the restaurant metadata. This is an
# inner join, so restaurants without an entry in the mapping are dropped.
restaurants_with_id_df = restaurants_df.join(restaurant_id_map_df, on='id')

# Load avg_rating that was calculated from the model's training data
avg_rating_df = spark.read.parquet('../data/avg_rating')

# Load item_bias that was calculated from the model's training data
item_bias_df = spark.read.parquet('../data/item_bias')

# Load item_factors that were generated by the model from the training data
item_factors_df = spark.read.parquet('../data/item_factors')

In [7]:
def get_item_factors_and_ids():
    """Collect the item factor table into a pair of NumPy arrays.

    Reads the module-level ``item_factors_df``, calls ``collect()`` on it and
    splits every returned row into its ``features`` and ``id`` entries.

    Returns:
        tuple: ``(item_factors, item_ids)`` where ``item_factors`` is an
        array built from each row's ``features`` value and ``item_ids`` is
        an array built from each row's ``id`` value. Both follow the order
        in which ``collect()`` returned the rows, so position ``k`` in one
        array refers to the same row as position ``k`` in the other.
    """
    rows = item_factors_df.collect()
    feature_matrix = np.array([row['features'] for row in rows])
    id_vector = np.array([row['id'] for row in rows])
    return feature_matrix, id_vector


# Materialise item_factors and item_ids once at notebook level. The values
# are static, so they only need to be loaded a single time and can then be
# reused by the cells below.
item_factors, item_ids = get_item_factors_and_ids()

In [26]:
# Summarise every latent feature (one column of item_factors) across all items.
print(item_factors.shape)

# Row positions (into item_factors / item_ids) holding the smallest and the
# largest value of each latent feature.
latent_feature_min_indices = np.argmin(item_factors, axis=0)
latent_feature_max_indices = np.argmax(item_factors, axis=0)
print(latent_feature_min_indices)
print(latent_feature_max_indices)
print(np.mean(item_factors, axis=0))
print(np.std(item_factors, axis=0))

# Look up the restaurants behind those extremes. filter() needs a boolean
# condition, so the integer `item` column cannot be passed on its own; it is
# compared against the ids found above instead. The argmin/argmax results are
# row positions, so they are first translated through item_ids.
# NOTE(review): assumes the `id` values of item_factors_df are the same
# integer indices as the `item` column of restaurants_with_id_df -- confirm.
extreme_item_ids = np.unique(
    np.concatenate([
        item_ids[latent_feature_min_indices],
        item_ids[latent_feature_max_indices],
    ])
).tolist()  # plain Python ints, so they can be used as Spark literals
restaurants_with_id_df.filter(F.col('item').isin(extreme_item_ids))

(5063, 76)
[2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507
 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507
 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507
 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507
 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507 2507
 2507]
[5052 5052 5052 5052 5052 5052 5052 5052 5052 5052 2523 5052 5052 5052 2523
 2523 2523 5052 5052 2523 2523 5052 5052 5052 5052 5052 5052 5052 5052 5052
 5052 5052 2523 5052 5052 5052 5052 2523 5052 2523 5052 2523 5052 5052 5052
 5052 5052 5052 5052 5052 5052 5052 5052 5052 5052 2523 5052 5052 5052 5052
 5052 5052 5052 5052 2523 5052 5052 2523 2523 5052 2523 5052 5052 2523 5052
 5052]
[ 0.23230369  0.1813703   0.2409327   0.22594382  0.23874373  0.22225988
  0.22790824  0.24041144  0.26794456  0.2626804   0.24423099  0.18504971
  0.26034253  0.18649004  0.23455758  0.1732272   0.21123273  0.27664