# Preliminaries

Upload your Kaggle API token (`kaggle.json`). You can download it from the [Kaggle account settings](https://www.kaggle.com/settings) under "Create new token".

In [None]:
# Upload the token through Colab's upload helper. The shell commands below
# expect it to be available as kaggle.json in the current directory.
from google.colab import files
files.upload()
# Restrict the token to owner read/write, then move it into ~/.kaggle/
# (presumably where the kaggle CLI looks for credentials -- confirm).
!chmod 600 kaggle.json
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

Set `limit_data` to `n` to limit the dataset to the first `n` entries, or to `0` to use the entire dataset.

In [None]:
# Maximum number of review rows to keep when the dataset is loaded; 0 disables the cap.
limit_data = 100_000

# Setup

## Download dependencies

In [None]:
!pip install -q kaggle pyspark

In [None]:
import pyspark
import pyspark.sql.functions as SF
import pyspark.sql.window as SW
import pyspark.ml.feature as SFeat

## Download and load dataset

In [None]:
# Download only the review file (-f) from the yelp-dataset/yelp-dataset Kaggle dataset.
!kaggle datasets download -f yelp_academic_dataset_review.json yelp-dataset/yelp-dataset
# -n: never overwrite, so an already-extracted file is kept when the cell is re-run.
!unzip -n yelp_academic_dataset_review.json.zip

In [None]:
# Session handle used for every DataFrame operation in the notebook.
spark = (
  pyspark.sql.SparkSession
  .builder
  .getOrCreate()
)

In [None]:
# Read the review dump and keep only the three columns used below.
reviews = (
  spark.read
  .json("yelp_academic_dataset_review.json")
  .select(["user_id", "business_id", "stars"])
)
# limit_data == 0 means "use everything"; any other value caps the row count.
if limit_data != 0:
  reviews = reviews.limit(limit_data)

In [None]:
# Use the shorter column names "user" and "business" from here on.
reviews = (
  reviews
  .withColumnRenamed("user_id", "user")
  .withColumnRenamed("business_id", "business")
)
reviews.show()

In [None]:
# Convert IDs to serial indices
def serialize(df, col):
  """Replace the values of `col` with the indices produced by a StringIndexer.

  The indexer is fitted on `df` itself and writes its output to a temporary
  `col + "_s"` column; the original column is then dropped and the temporary
  column takes over its name.

  Args:
    df: DataFrame containing `col`.
    col: name of the column to index.

  Returns:
    A DataFrame in which `col` holds the indexer output instead of the
    original values.
  """
  tmp_col = col+"_s"
  indexer = SFeat.StringIndexer(inputCol=col, outputCol=tmp_col)
  indexed = indexer.fit(df).transform(df)
  return indexed.drop(col).withColumnRenamed(tmp_col, col)

# Index the user column first, then the business column.
users_indexed = serialize(reviews, "user")
serial = serialize(users_indexed, "business")
serial.show()

In [None]:
# per-partition averages, the first step of normalizing a column with respect to a partition

def avgize(df, val_col, part_col):
  """Average `val_col` within each group of `part_col`.

  Args:
    df: DataFrame to aggregate.
    val_col: name of the column to average.
    part_col: name of the column to group by.

  Returns:
    The result of `groupBy(part_col).avg(val_col)` on `df`.
  """
  grouped = df.groupBy(part_col)
  return grouped.avg(val_col)

# Average star rating given by each user.
avgs = avgize(df=serial, val_col="stars", part_col="user")
avgs.show()

In [None]:
def normalize(df, avg_df, avg_col, val_col, part_col):
  """Subtract a per-partition average from `val_col`.

  Operates on the DataFrames passed in as `df` and `avg_df`; it does not
  read any notebook-level variables.

  Args:
    df: DataFrame holding `val_col` and `part_col`.
    avg_df: DataFrame holding `part_col` and `avg_col` (the per-partition
      average to subtract).
    avg_col: name of the average column in `avg_df`.
    val_col: name of the value column in `df`.
    part_col: name of the column the two frames are joined on.

  Returns:
    The join of `avg_df` and `df` on `part_col`, with a new
    `val_col + "_norm"` column equal to `val_col - avg_col`, and with
    `avg_col` and `val_col` themselves dropped.
  """
  joined = avg_df.join(df, part_col)
  centered = joined.withColumn(val_col+'_norm', SF.col(val_col)-SF.col(avg_col))
  return centered.drop(avg_col).drop(val_col)

# Normalization is currently switched off. To enable it, use the line below
# instead of the plain assignment (the partition column is "user", because the
# ID columns were renamed when the reviews were loaded):
# normal = normalize(serial, avgs, "avg(stars)", "stars", "user").withColumnRenamed("stars_norm", "stars")
normal = serial # ignoring normalization for now
normal.show()

In [None]:
# building pairs for ultimately calculating the cosine between users
def addColumnSuffix(df, suff):
  """Return `df` with `suff` appended to the name of every column.

  Args:
    df: DataFrame whose columns are renamed.
    suff: string appended to each column name.

  Returns:
    A selection of all columns of `df`, each aliased to `name + suff`.
  """
  renamed = []
  for name in df.columns:
    renamed.append(SF.col(name).alias(name + suff))
  return df.select(*renamed)

def buildPairs(df, pair_col, group_col, val_col):
  """Self-join `df` into pairs of rows that share the same `group_col` value.

  Both sides of the join are copies of `df` whose columns carry the suffix
  "_1" and "_2" respectively. A pair is kept only when the two rows agree on
  `group_col` and the left `pair_col` is strictly smaller than the right one,
  so each unordered pair appears once and no row is paired with itself.

  Args:
    df: DataFrame to pair up.
    pair_col: column whose values identify the two members of a pair.
    group_col: column both members must agree on.
    val_col: accepted but not used by this function.

  Returns:
    The joined DataFrame with a single `group_col` column (the "_2" copy is
    dropped and the "_1" copy loses its suffix); all other columns keep
    their "_1"/"_2" suffixes.
  """
  left = addColumnSuffix(df, "_1")
  right = addColumnSuffix(df, "_2")
  ordered_pair = SF.col(pair_col+'_1') < SF.col(pair_col+'_2')
  same_group = SF.col(group_col+"_1") == SF.col(group_col+"_2")
  joined = left.join(right, ordered_pair & same_group)
  return joined.withColumnRenamed(group_col+"_1", group_col).drop(group_col+"_2")

# One row per (user_1, user_2, business) where both users rated the business.
paired = buildPairs(normal, pair_col="user", group_col="business", val_col="stars")
paired = paired.select(["user_1", "user_2", "business", "stars_1", "stars_2"])
paired.show()

In [None]:
# For every co-rated business, multiply the two users' ratings.
prod = paired.withColumn(
  "prod",
  SF.col("stars_1") * SF.col("stars_2"),
)
prod.show()

In [None]:
# Summing the per-business products over each user pair gives the dot
# product of the two users' rating vectors.
dot = (
  prod
  .groupBy(["user_1","user_2"])
  .sum("prod")
  .withColumnRenamed("sum(prod)", "dot")
)
dot.show()

In [None]:
# Euclidean norm of the rating vector of each user
def getNorms(df, group_col, val_col):
  """Compute the square root of the summed squares of `val_col` per group.

  Args:
    df: DataFrame holding `group_col` and `val_col`.
    group_col: column to group by.
    val_col: column whose values are squared and summed.

  Returns:
    A DataFrame with two columns: `group_col` and "norm".
  """
  squared = df.withColumn("squared", SF.col(val_col)**2)
  totals = squared.groupBy(group_col).sum("squared")
  with_norm = totals.withColumn("norm", SF.col("sum(squared)")**.5)
  return with_norm.select([group_col, "norm"])

# One norm per user, computed over all of that user's ratings.
norm = getNorms(df=normal, group_col="user", val_col="stars")
norm.show()

In [None]:
# Attach each user's norm to the pair table (once for user_1, once for
# user_2), then divide the dot product by the product of the two norms.
dotnorm = (
  dot
  .join(norm, SF.col("user_1") == SF.col("user"))
  .withColumnRenamed("norm", "norm_1")
  .drop("user")
  .join(norm, SF.col("user_2") == SF.col("user"))
  .withColumnRenamed("norm", "norm_2")
  .drop("user")
)
cosine = (
  dotnorm
  .withColumn("cosine", SF.col("dot")/(SF.col("norm_1")*SF.col("norm_2")))
  .select(["user_1", "user_2", "cosine"])
)
cosine.show()

In [None]:
def rankDistances(df, group_col, val_col, ascending=True):
  """Number the rows of each `group_col` partition in `val_col` order.

  Args:
    df: DataFrame to rank.
    group_col: column whose values define the partitions.
    val_col: name of the column the rows are ordered by within a partition.
    ascending: when True (the default, matching the previous behaviour)
      rank 1 goes to the smallest `val_col`; when False, to the largest.

  Returns:
    `df` with an additional "rank" column holding the row number of each
    row inside its partition.
  """
  ordering = SF.col(val_col).asc() if ascending else SF.col(val_col).desc()
  w = SW.Window.partitionBy(group_col).orderBy(ordering)
  return df.withColumn("rank", SF.row_number().over(w))

# adding symmetry: every pair was stored once, so append the mirrored pairs
# to make each user appear in user_1. union() matches columns by position,
# so the swapped select lines up with (user_1, user_2, cosine).
symmetric = cosine.union(cosine.select("user_2", "user_1", "cosine"))

# adding rank of user_2 with respect to user_1
# "cosine" is dot/(norm_1*norm_2), i.e. a similarity: larger means closer.
# Rank in descending order so that rank <= n keeps the n most similar users
# instead of the n least similar ones.
n = 100
ranked = rankDistances(symmetric, "user_1", "cosine", ascending=False).drop("cosine").filter(SF.col("rank")<=n)
ranked.show()

In [None]:
# Naive rating prediction: for each (user_1, business), average the stars
# given to that business by the users ranked for user_1.
# The cosine value itself is no longer needed here; the rank filter has
# already selected the neighbours.

pred_stars = (
  ranked
  .join(normal, SF.col("user_2")==SF.col("user"))
  .groupBy(["user_1", "business"])
  .mean("stars")
  .withColumnRenamed("avg(stars)", "stars_pred")
)

# COMMENTS: this kinda works but I'm calculating stuff for the ratings I already have.
pred_stars.show()