# This script finds the users and items that have the most ratings

In [31]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark session
def init_spark():
    """Return a SparkSession for the "GA360RECOMMENDER" application.

    Uses ``getOrCreate``, so an already-active session is reused rather
    than a second one being started.
    """
    builder = SparkSession.builder.appName("GA360RECOMMENDER")
    return builder.getOrCreate()

# Session handle used by the cells below for reading the CSV files.
spark = init_spark()

### Count ratings per user

In [32]:
# Load the utility CSV, using its first line as column names. No schema is
# supplied and inferSchema is not enabled, so every column is read as a string.
df = spark.read.csv("../data/utility.csv", header=True)
# Work on the underlying RDD of Row objects so each row can be mapped with a
# plain Python function (count_ratings_per_user reads row.fullVisitorId).
user_rdd = df.rdd

In [33]:
def count_ratings_per_user(row):
    """Count the non-null ratings in one row of the user utility table.

    Every column except ``fullVisitorId`` is treated as a rating column; a
    rating is counted when its value is not ``None``.

    Args:
        row: A Row-like object exposing ``__fields__`` (column names),
            lookup by column name (``row[name]``) and a ``fullVisitorId``
            attribute.

    Returns:
        A ``(user_id, count)`` tuple. ``count`` is ``None`` (not 0) when the
        user has no ratings, so callers that sort on it must handle ``None``.
    """
    user_id = row.fullVisitorId

    # The ID column must be excluded by its exact name; otherwise the ID
    # itself is counted as a rating and every count is one too high.
    num_ratings = sum(
        1
        for field in row.__fields__
        if field != "fullVisitorId" and row[field] is not None
    )

    return (user_id, num_ratings or None)

In [None]:
# Rank users by number of ratings, highest first. count_ratings_per_user
# yields None as the count for a user with no ratings; map that to 0 in the
# sort key so the sort never compares None with an int (a TypeError in
# Python 3). Such users end up last in the descending order.
user_counts = user_rdd.map(count_ratings_per_user).sortBy(
    lambda pair: pair[1] or 0, ascending=False
)

# collect() pulls every (user_id, count) pair to the driver before printing.
for i in user_counts.collect():
    print(i)

### Count ratings per item

In [None]:
# Load the transposed utility CSV, using its first line as column names. No
# schema is supplied and inferSchema is not enabled, so every column is read
# as a string.
df = spark.read.csv("../data/transposed_utility.csv", header=True)
# NOTE: this rebinds the name user_rdd from the per-user section; from here
# on it holds the rows of the transposed file (count_ratings_per_item reads
# row.itemID from them).
user_rdd = df.rdd

In [None]:
def count_ratings_per_item(row):
    """Count the non-null ratings in one row of the transposed utility table.

    Every column except ``itemID`` is treated as a rating column; a rating
    is counted when its value is not ``None``.

    Args:
        row: A Row-like object exposing ``__fields__`` (column names),
            lookup by column name (``row[name]``) and an ``itemID``
            attribute.

    Returns:
        An ``(item_id, count)`` tuple. ``count`` is ``None`` (not 0) when the
        item has no ratings, so callers that sort on it must handle ``None``.
    """
    item_id = row.itemID

    # Count directly instead of building a list and looping over it again.
    num_ratings = sum(
        1
        for field in row.__fields__
        if field != "itemID" and row[field] is not None
    )

    return (item_id, num_ratings or None)

In [30]:
# Rank items by number of ratings, highest first. count_ratings_per_item
# yields None as the count for an item with no ratings; map that to 0 in the
# sort key so the sort never compares None with an int (a TypeError in
# Python 3). Such items end up last in the descending order.
item_counts = user_rdd.map(count_ratings_per_item).sortBy(
    lambda pair: pair[1] or 0, ascending=False
)

# collect() pulls every (item_id, count) pair to the driver before printing.
for i in item_counts.collect():
    print(i)

('GGOEGBRA037499', 28)
('GGOEYFKQ020699', 28)
('GGOEGAAX0104', 24)
('GGOEGAAX0318', 22)
('GGOEGBRJ037399', 20)
('GGOEGAAX0338', 18)
('GGOEGDHQ015399', 18)
('GGOEGAAX0037', 16)
('GGOEGEVA022399', 16)
('GGOEGBJL013999', 14)
('GGOEGBRB013899', 14)
('GGOEGBRJ037299', 13)
('GGOEGFKQ020799', 13)
('GGOEGDHC074099', 12)
('GGOEGAAX0325', 11)
('GGOEGAAX0358', 11)
('GGOEGFKA022299', 11)
('GGOEYHPB072210', 11)
('GGOEGAAX0569', 10)
('GGOEGHPJ080310', 10)
('GGOEAOCB077499', 9)
('GGOEGOCB017499', 9)
('GGOEGOCC077299', 9)
('GGOEYOCR077799', 9)
('GGOEAFKQ020599', 8)
('GGOEAXXX0808', 8)
('GGOEGAAX0795', 8)
('GGOEGBJC014399', 8)
('GGOEGBMJ013399', 8)
('GGOEGAAX0330', 7)
('GGOEGAAX0351', 7)
('GGOEGFKQ020399', 7)
('GGOEYDHJ056099', 7)
('GGOEAKDH019899', 6)
('GGOEGAAX0074', 6)
('GGOEGAAX0290', 6)
('GGOEGAAX0341', 6)
('GGOEGAAX0596', 6)
('GGOEGAAX0661', 6)
('GGOEGAAX0680', 6)
('GGOEGBJC019999', 6)
('GGOEGBPB021199', 6)
('GGOEGESB015099', 6)
('GGOEGOLC013299', 6)
('GGOEGXXX0806', 6)
('GGOEACCQ017299', 5)
('GG