## 5. Experiment: From Word Embeddings to Paper Recommendation
Done by Kourosh Tajahmadi (kt77) and Marek Schuster (ms2228)  

In [1]:
# Start Spark Session
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, count, desc, col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
import matplotlib.pyplot as plt
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import udf, explode, length, col
from pyspark.sql.types import ArrayType, StringType
from nltk.stem import PorterStemmer
import nltk
from pyspark.sql.functions import concat_ws
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql.functions import expr
from pyspark.sql.functions import col, lit, rand
from pyspark.sql.types import StringType
from pyspark.sql.functions import when
from pyspark.ml.feature import Word2Vec, Word2VecModel
nltk.download('stopwords')

# Start Spark Session
spark = SparkSession.builder \
    .appName('Text Processing with Spark')\
    .config("spark.driver.memory", "6g") \
    .getOrCreate()

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read in the data

# Define the schema for papers
papers_schema = StructType([
    StructField("paper_id", IntegerType()),
    StructField("type", StringType()),
    StructField("journal", StringType()),
    StructField("book_title", StringType()),
    StructField("series", StringType()),
    StructField("publisher", StringType()),
    StructField("pages", StringType()),
    StructField("volume", StringType()),
    StructField("number", StringType()),
    StructField("year", StringType()),
    StructField("month", StringType()),
    StructField("postedat", StringType()),
    StructField("address", StringType()),
    StructField("title", StringType()),
    StructField("abstract", StringType())
])

# Load Data
papers_df = spark.read.csv('papers.csv', schema=papers_schema)
papers_df = papers_df.drop("type", "journal", "book_title", "series", "publisher", "pages", "volume", "number", "year", "month", "postedat", "address")
users_libraries_df = spark.read.text('users_libraries.txt')

# Split users_libraries into two columns (user_hash_id and user_library)
users_libraries_df = users_libraries_df.select(split(users_libraries_df.value, ";").alias("split_values"))
users_libraries_df = users_libraries_df.select(users_libraries_df.split_values[0].alias("user_hash_id"), users_libraries_df.split_values[1].alias("user_library"))

# Convert the user_library from a comma-separated string into an array of paper ids.
users_libraries_df = users_libraries_df.withColumn("user_library", split(col("user_library"), ",\s*").cast(ArrayType(IntegerType())))

# Explode the user_library into a new row for each paper
users_libraries_df = users_libraries_df.withColumn("paper_id", explode(col("user_library")))

# drop the user_library column
users_libraries_df = users_libraries_df.drop("user_library")



In [3]:
#print length of papers_df
print("Number of papers: ", papers_df.count())
# discard 1/100 of the data to make it easier to work with
#papers_df = papers_df.sample(False, 0.01, 42)
#print length of papers_df
print("Number of papers: ", papers_df.count())

Number of papers:  172079
Number of papers:  172079


## Exercise 5. 1 (Pre-processing Text for word2vec)

In [4]:

#a)
# concatenate title and abstract
papers_df = papers_df.withColumn('text', concat_ws(' ', papers_df.title, papers_df.abstract))

# tokenize text
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words_raw", pattern="[a-zA-Z-_]+", gaps=False)
papers_df = regexTokenizer.transform(papers_df)
# replace '-' and '_' in words
remove_chars_udf = udf(lambda words: [word.replace('-', '').replace('_', '') for word in words], ArrayType(StringType()))
papers_df = papers_df.withColumn("words", remove_chars_udf(col("words_raw")))
papers_df = papers_df.drop("title", "abstract", "text", "words_raw")

# filter words with length < 3
papers_df_cp = papers_df.withColumn("words", expr("filter(words, x -> length(x) >= 3)"))
#b)
# remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="words_no_stopw")
papers_df = remover.transform(papers_df)
#c)
# stem words
stemmer = PorterStemmer()
stemmer_udf = udf(lambda words: [stemmer.stem(word) for word in words], ArrayType(StringType()))
papers_df_ip = papers_df.withColumn("terms_raw", stemmer_udf(col("words_no_stopw")))
papers_df_ip = papers_df_ip.drop("words", "words_no_stopw")
papers_df_ip = papers_df_ip.withColumnRenamed("terms_raw", "words")


# get total number of papers
total_papers = papers_df.count()

papers_df_cp.show()
papers_df_ip.show()
papers_df_ip.printSchema()

+--------+--------------------+
|paper_id|               words|
+--------+--------------------+
|   80546|[the, arbitrarine...|
| 5842862|[how, choose, goo...|
| 1242600|[how, write, cons...|
| 3467077|[defrosting, the,...|
|  309395|[why, most, publi...|
|  305755|[the, structure, ...|
| 6603134|[how, build, moti...|
|      99|[collective, dyna...|
|  105595|[linked, how, eve...|
|  212874|[gene, ontology, ...|
|  740681|[usage, patterns,...|
|     101|[network, motifs,...|
|   99857|[the, strength, w...|
| 3614773|[rna, seq, revolu...|
|  873540|[pattern, recogni...|
| 6434100|[quick, guide, fo...|
|  100088|[basic, local, al...|
| 1387765|[powerlaw, distri...|
|  161814|[the, elements, s...|
|  117535|[maximum, likelih...|
+--------+--------------------+
only showing top 20 rows

+--------+--------------------+
|paper_id|               words|
+--------+--------------------+
|   80546|[arbitrari, genet...|
| 5842862|[choos, good, sci...|
| 1242600|[write, consist, ...|
| 3467077|[def

In [5]:
# Initialize word2Vec
w2v_cp = Word2Vec(vectorSize=100, inputCol="words", outputCol="w2v_cp")
w2v_ip = Word2Vec(vectorSize=100, inputCol="words", outputCol="w2v_ip")

In [37]:
# Train models
model_cp = w2v_cp.fit(papers_df_cp)

In [38]:
model_ip = w2v_ip.fit(papers_df_ip)

In [46]:
model_cp.write().overwrite().save("trainedmodels/cp")
model_ip.write().overwrite().save("trainedmodels/ip")

In [6]:
model_cp = Word2VecModel.load('trainedmodels/cp')    
model_ip = Word2VecModel.load('trainedmodels/ip')    


In [7]:
from pyspark.sql.functions import format_number as fmt
# Use build in function of the Model to find most similar synonyms
model_cp.findSynonyms("science", 10).select("word", fmt("similarity", 5).alias("similarity")).show()
model_ip.findSynonyms(stemmer.stem("science"), 10).select("word", fmt("similarity", 5).alias("similarity")).show()

+--------------+----------+
|          word|similarity|
+--------------+----------+
|  anthropology|   0.83369|
|   primatology|   0.83268|
|   mathematics|   0.81499|
|     sociology|   0.81020|
|          arts|   0.80245|
|        majors|   0.80226|
|    humanities|   0.79825|
|       fiction|   0.78621|
|historiography|   0.78338|
|    philosophy|   0.77431|
+--------------+----------+

+--------------+----------+
|          word|similarity|
+--------------+----------+
|     scientist|   0.76712|
|      sociolog|   0.74283|
|    primatolog|   0.73418|
|historiographi|   0.72510|
|     disciplin|   0.72313|
|      humanist|   0.70463|
|   anthropolog|   0.70277|
|      scientif|   0.70142|
|    philosophi|   0.68934|
|        anarch|   0.68238|
+--------------+----------+



### Analysis
Our results show that the consvervative pre-processing shows better results for finding the most similar words, which is quite unexpected. We tought that the stemming would lead to words being more similar to each other. Overall the results seem a bit off, but since the trainig portion of the exercise is very easy with little room for change, and also us not finding any errors in our pre-processing, we cannot point to the reason for this. 

## 5.2 Analogies

In [51]:
import re
import numpy as np

def analogy(word1, word2, word3, model, stemming=False):
    raw_words = [word1, word2, word3]
    words = []
    vectors = model.getVectors()
    words_as_vectors = []
    for i in raw_words:
        words.append([x.lower().strip() for x in re.split("[^A-Za-z]+", i)])
        
    if stemming:
        for i, wordlist in enumerate(words):
            for j, word in enumerate(wordlist):
                words[i][j] = stemmer.stem(word)
        
    for word in words:
        if len(word) > 1:
            tmp = spark.createDataFrame([[word]], ["words"])
            average = model.transform(tmp)
            words_as_vectors.append(average.head()[1])
        else:
            words_as_vectors.append(vectors.where(vectors.word==word[0]).head()[1])
        
    w = words_as_vectors[0] - words_as_vectors[1] + words_as_vectors[2]
    result = model.findSynonyms(w, 5)
    best = ""
    array = np.array(result.select("word").collect())

    for x in array:
        if x[0] not in [j for i in words for j in i]:
            best = x[0]
            break
    
    return best, result

In [53]:
word, df = analogy("machine learning", "predictions", "recommender systems", model_cp)
df.show()
print(word)
word, df = analogy("machine learning", "predictions", "recommender systems", model_ip, True)
df.show()
print(word)

+-------------+------------------+
|         word|        similarity|
+-------------+------------------+
|   hypermedia|0.7217651009559631|
|  intelligent|0.7086201310157776|
|      machine|0.6683545112609863|
|     tutoring| 0.661483645439148|
|collaborative|0.6565461754798889|
+-------------+------------------+

hypermedia
+------------+------------------+
|        word|        similarity|
+------------+------------------+
|      machin|  0.68124920129776|
|  overcommit|0.6691328883171082|
|studentcentr|0.6606923937797546|
|    groupwar|0.6340607404708862|
|        lmss|0.6320806741714478|
+------------+------------------+

overcommit


### Analysis

It is hard to evaluate which model performed better as both of them do not give intuitive results fitting our analogy.

We see that the requirment "Do not allow any of the words passed as arguments or the tokens of
which these are composed to be returned as w, i.e. w not in {machine, learning, predictions,recommender,systems}
in the previous example" is fulfilled, as the ip model actually predicts the word "machin" to be the best solution for the analogy,
but it is not returned as w, because it is already contained in "machine learning". 

## 5.3 From Embeddings to Paper Recommendation

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors, _convert_to_vector
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import DenseVector, VectorUDT

def cosine_similarity(user_profiles_df, item_profiles_df, user_col='user_hash_id', user_profiles_col='user_profile', item_col='paper_id', item_profiles_col='embedding_profile'):
    """
    Compute cosine similarities between user profiles and item profiles.

    user_profiles_df: dataframe of user profiles with user_col and user_profiles_col
    item_profiles_df: dataframe of item profiles with item_col and item_profiles_col

    Returns: dataframe of user-item pairs and their cosine similarities
    """

    # A UDF to compute dot product of two vectors
    dot_product = udf(lambda v1, v2: float(v1.dot(v2)), DoubleType())
    # A UDF to compute the norm of a vector
    norm = udf(lambda v: float(v.norm(2)), DoubleType())

    # Add a new column to store vector norm
    user_profiles_df = user_profiles_df.withColumn(user_profiles_col+'_norm', norm(col(user_profiles_col)))
    item_profiles_df = item_profiles_df.withColumn(item_profiles_col+'_norm', norm(col(item_profiles_col)))

    # Cross join the dataframes to get all combinations of user-item pairs
    cross_df = user_profiles_df.crossJoin(item_profiles_df)

    # Compute dot product of the vectors
    cross_df = cross_df.withColumn('dot', dot_product(col(user_profiles_col), col(item_profiles_col)))

    # Compute cosine similarity
    cross_df = cross_df.withColumn('cosine_similarity', col('dot') / (col(user_profiles_col+'_norm') * col(item_profiles_col+'_norm')))

    # Select only necessary columns
    result = cross_df.select([user_col, item_col, 'cosine_similarity'])

    return result

In [9]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

def compute_profiles(user_df, paper_df, model):
    # Simply use build in transform function to calculate the paper_profile
    paper_profile = model.transform(paper_df)
    paper_profile = paper_profile.select("paper_id", col("w2v_cp").alias("embedding_profile"))
    
    # Join users with pre-processed paper_df to get dataframe of users and the list of words in papers they like
    # then apply map_reduce to make it just one row per user with one long list of words
    # Finally transform this reduced user | words Dataframe using the model to generate user_profile
    user_doc_df = user_df.join(paper_df, user_df.paper_id ==  paper_df.paper_id, "inner").drop("paper_id").orderBy("user_hash_id")
    
    user_doc_df_reduced = user_doc_df.rdd.map(lambda user_doc_df: (user_doc_df.user_hash_id, user_doc_df.words)).reduceByKey(lambda a, b: a+b).toDF()
    user_doc_df_reduced = user_doc_df_reduced.select(col("_1").alias("user_hash_id"), col("_2").alias("words"))
    user_profile = model.transform(user_doc_df_reduced)
    
    user_profile = user_profile.select(col("user_hash_id"), col("words"), col("w2v_cp").alias("user_profile"))
    
    return user_profile, paper_profile
    

def w2vRS(user_df, paper_df, model, k, user_id=None):
    
    user_profile, paper_profile = compute_profiles(user_df, paper_df, model)
    
    # Use function from excercise 3 to calculate cosine_similarity
    cs_df = cosine_similarity(user_profile, paper_profile)
    
    # If a User was specified remoce all other users from the Dataframe
    if(user_id != None):
        cs_df = cs_df.filter(cs_df.user_hash_id == user_id)
    
    # use window function to sort the Dataframe and put a column that indicates the rows rank    
    window = Window.partitionBy("user_hash_id").orderBy(col("cosine_similarity").desc())
    
    cs_df=cs_df.withColumn("top_k",row_number().over(window))
    
    # use that rank to only give top-k recommendations
    cs_df = cs_df.filter(col("top_k") <= k)
    
    return cs_df

In [None]:
result = w2vRS(users_libraries_df, papers_df_cp, model_cp, 10, user_id="1eac022a97d683eace8815545ce3153f")
result.show()

## 5.4 (Evaluation of Recommender System

In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

from pyspark.sql.functions import rank, col
from pyspark.sql.functions import udf, array, lit
from pyspark.sql.types import ArrayType, IntegerType

from pyspark.sql.functions import slice
import math
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import avg


# UDF for computing precision
def precision_at_k(test_set, top_k_recommendations):
    num_hits = len(set(top_k_recommendations).intersection(set(test_set)))
    return float(num_hits) / len(top_k_recommendations)
# UDF for computing recall
def recall_at_k(test_set, top_k_recommendations):
    num_hits = len(set(top_k_recommendations).intersection(set(test_set)))
    return float(num_hits) / len(test_set)
# UDF for computing MRR
def mrr_at_k(test_set, top_k_recommendations):
    mrr_value = 0.0
    for i, rec in enumerate(top_k_recommendations):
        if rec in test_set:
            mrr_value = 1.0 / (i + 1)
            break
    return mrr_value
# UDF for computing NDCG
def ndcg_at_k(test_set, top_k_recommendations):
    dcg = 0
    idcg = 0
    for i, rec in enumerate(top_k_recommendations):
        if rec in test_set:
            dcg += 1 / math.log2(i + 2)  # i + 2 because log index starts at 1, not 0 and the positions are 1-based
    sorted_test_set = sorted(test_set, reverse=True)
    for i in range(min(len(sorted_test_set), len(top_k_recommendations))):
        idcg += 1 / math.log2(i + 2)
    return dcg / idcg if idcg > 0 else 0

precision_udf = udf(precision_at_k, FloatType())
recall_udf = udf(recall_at_k, FloatType())
mrr_udf = udf(mrr_at_k, FloatType())
ndcg_udf = udf(ndcg_at_k, DoubleType())





def compute_metrics(df, k):

    # Slice the top k recommendations from the recommendations list and compute evaluation metrics
    df = df.withColumn('top_k_recommendations', slice(df['recommendations'], 1, k))
    df = df.withColumn('precision_at_k', precision_udf(df['test_set'], df['top_k_recommendations']))
    df = df.withColumn('recall_at_k', recall_udf(df['test_set'], df['top_k_recommendations']))
    df = df.withColumn('mrr_at_k', mrr_udf(df['test_set'], df['top_k_recommendations']))
    df = df.withColumn('ndcg_at_k', ndcg_udf(df['test_set'], df['top_k_recommendations']))

    # Compute average precision@k, recall@k, and MRR@k
    avg_precision_at_k = df.select(avg(df['precision_at_k'])).collect()[0][0]
    avg_recall_at_k = df.select(avg(df['recall_at_k'])).collect()[0][0]
    avg_mrr_at_k = df.select(avg(df['mrr_at_k'])).collect()[0][0]
    avg_ndcg_at_k = df.agg(avg(col('ndcg_at_k'))).collect()[0][0]


    # Change this line to check for zeros instead of null
    proportion_zero_mrr = df.where(df['mrr_at_k'] == 0).count() / df.count()

    print(f"Average Precision@{k}: {avg_precision_at_k}, Average Recall@{k}: {avg_recall_at_k}, Average MRR@{k}: {avg_mrr_at_k}, Proportion of MRR zero@{k}: {proportion_zero_mrr},  Average NDCG@{k}: {avg_ndcg_at_k}")

    return df


In [14]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rand, row_number

def sample_and_split_per_user(users_libraries_df, paper_df, n=50, min_items=5):
    """
    Randomly select n users with at least 5 items in their libraries, split their libraries into training and test sets per user, and compute their profiles.

    users_libraries_df: dataframe of users and their libraries
    paper_df: dataframe of papers and their words
    n: number of users to sample
    min_items: minimum number of items in a user's library

    Returns:
        training_df: a dataframe containing the training set papers for each user
        test_df: a dataframe containing the test set papers for each user
    """
    # Count the number of items in each user's library
    user_counts = users_libraries_df.groupBy('user_hash_id').count()

    # Select users with at least min_items(default 10) items
    # This extra step is to ensure that users have enough items to split into training and test sets
    users_with_5_items = user_counts.filter(col('count') >= min_items)

    # Randomly select n distinct users from this filtered group
    sampled_users = users_with_5_items.orderBy(rand()).limit(n)

    # Join with the original dataframe to get the entries for these users only
    sampled_users_df = sampled_users.join(users_libraries_df, "user_hash_id")

    # Add a column of random numbers and row number per user
    window = Window.partitionBy('user_hash_id').orderBy(rand())
    sampled_users_df = sampled_users_df.withColumn('rand', rand()).withColumn('row_num', row_number().over(window))

    # Get the maximum row number per user
    max_row_num_df = sampled_users_df.groupBy('user_hash_id').max('row_num')
    sampled_users_df = sampled_users_df.join(max_row_num_df, "user_hash_id")

    # Split into training and test dataframes per user based on the random number
    training_df = sampled_users_df.filter(col('row_num') <= 0.8 * col('max(row_num)')).drop('rand', 'row_num', 'max(row_num)')
    test_df = sampled_users_df.filter(col('row_num') > 0.8 * col('max(row_num)')).drop('rand', 'row_num', 'max(row_num)')

    # drop count column from training_df and test_df
    training_df = training_df.drop("count")
    test_df = test_df.drop("count")

    # Compute user profiles based on the training sets
    #training_profiles, paper_profiles = compute_profiles(training_df, paper_df, model)

    return training_df, test_df #, training_profiles, paper_profiles

# Call the function
training_df, test_df = sample_and_split_per_user(users_libraries_df, papers_df_cp, min_items=0)


In [15]:
result_training = w2vRS(training_df, papers_df_cp, model_cp, 30)
result_training.show()

+--------------------+--------+------------------+-----+
|        user_hash_id|paper_id| cosine_similarity|top_k|
+--------------------+--------+------------------+-----+
|1f29a9b52672539eb...| 6854433|0.9917092578839937|    1|
|1f29a9b52672539eb...| 7068502|0.9915973161344459|    2|
|1f29a9b52672539eb...| 8751904|0.9060308507322081|    3|
|1f29a9b52672539eb...|  383497| 0.904558015629168|    4|
|1f29a9b52672539eb...| 3340317|0.8979698292514435|    5|
|1f29a9b52672539eb...|  937992|0.8957776542543118|    6|
|1f29a9b52672539eb...| 3861668|0.8934774408064308|    7|
|1f29a9b52672539eb...| 6951763|0.8923846608547338|    8|
|1f29a9b52672539eb...| 3047357|0.8917680724998116|    9|
|1f29a9b52672539eb...|  161261|0.8914947652213865|   10|
|1f29a9b52672539eb...|  437141|0.8909900134007812|   11|
|1f29a9b52672539eb...| 2968653|0.8908766044932881|   12|
|1f29a9b52672539eb...|10540334|0.8907382525528902|   13|
|1f29a9b52672539eb...|  431014|0.8868803641484992|   14|
|1f29a9b52672539eb...|  910304|

In [16]:
from pyspark.sql.functions import collect_list
 
result_training_reduced = result_training.groupBy("user_hash_id").agg(collect_list("paper_id").alias("recommendations"))
test_set_df = test_df.groupBy("user_hash_id").agg(collect_list("paper_id").alias("test_set"))

In [17]:
final_df = result_training_reduced.join(test_set_df, ("user_hash_id"))
final_df.show()

+--------------------+--------------------+--------------------+
|        user_hash_id|     recommendations|            test_set|
+--------------------+--------------------+--------------------+
|1f29a9b52672539eb...|[6854433, 7068502...|           [4401985]|
|23191d8c18ff3c36e...|[9452968, 6960437...|[919241, 1279901,...|
|5915bcf11c8221ff7...|[423554, 5983055,...|    [366651, 405057]|
|5e1cec216680e931b...|[828974, 8266941,...|[141092, 1082836,...|
|79c45160c1c27a72c...|[1295217, 1255620...|[6829534, 6829487...|
|a2154466ea49dbffd...|[3467770, 1144333...|[637513, 782670, ...|
|a6ec425f49ccda07d...|[586862, 1079175,...|             [56516]|
|d3ed55d6afe5effc4...|[8977504, 555091,...|            [347169]|
|d6fa3fd987a1e0103...|[3365601, 6641683...|           [6641668]|
|df130214ee66bba5e...|[1368783, 2362311...|[5842862, 2491624...|
|fad932e21bee6a37b...|[334659, 4781936,...|    [167212, 583887]|
|0baa797ec9383cf73...|[2431278, 8632966...|[6663788, 4166879...|
|2ecfc0306113183a6...|[40

In [16]:
from pyspark.sql.functions import avg
for k in [5, 10, 30]:
    print(f"Computing metrics for k = {k}")
    
    metrics_df = compute_metrics(final_df, k)

Computing metrics for k = 5
Average Precision@5: 0.013043478455232538, Average Recall@5: 0.006184407793309378, Average MRR@5: 0.02898550746233567, Proportion of MRR zero@5: 0.9565217391304348,  Average NDCG@5: 0.015025331090027207
Computing metrics for k = 10
Average Precision@10: 0.010869565379360447, Average Recall@10: 0.0206771618484155, Average MRR@10: 0.03333333361407985, Proportion of MRR zero@10: 0.9130434782608695,  Average NDCG@10: 0.018627673861905617
Computing metrics for k = 30


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 42806)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
Traceback (most recen

ConnectionRefusedError: [Errno 111] Connection refused

In [13]:
from pyspark.sql.functions import avg

# Repeat for min_items = 20
training_df, test_df = sample_and_split_per_user(users_libraries_df, papers_df_cp, min_items=20)
result_training = w2vRS(training_df, papers_df_cp, model_cp, 30)
result_training_reduced = result_training.groupBy("user_hash_id").agg(collect_list("paper_id").alias("recommendations"))
test_set_df = test_df.groupBy("user_hash_id").agg(collect_list("paper_id").alias("test_set"))
final_df = result_training_reduced.join(test_set_df, ("user_hash_id"))

for k in [5, 10, 30]:
    print(f"Computing metrics for k = {k}")
    
    metrics_df = compute_metrics(final_df, k)

Computing metrics for k = 5
Average Precision@5: 0.028000000417232513, Average Recall@5: 0.013291700817644596, Average MRR@5: 0.05300000011920929, Proportion of MRR zero@5: 0.86,  Average NDCG@5: 0.025075233491218932
Computing metrics for k = 10
Average Precision@10: 0.01600000023841858, Average Recall@10: 0.014125034175813199, Average MRR@10: 0.05500000014901161, Proportion of MRR zero@10: 0.84,  Average NDCG@10: 0.020526588763331563
Computing metrics for k = 30
Average Precision@30: 0.017333334237337114, Average Recall@30: 0.04153650015592575, Average MRR@30: 0.06411904819309712, Proportion of MRR zero@30: 0.68,  Average NDCG@30: 0.032365627257366095


### Results

min_items = 0

Computing metrics for k = 5
-> Average Precision@5: 0.013043478455232538, Average Recall@5: 0.006184407793309378, Average MRR@5: 0.02898550746233567, Proportion of MRR zero@5: 0.9565217391304348,  Average NDCG@5: 0.015025331090027207

Computing metrics for k = 10
-> Average Precision@10: 0.010869565379360447, Average Recall@10: 0.0206771618484155, Average MRR@10: 0.03333333361407985, Proportion of MRR zero@10: 0.9130434782608695,  Average NDCG@10: 0.018627673861905617

Computing metrics for k = 30
-> Average Precision@30: 0.008888889269696342, Average Recall@30: 0.08902003264261617, Average MRR@30: 0.06835978875557581, Proportion of MRR zero@30: 0.8,  Average NDCG@30: 0.06529070753763205

mit_items = 20

Computing metrics for k = 5
Average Precision@5: 0.028000000417232513, Average Recall@5: 0.013291700817644596, Average MRR@5: 0.05300000011920929, Proportion of MRR zero@5: 0.86,  Average NDCG@5: 0.025075233491218932

Computing metrics for k = 10
Average Precision@10: 0.01600000023841858, Average Recall@10: 0.014125034175813199, Average MRR@10: 0.05500000014901161, Proportion of MRR zero@10: 0.84,  Average NDCG@10: 0.020526588763331563

Computing metrics for k = 30
Average Precision@30: 0.017333334237337114, Average Recall@30: 0.04153650015592575, Average MRR@30: 0.06411904819309712, Proportion of MRR zero@30: 0.68,  Average NDCG@30: 0.032365627257366095



The NDCG increased noticably for the higher paper minimum, indicating that more meaningful recommendations where achieved this way as there were more of them already present in the test set.