In [1]:
#Import all required packages
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.window import Window
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer,HashingTF
from pyspark.ml import Pipeline
import re
from nltk.stem import PorterStemmer
from pyspark.ml.linalg import Vectors
from pyspark.mllib.clustering import LDA
from pyspark.ml.feature import IDF

In [2]:
# Initialise Spark Session
spark = SparkSession.builder.appName("Experiment4")\
                            .config("spark.sql.broadcastTimeout", "36000")\
                            .config("spark.driver.memory", "20g")\
                            .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

21/08/06 14:52:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
#PREPARING DATAFRAMES FOR DATSETS

#Authors Dataframe
#df_authors = spark.read.csv("Datasets/authors.csv", sep = ",", header = True, quote = '"')

#PaperCsv dataframe
papersCsvSchema = StructType([
    StructField("paper_id",StringType(),False),
    StructField("type",StringType(),False),
    StructField("journal",StringType(),False),
    StructField("book_title",StringType(),False),
    StructField("series",StringType(),False),
    StructField("publisher",StringType(),False),
    StructField("pages",StringType(),False),
    StructField("volume",StringType(),False),
    StructField("number",StringType(),False),
    StructField("year",StringType(),False),
    StructField("month",StringType(),False),
    StructField("postedat",StringType(),False),
    StructField("address",StringType(),False),
    StructField("title",StringType(),False),
    StructField("abstract",StringType(),False),
])
df_paperCsv = spark.read.csv("Datasets/papers.csv", sep = ",", header = False, schema = papersCsvSchema, quote = '"')

#UserLibrary dataframe
userLibrarySchema = StructType([
    StructField("user_hash_id",StringType(),False),
    StructField("user_library",StringType(),False)
])
df_userLibrary = spark.read.csv("Datasets/users_libraries.txt", sep = ";", header = False, schema = userLibrarySchema)
df_userLibrary = df_userLibrary.selectExpr("user_hash_id","split(user_library,',') AS user_library")

In [4]:
#Cleaning and Tokenizing the data
def phraseTokenization(x):
    rawPhrase = x[13] + " " + x[14] #concatenating title and abstract
    rawPhrase = rawPhrase.replace("-","") #removing - from phrase
    rawPhrase = rawPhrase.replace("_","") #removing _ from phrase
    rawPhrase = rawPhrase.strip() #removing any trailing or leading whitespaces
    
    #spliting phrase based on non-alphaNumeric characters
    phraseArray = re.split('[^a-zA-Z0-9]+',rawPhrase) 
    
    #remove words with less than 3 char
    phraseArrayFilteredWords = [i for i in phraseArray if len(i) >= 3]
    
    return (x[0],list(phraseArrayFilteredWords))


df_tokenize = df_paperCsv.na.fill(value="").rdd.map(phraseTokenization).toDF()

#Removing StopWords using ML
swRemover = StopWordsRemover(inputCol="_2", outputCol="cleaned_terms")
df_cleanedData = swRemover.transform(df_tokenize)
df_cleanedData = df_cleanedData.selectExpr("_1 AS paper_id","cleaned_terms")

#Stemming using Porter stemmer Algo
ps =  PorterStemmer()

def stemmingTerms(x):
    stemmedWords = []
    for word in x:
        rootWord = ps.stem(word)
        stemmedWords.append(rootWord)
    return stemmedWords

df_cleanedData = df_cleanedData.rdd.mapValues(stemmingTerms).toDF().selectExpr("_1 AS paper_id","_2 AS cleaned_terms")

                                                                                

In [5]:
# Find the count of papers in which the term is present
df_paperCount = df_cleanedData.selectExpr("paper_id","explode(cleaned_terms) AS terms").distinct().groupBy("terms").count().withColumnRenamed("count", "paper_count")

#10 percent of total papers present in file
noOfDistinctPapers_df = int(df_cleanedData.select(countDistinct("paper_id")).collect()[0][0])
tenPercentOfTotalPapers = int(noOfDistinctPapers_df/10)

# remove words appear in more than 10% of the papers and keep only the words that appear in at least 20 papers 
df_filterdTerms = df_paperCount.filter((df_paperCount["paper_count"]<=tenPercentOfTotalPapers) & (df_paperCount["paper_count"]>=20))

#Fetch top 1000 terms 
top1000Terms = df_filterdTerms.orderBy(col("paper_count").desc()).limit(1000)

                                                                                

In [6]:
#associate unique integer values to each term
df_termsWithUniqueIndex = top1000Terms.withColumn("unique_index",row_number().over(Window.orderBy("paper_count"))).selectExpr("terms","unique_index-1 AS unique_index")

#Collect all terms in a list
terms_collection =  [row.terms for row in df_termsWithUniqueIndex.collect()]

# Generating Termfrequency Vector for each paper
df_cleanedDataExplode = df_cleanedData.selectExpr("paper_id","explode(cleaned_terms) AS terms")

#Getting the Unique_index of term 
df_cleanedDataJoinIndex = df_cleanedDataExplode.join(df_termsWithUniqueIndex,df_cleanedDataExplode.terms == df_termsWithUniqueIndex.terms , how = "inner").select(df_cleanedDataExplode.paper_id,df_cleanedDataExplode.terms,df_termsWithUniqueIndex.unique_index)

#Getting the term_frequency in each paper
df_cleanedDataJoinIndex = df_cleanedDataJoinIndex.groupBy("paper_id","unique_index").count().withColumnRenamed("count", "term_frquency")

#Creating a sparseVector respresentation for each paper
rdd_CleanedDataReducedByPaperId = df_cleanedDataJoinIndex.rdd.map(lambda x: (x[0], [(x[1], x[2])])).reduceByKey(lambda a, b: a + b)
rdd_CleanedDataReducedByPaperId = rdd_CleanedDataReducedByPaperId.map(lambda x: (x[0],Vectors.sparse(1000,x[1])))

df_CleanedDataSparseVector = rdd_CleanedDataReducedByPaperId.toDF().selectExpr("_1 AS paper_id","_2 AS term_frequency_vector")

21/08/06 14:52:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/08/06 14:53:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [7]:
df_CleanedDataSparseVector.take(1)

[Row(paper_id='498902', term_frequency_vector=SparseVector(1000, {33: 3.0, 47: 1.0, 79: 1.0, 97: 1.0, 138: 1.0, 170: 6.0, 354: 1.0, 368: 1.0, 394: 1.0, 482: 2.0, 491: 1.0, 541: 1.0, 550: 1.0, 566: 1.0, 581: 1.0, 596: 1.0, 622: 1.0, 632: 1.0, 663: 1.0, 670: 1.0, 720: 2.0, 723: 1.0, 762: 1.0, 764: 1.0, 773: 1.0, 797: 1.0, 820: 1.0, 826: 1.0, 837: 2.0, 843: 1.0, 879: 1.0, 881: 1.0, 890: 1.0, 892: 1.0, 894: 1.0, 928: 1.0, 937: 1.0, 946: 2.0, 949: 3.0, 952: 1.0, 962: 1.0, 965: 1.0, 984: 1.0, 985: 4.0, 990: 1.0, 992: 1.0}))]

In [8]:
#Ex 3.2

#TF-IDF Representation for each paper

idf = IDF(inputCol="term_frequency_vector", outputCol="tf_idf_vector")
tf_idf_model = idf.fit(df_CleanedDataSparseVector)
df_rescaledCleanedData = tf_idf_model.transform(df_CleanedDataSparseVector)
df_rescaledCleanedData = df_rescaledCleanedData.select("paper_id", "tf_idf_vector")

                                                                                

In [9]:
#TF-IDF Vector for papers
df_rescaledCleanedData.take(1)

[Row(paper_id='498902', tf_idf_vector=SparseVector(1000, {33: 13.8439, 47: 4.596, 79: 4.5494, 97: 4.5325, 138: 4.4866, 170: 26.6474, 354: 4.1522, 368: 4.1302, 394: 4.0846, 482: 7.8051, 491: 3.8889, 541: 3.7983, 550: 3.7834, 566: 3.7571, 581: 3.7152, 596: 3.6986, 622: 3.6484, 632: 3.6256, 663: 3.5658, 670: 3.5395, 720: 6.7871, 723: 3.3875, 762: 3.2929, 764: 3.2917, 773: 3.2717, 797: 3.2131, 820: 3.1488, 826: 3.1306, 837: 6.153, 843: 3.0651, 879: 2.9588, 881: 2.9516, 890: 2.9076, 892: 2.9043, 894: 2.8955, 928: 2.7393, 937: 2.6945, 946: 5.2459, 949: 7.8223, 952: 2.5913, 962: 2.5231, 965: 2.5148, 984: 2.3717, 985: 9.4584, 990: 2.328, 992: 2.3177}))]

In [10]:
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA

# Latent Direchlet Allocation

num_topics = 40

# Transform data into LDA supported format
df_lda_format = df_CleanedDataSparseVector.selectExpr("paper_id AS id","term_frequency_vector AS features")

lda = LDA(k=40)
lda_model = lda.fit(df_lda_format)
df_lda_paper_topic_model = lda_model.transform(df_lda_format)

21/08/06 14:54:50 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/08/06 14:54:50 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [11]:
#LDA Vector for Papers
df_lda_paper_topic_model.show()

+-------+--------------------+--------------------+
|     id|            features|   topicDistribution|
+-------+--------------------+--------------------+
| 498902|(1000,[33,47,79,9...|[3.90080137518724...|
| 201593|(1000,[38,90,104,...|[2.64231686691950...|
|1727709|(1000,[24,55,80,1...|[4.46831379992713...|
| 383220|(1000,[93,104,114...|[5.46153581161764...|
|  23055|(1000,[24,56,225,...|[3.96372963847816...|
|1287740|(1000,[79,238,260...|[5.12010493932825...|
|2090908|(1000,[177,280,44...|[3.07175741642788...|
|9106608|(1000,[8,27,70,89...|[0.13410774543216...|
|1857331|(1000,[61,62,169,...|[4.55107864190545...|
|2707871|(1000,[0,1,2,4,7,...|[2.07697730610264...|
|2798913|(1000,[1,55,56,65...|[3.96372963847816...|
| 460407|(1000,[45,101,139...|[2.58668215485906...|
| 423550|(1000,[32,54,232,...|[3.90080137518724...|
|6500865|(1000,[31,140,194...|[0.04692167365781...|
|2945717|(1000,[217,229,24...|[3.32085899926865...|
| 674581|(1000,[3,26,154,1...|[4.81885180608244...|
| 560037|(10

In [12]:
#User Profiling

#1)  produces a user profile for each user as the summation of the TF-IDF vectors of the papers that appear in the user’s library

df_userLibrary_explode = df_userLibrary.selectExpr("user_hash_id","explode(user_library) AS paper_id")
df_userJoined_TfIdf = df_userLibrary_explode.join(df_rescaledCleanedData,df_userLibrary_explode.paper_id == df_rescaledCleanedData.paper_id, how="inner").select(df_userLibrary_explode.user_hash_id,df_userLibrary_explode.paper_id,df_rescaledCleanedData.tf_idf_vector)

In [13]:
import collections

# Adding 2 sparse vectors
def addSparseVectors(v1, v2):
    values = collections.defaultdict(float) # Initialize Dictionary with default value 0.0
    
    # Add values from v1 SparseVector
    for i in range(v1.indices.size):
        values[v1.indices[i]] += v1.values[i]
    # Add values from v2 SParseVector
    for i in range(v2.indices.size):
        values[v2.indices[i]] += v2.values[i]
    return Vectors.sparse(v1.size, dict(values))

#final Df : summation of TF-IDF vector for each userlibrary
df_userprofile_tfidf = df_userJoined_TfIdf.selectExpr("user_hash_id","tf_idf_vector").rdd.reduceByKey(lambda x,y: addSparseVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_tf_idf_vector")

                                                                                

In [14]:
df_userprofile_tfidf.take(1)

                                                                                

[Row(user_hash_id='6931f7f79678cf72aae416ff7cb43bb1', sum_tf_idf_vector=SparseVector(1000, {0: 13.9931, 1: 23.287, 3: 9.3123, 4: 37.249, 5: 4.6555, 6: 4.6536, 7: 32.5708, 10: 9.2996, 11: 4.6492, 12: 23.2428, 13: 4.6467, 14: 41.8145, 18: 9.2734, 20: 4.633, 22: 4.6305, 23: 41.6692, 24: 4.6299, 26: 97.1893, 27: 9.2561, 28: 37.0147, 29: 4.6256, 31: 4.6238, 33: 4.6146, 34: 13.8384, 35: 4.6122, 36: 13.8293, 37: 23.0429, 38: 13.8257, 39: 9.2147, 41: 4.6044, 42: 9.2087, 47: 18.3839, 48: 9.192, 51: 13.7737, 52: 4.5889, 53: 4.5889, 55: 13.756, 56: 4.583, 57: 9.1601, 60: 13.7331, 62: 13.7243, 65: 4.5707, 67: 9.1298, 68: 4.5643, 70: 13.6809, 72: 173.1171, 74: 95.6219, 75: 36.4183, 76: 9.1034, 77: 4.5511, 78: 4.55, 79: 13.6483, 80: 9.0989, 81: 9.0977, 84: 4.5466, 86: 40.8787, 87: 18.166, 88: 4.5415, 89: 4.5409, 92: 27.2389, 93: 72.5921, 94: 31.7512, 95: 4.5353, 99: 18.1279, 100: 9.0628, 102: 9.0484, 104: 4.5236, 105: 9.045, 107: 9.0362, 110: 4.5153, 111: 13.5444, 113: 4.5148, 115: 9.023, 116: 9.023

In [15]:
#2  LDA-based profiles for each user as the summation of the paper-topics vectors of the papers 

df_userJoined_LDA = df_userLibrary_explode.join(df_lda_paper_topic_model,df_userLibrary_explode.paper_id == df_lda_paper_topic_model.id, how="inner").select(df_userLibrary_explode.user_hash_id,df_userLibrary_explode.paper_id,df_lda_paper_topic_model.topicDistribution)

In [16]:
#function to add 2 LDA dense vectors
def addLDAPaperTopicVectors(v1,v2):
    return (v1+v2)
    
df_userprofile_lda = df_userJoined_LDA.selectExpr("user_hash_id","topicDistribution").rdd.reduceByKey(lambda x,y: addLDAPaperTopicVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_lda_vector")

                                                                                

In [17]:
df_userprofile_lda.take(1)

[Row(user_hash_id='6931f7f79678cf72aae416ff7cb43bb1', sum_lda_vector=DenseVector([0.4248, 0.1204, 0.3837, 4.0139, 0.8215, 0.3547, 0.4801, 13.0501, 0.1229, 0.4406, 1.5565, 0.1311, 0.282, 0.8597, 0.1193, 0.3915, 4.2637, 0.5378, 0.1192, 0.5435, 1.6483, 0.1196, 0.1205, 4.7952, 0.1773, 0.3229, 0.169, 0.1186, 3.5519, 0.2494, 0.3181, 9.3645, 0.4519, 0.1689, 0.9008, 1.0454, 0.1279, 14.6455, 0.5297, 0.1576]))]

In [18]:
#Ex 4.2 Funtion to calculate cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

# UDF for calculating cosine similarity

#def cos_similarity(a,b):
#    cosSimValue = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
#    return cosSimValue

def calculateCosineSimilarity(userVector,paperVector):
    cosineSimilarityValue = float(cosine_similarity([userVector],[paperVector])[0,0])
    return cosineSimilarityValue


In [19]:
#Ex 4.3 

#Broadcast variable containing master set of all the paperId

PaperIds = list(df_paperCsv.selectExpr("paper_id").toPandas()['paper_id'])
PaperIdsBroadcast = sc.broadcast(PaperIds)

In [20]:
import numpy as np

#Find Delta library for the user: Fetching the Papers excluding the papers which are already present in user Library
def findDeltaLibrary(userLib):
    deltaUserLibrary = np.setdiff1d(PaperIdsBroadcast.value,list(userLib),assume_unique=True).tolist()    
    return deltaUserLibrary

FindDeltaLibraryUDF = udf(findDeltaLibrary, ArrayType(StringType()))


In [21]:

#a)

#Function for Content Based Recommendation on TF-IDF User Profile
def tf_idf_CBRS(df_UserRecord,numberOfRecommendation):
    

    #Fetching the Papers along with their TF-IDF excluding the papers which are already present in user Library
    df_UserRecord_DeltaLibrary = df_UserRecord.withColumn('delta_UserLibrary', explode(FindDeltaLibraryUDF(df_UserRecord.user_library)))
    df_DeltaPapers = df_UserRecord_DeltaLibrary.join(df_rescaledCleanedData, df_UserRecord_DeltaLibrary.delta_UserLibrary == df_rescaledCleanedData.paper_id,how="inner")
    df_DeltaPapers = df_DeltaPapers.selectExpr("user_hash_id","sum_tf_idf_vector AS user_tf_idf_vector","paper_id","tf_idf_vector")
    
    #Repartioning the data
    df_DeltaPapers = df_DeltaPapers.repartition(40)
    
    #Calculate the Cosine Similarity
    df_cosine_similarity_result = df_DeltaPapers.rdd.map(lambda x: (x['user_hash_id'],x['paper_id'],calculateCosineSimilarity(x['user_tf_idf_vector'],x['tf_idf_vector']))).toDF(schema=['user_hash_id', 'paper_id', 'cosine_similarity_value'])
    
    #Repartioning the data
    df_cosine_similarity_result = df_cosine_similarity_result.repartition(40)
    
    #Fetch the Top K recommendations for the User
    windowFn = Window.partitionBy(df_cosine_similarity_result['user_hash_id']).orderBy(df_cosine_similarity_result['cosine_similarity_value'].desc())
     
    df_top_k_recommendations = df_cosine_similarity_result.select('*', row_number().over(windowFn).alias('row_number')).filter(col('row_number') <= numberOfRecommendation)
    df_top_k_recommendations = df_top_k_recommendations.groupBy("user_hash_id").agg(collect_list("paper_id").alias("recommended_library"))
    
    return df_top_k_recommendations

In [22]:
#Ex4.3 b)

#Function for Content Based Recommendation on LDA User Profile
def lda_CBRS(df_UserRecord,numberOfRecommendation):
    
    #Fetching the Papers along with their LDA excluding the papers which are already present in user Library
    df_UserRecord_DeltaLibrary = df_UserRecord.withColumn('delta_UserLibrary', explode(FindDeltaLibraryUDF(df_UserRecord.user_library)))
    df_DeltaPapers = df_UserRecord_DeltaLibrary.join(df_lda_paper_topic_model, df_UserRecord_DeltaLibrary.delta_UserLibrary == df_lda_paper_topic_model.id,how="inner")
    df_DeltaPapers = df_DeltaPapers.selectExpr("user_hash_id","sum_lda_vector AS user_lda_vector","id AS paper_id","topicDistribution")
    
    #Repartioning the data
    df_DeltaPapers = df_DeltaPapers.repartition(40)
    
    #Calculate the Cosine Similarity
    df_cosine_similarity_result = df_DeltaPapers.rdd.map(lambda x: (x['user_hash_id'],x['paper_id'],calculateCosineSimilarity(x['user_lda_vector'],x['topicDistribution']))).toDF(schema=['user_hash_id', 'paper_id', 'cosine_similarity_value'])
    
    #Repartioning the data
    df_cosine_similarity_result = df_cosine_similarity_result.repartition(40)
    
    #Fetch the Top K recommendations for the User
    windowFn = Window.partitionBy(df_cosine_similarity_result['user_hash_id']).orderBy(df_cosine_similarity_result['cosine_similarity_value'].desc())
    
    df_top_k_recommendations = df_cosine_similarity_result.select('*', row_number().over(windowFn).alias('row_number')).filter(col('row_number') <= numberOfRecommendation)
    df_top_k_recommendations = df_top_k_recommendations.groupBy("user_hash_id").agg(collect_list("paper_id").alias("recommended_library"))
    
    return df_top_k_recommendations

In [24]:
#Ex 4.3 c) Top K recommendations for User = 1eac022a97d683eace8815545ce3153f

user_hash_id = '1eac022a97d683eace8815545ce3153f'
k = 10 #No Of Recommendation

#Recommendation using TF-IDF Vector

#Fetch the records for the Particular User along with User Library
df_UserRecord = df_userLibrary.filter((df_userLibrary.user_hash_id.isin(user_hash_id)))
    
#Fetching the User TF-IDF 
df_UserRecord = df_UserRecord.join(df_userprofile_tfidf, df_UserRecord.user_hash_id == df_userprofile_tfidf.user_hash_id, how="inner").select(df_UserRecord.user_hash_id,df_userprofile_tfidf.sum_tf_idf_vector,df_UserRecord.user_library)

#inputs((userHashId,Vector,UserLibrary),NoOfRecommendation)
recommendation_with_tf_idf=tf_idf_CBRS(df_UserRecord,k)

                                                                                

In [25]:
#Top 10 Recommendation for User:1eac022a97d683eace8815545ce3153f with TF-IDF 
recommendation_with_tf_idf.show(truncate=False)

                                                                                

+--------------------------------+------------------------------------------------------------------------------------------+
|user_hash_id                    |recommended_library                                                                       |
+--------------------------------+------------------------------------------------------------------------------------------+
|1eac022a97d683eace8815545ce3153f|[854469, 2838456, 4209212, 11692301, 10100718, 6306064, 7326487, 2284574, 3374934, 940716]|
+--------------------------------+------------------------------------------------------------------------------------------+



In [26]:
## Recommendation using LDA vector

#Fetch the records for the Particular User along with User Library
df_UserRecord = df_userLibrary.filter((df_userLibrary.user_hash_id.isin(user_hash_id)))
    
#Fetching the User LDA Vector 
df_UserRecord = df_UserRecord.join(df_userprofile_lda, df_UserRecord.user_hash_id == df_userprofile_lda.user_hash_id, how="inner").select(df_UserRecord.user_hash_id,df_userprofile_lda.sum_lda_vector,df_UserRecord.user_library)

#inputs((userHashId,Vector,UserLibrary),NoOfRecommendation)
recommendation_with_lda=lda_CBRS(df_UserRecord,k)

                                                                                

In [27]:
#Top 10 Recommendation for User:1eac022a97d683eace8815545ce3153f with LDA
recommendation_with_lda.show(truncate=False)

                                                                                

+--------------------------------+----------------------------------------------------------------------------------------+
|user_hash_id                    |recommended_library                                                                     |
+--------------------------------+----------------------------------------------------------------------------------------+
|1eac022a97d683eace8815545ce3153f|[8657274, 1522926, 3398098, 6571828, 2822588, 479195, 6420819, 941016, 1119991, 7512960]|
+--------------------------------+----------------------------------------------------------------------------------------+



In [52]:
#Ex 4.4 Offline evaluation metrics

import pyspark.sql.functions as f
import numpy as np

#Find the common elements between two list: to find the Hit elements list
def hitElements(list1,list2):
    commonElements = list(set(list1).intersection(list2))
    return list(commonElements)

#df_UserMetricsEvaluation Schema --> user_hash_id, test_set_user_library, top_recommendations
def calculatePrecisionAndRecallMetrics(df_UserMetricsEvaluation,k):
    
    #If list of top-k recommendations is larger than k,then only the first k elements must be taken into account
    df_firstKElements = df_UserMetricsEvaluation.withColumn("new_top_recommendations",
                                                            when(size(col("top_recommendations")) > k,
                                                                 f.slice("top_recommendations",start=1,length=k))
                                                            .otherwise(col("top_recommendations")))
    
    #Find the hitElementsList and Count the Number of Hits
    HitElementsUDF = udf(hitElements, ArrayType(StringType()))
    
    df_HitElements = df_firstKElements.withColumn("HitsList",HitElementsUDF(df_firstKElements.new_top_recommendations,df_firstKElements.test_set_user_library))\
                     .withColumn("NoOfHits",size(col("HitsList")))
    
    #Calculate the precision for each user
    df_precision = df_HitElements.withColumn("Precision_Value",col("NoOfHits")/k)
    
    #Calculate the recall for each user
    df_recall = df_precision.withColumn("Recall_Value",col("NoOfHits")/when(size(col("test_set_user_library")) == 0,1)
                                                                        .otherwise(size(col("test_set_user_library"))))
    
    df_finalMetrics = df_recall.select("user_hash_id","test_set_user_library","top_recommendations","HitsList","NoOfHits","Precision_Value","Recall_Value")
  
    return df_finalMetrics


#Calculate the MRR value
def calculateMRR(recommendationList,HitList):
    
    mrrvalue = 0
    
    if(len(HitList) == 0):
        mrrvalue = 0    
    else:
        firstElement = HitList[0]
        position = recommendationList.index(firstElement)
        mrrvalue = 1/(position + 1)
        
    return mrrvalue


CalculateMRRValueUDF = udf(calculateMRR,IntegerType())

In [53]:
#Ex 3.5 
# 1: Randomly selects n users
fraction_of_users = 20/(df_userLibrary.count())
df_sampleUsers = df_userLibrary.sample(withReplacement=False, fraction=fraction_of_users)


                                                                                

In [54]:
#Ex3.5
#2 Divide sampled data into raining set and test set for each user
import math
import random

def divideList(masterList,trainingSetFraction):
    size_masterList = len(masterList)
    size_TrainingList = int(math.ceil(trainingSetFraction * size_masterList))
    trainingList = masterList[:size_TrainingList]
    testList = masterList[size_TrainingList:]
    return [trainingList,testList]

df_divide_train_test_data = df_sampleUsers.rdd.map(lambda x: (x[0],divideList(x[1],0.8))).toDF()
df_divide_train_test_data = df_divide_train_test_data.select(df_divide_train_test_data._1.alias("user_hash_id"),df_divide_train_test_data._2[0].alias("training_data"),df_divide_train_test_data._2[1].alias("test_data"))

df_training_data = df_divide_train_test_data.selectExpr("user_hash_id","training_data")
df_test_data = df_divide_train_test_data.selectExpr("user_hash_id","test_data")


In [55]:
df_training_data.show()

+--------------------+--------------------+
|        user_hash_id|       training_data|
+--------------------+--------------------+
|f1deb990e614dffe0...|[5757765, 5779592...|
|7ff050e310c1db571...|[4455563, 224759,...|
|d7241265f5f2d4cd2...|[2343426, 7512185...|
|655c3f173b8a1d5da...|[130259, 7678720,...|
|2ca707af72c903dd4...|[1536685, 2307614...|
|9482bb4562752e454...|[674981, 636454, ...|
|a8971b1116533ad63...|[4902940, 1352628...|
|3d453dce2e573c600...|    [853225, 430859]|
|891432aee06ca5ab7...|[3733130, 3955633...|
|2d8b61d140a0cdeed...|[8429310, 1048153...|
|d5da87296aed37b4f...|[581011, 575400, ...|
|fb2439610b1596017...|  [4214149, 4732255]|
|e42e6dab1272a01f5...|           [4671470]|
|0d6ddad207217d342...|           [4238450]|
|4a85756cb50fbc7a5...|[12610321, 407427...|
|5396f8a35dd2076cd...|[595771, 4375813,...|
|d4df4f686c00cd6d7...|[580611, 273980, ...|
|b62fda7558c5a0854...|  [3015954, 3969416]|
|f486e0b0fb042bbdb...|[335609, 335719, ...|
|2482a7a5168747b30...|          

In [56]:
df_test_data.show()

+--------------------+--------------------+
|        user_hash_id|           test_data|
+--------------------+--------------------+
|f1deb990e614dffe0...|[10707638, 419414...|
|7ff050e310c1db571...|[2729747, 1123254...|
|d7241265f5f2d4cd2...|           [4441409]|
|655c3f173b8a1d5da...|[438096, 835275, ...|
|2ca707af72c903dd4...|           [3042410]|
|9482bb4562752e454...|   [106086, 3750229]|
|a8971b1116533ad63...|[14077833, 13498735]|
|3d453dce2e573c600...|                  []|
|891432aee06ca5ab7...|                  []|
|2d8b61d140a0cdeed...|   [6647231, 103668]|
|d5da87296aed37b4f...|[1167801, 873540,...|
|fb2439610b1596017...|                  []|
|e42e6dab1272a01f5...|                  []|
|0d6ddad207217d342...|                  []|
|4a85756cb50fbc7a5...|[12016840, 225066...|
|5396f8a35dd2076cd...|[5455141, 3140261...|
|d4df4f686c00cd6d7...|                  []|
|b62fda7558c5a0854...|                  []|
|f486e0b0fb042bbdb...|            [335421]|
|2482a7a5168747b30...|          

In [57]:
#Ex 3.5

#Creating User profiles for Training data set
df_training_data_explode = df_training_data.selectExpr("user_hash_id","explode(training_data) AS training_data_paper_id")

#a- user profile using sampled users data over Tf-Idf vector

df_training_data_TfIdf = df_training_data_explode.join(df_rescaledCleanedData,df_training_data_explode.training_data_paper_id == df_rescaledCleanedData.paper_id, how="inner").select(df_training_data_explode.user_hash_id,df_training_data_explode.training_data_paper_id,df_rescaledCleanedData.tf_idf_vector)
df_training_data_userprofile_tfidf = df_training_data_TfIdf.selectExpr("user_hash_id","tf_idf_vector").rdd.reduceByKey(lambda x,y: addSparseVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_tf_idf_vector")

                                                                                

In [58]:
df_training_data_userprofile_tfidf.take(1)

[Row(user_hash_id='891432aee06ca5ab755f90fbdce0fd6a', sum_tf_idf_vector=SparseVector(1000, {24: 4.6299, 82: 4.5483, 83: 4.5477, 154: 4.4624, 155: 4.4608, 183: 4.4276, 195: 13.2318, 198: 8.8123, 229: 4.3445, 245: 4.3248, 252: 4.3144, 284: 4.2687, 302: 25.3899, 328: 4.1868, 358: 4.1483, 362: 4.1453, 371: 12.3617, 387: 4.0964, 404: 4.0732, 405: 4.0669, 406: 4.0665, 435: 4.003, 436: 8.0047, 453: 3.9712, 464: 3.946, 479: 3.9106, 484: 3.8969, 490: 7.7785, 494: 7.7627, 550: 3.7834, 587: 3.7105, 597: 3.6986, 604: 3.6889, 613: 3.6727, 620: 3.6542, 638: 3.6146, 644: 3.5975, 659: 3.5717, 664: 3.5658, 673: 17.6647, 676: 3.5231, 679: 3.5134, 696: 3.4617, 708: 3.4321, 710: 3.423, 714: 3.4127, 721: 3.3898, 724: 3.3868, 735: 10.0845, 740: 3.352, 743: 3.3477, 745: 6.6778, 756: 3.3135, 761: 3.2956, 770: 9.8462, 771: 3.2743, 784: 3.2475, 786: 3.2432, 788: 3.2346, 794: 12.9133, 799: 3.2119, 810: 9.5207, 812: 3.1726, 822: 3.1446, 833: 6.1729, 834: 9.2565, 835: 6.1642, 842: 3.0658, 847: 6.1114, 848: 12.2208

In [59]:
#b)- User profile using sampled users data over LDA vector

df_training_data_LDA = df_training_data_explode.join(df_lda_paper_topic_model,df_training_data_explode.training_data_paper_id == df_lda_paper_topic_model.id, how="inner").select(df_training_data_explode.user_hash_id,df_training_data_explode.training_data_paper_id,df_lda_paper_topic_model.topicDistribution)
    
df_training_data_userprofile_lda = df_training_data_LDA.selectExpr("user_hash_id","topicDistribution").rdd.reduceByKey(lambda x,y: addLDAPaperTopicVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_lda_vector")

                                                                                

In [60]:
df_training_data_userprofile_lda.take(1)

[Row(user_hash_id='891432aee06ca5ab755f90fbdce0fd6a', sum_lda_vector=DenseVector([0.0059, 0.6426, 0.006, 0.0059, 0.0058, 0.006, 0.0061, 0.0058, 0.0061, 0.0645, 0.006, 0.006, 0.006, 0.006, 0.2856, 0.006, 0.0059, 0.0059, 0.0059, 0.006, 0.0059, 0.0059, 0.1167, 0.0059, 0.006, 0.0059, 0.0059, 0.0059, 0.006, 0.0059, 0.0059, 0.006, 0.0058, 0.072, 0.006, 0.0059, 1.6164, 0.0061, 0.0059, 0.0059]))]

In [61]:
#Ex4.5 Off-line evaluation

##########  LDA  ########

#Caluclating recommendation for Sampled users training data

#Making the dataframe format compatible with CBRS function
df_UserRecord = df_training_data_userprofile_lda.join(df_training_data,df_training_data_userprofile_lda.user_hash_id == df_training_data.user_hash_id , how="inner")\
                .select(df_training_data.user_hash_id,df_training_data_userprofile_lda.sum_lda_vector,df_training_data.training_data.alias("user_library"))


In [62]:
#fetching top 50 recommendations
recommendation_with_lda=lda_CBRS(df_UserRecord,50)

                                                                                

In [63]:
recommendation_with_lda.show()

                                                                                

+--------------------+--------------------+
|        user_hash_id| recommended_library|
+--------------------+--------------------+
|e42e6dab1272a01f5...|[1211678, 472057,...|
|2ca707af72c903dd4...|[13417204, 293946...|
|655c3f173b8a1d5da...|[10476759, 101994...|
|fb2439610b1596017...|[2648418, 701619,...|
|d4df4f686c00cd6d7...|[6659199, 1433344...|
|7ff050e310c1db571...|[1624132, 7222445...|
|d54fb82bc061b5d5c...|[13751999, 939982...|
|4a85756cb50fbc7a5...|[4014916, 3483461...|
|9482bb4562752e454...|[5455667, 312301,...|
|f1deb990e614dffe0...|[9942019, 6670685...|
|57587e52a682ec365...|[7112872, 2941833...|
|78bc30f6cfb92beee...|[765084, 1512101,...|
|f486e0b0fb042bbdb...|[2104052, 4613938...|
|d5da87296aed37b4f...|[770071, 3149297,...|
|5396f8a35dd2076cd...|[5806819, 3505500...|
|3d453dce2e573c600...|[3885708, 3385785...|
|2482a7a5168747b30...|[1283433, 8423026...|
|b62fda7558c5a0854...|[697787, 1938465,...|
|2d8b61d140a0cdeed...|[206959, 4441479,...|
|a8971b1116533ad63...|[7333387, 

In [64]:
#Formating dataframe to the format accepted by function CalculatePreceisonAndRecall
df_lda_evaluationMetrics = recommendation_with_lda.join(df_test_data,recommendation_with_lda.user_hash_id==df_test_data.user_hash_id,how="inner")\
.select(df_test_data.user_hash_id,df_test_data.test_data.alias("test_set_user_library"),recommendation_with_lda.recommended_library.alias("top_recommendations"))


In [65]:
#When K=5
df_lda_evaluation_Kis5 = calculatePrecisionAndRecallMetrics(df_lda_evaluationMetrics,5)
df_lda_evaluation_Kis5 = df_lda_evaluation_Kis5.withColumn("MRR_Value",CalculateMRRValueUDF("top_recommendations","HitsList"))
df_lda_evaluation_Kis5.show()

                                                                                

+--------------------+---------------------+--------------------+--------+--------+---------------+------------+---------+
|        user_hash_id|test_set_user_library| top_recommendations|HitsList|NoOfHits|Precision_Value|Recall_Value|MRR_Value|
+--------------------+---------------------+--------------------+--------+--------+---------------+------------+---------+
|e42e6dab1272a01f5...|                   []|[1211678, 472057,...|      []|       0|            0.0|         0.0|        0|
|2ca707af72c903dd4...|            [3042410]|[13417204, 293946...|      []|       0|            0.0|         0.0|        0|
|655c3f173b8a1d5da...| [438096, 835275, ...|[10476759, 101994...|      []|       0|            0.0|         0.0|        0|
|fb2439610b1596017...|                   []|[2648418, 701619,...|      []|       0|            0.0|         0.0|        0|
|d4df4f686c00cd6d7...|                   []|[6659199, 1433344...|      []|       0|            0.0|         0.0|        0|
|7ff050e310c1db5

                                                                                

In [70]:
#When K=10
df_lda_evaluation_Kis10 = calculatePrecisionAndRecallMetrics(df_lda_evaluationMetrics,10)
df_lda_evaluation_Kis10 = df_lda_evaluation_Kis10.withColumn("MRR_Value",CalculateMRRValueUDF("top_recommendations","HitsList"))
df_lda_evaluation_Kis10.show()

                                                                                

+--------------------+---------------------+--------------------+--------+--------+---------------+------------+---------+
|        user_hash_id|test_set_user_library| top_recommendations|HitsList|NoOfHits|Precision_Value|Recall_Value|MRR_Value|
+--------------------+---------------------+--------------------+--------+--------+---------------+------------+---------+
|e42e6dab1272a01f5...|                   []|[1211678, 472057,...|      []|       0|            0.0|         0.0|        0|
|2ca707af72c903dd4...|            [3042410]|[13417204, 293946...|      []|       0|            0.0|         0.0|        0|
|655c3f173b8a1d5da...| [438096, 835275, ...|[10476759, 101994...|      []|       0|            0.0|         0.0|        0|
|fb2439610b1596017...|                   []|[2648418, 701619,...|      []|       0|            0.0|         0.0|        0|
|d4df4f686c00cd6d7...|                   []|[6659199, 1433344...|      []|       0|            0.0|         0.0|        0|
|7ff050e310c1db5

                                                                                

In [71]:
#When K=30
df_lda_evaluation_Kis30 = calculatePrecisionAndRecallMetrics(df_lda_evaluationMetrics,30)
df_lda_evaluation_Kis30 = df_lda_evaluation_Kis30.withColumn("MRR_Value",CalculateMRRValueUDF("top_recommendations","HitsList"))
df_lda_evaluation_Kis30.show()

                                                                                

+--------------------+---------------------+--------------------+--------+--------+---------------+------------+---------+
|        user_hash_id|test_set_user_library| top_recommendations|HitsList|NoOfHits|Precision_Value|Recall_Value|MRR_Value|
+--------------------+---------------------+--------------------+--------+--------+---------------+------------+---------+
|e42e6dab1272a01f5...|                   []|[1211678, 472057,...|      []|       0|            0.0|         0.0|        0|
|2ca707af72c903dd4...|            [3042410]|[13417204, 293946...|      []|       0|            0.0|         0.0|        0|
|655c3f173b8a1d5da...| [438096, 835275, ...|[10476759, 101994...|      []|       0|            0.0|         0.0|        0|
|fb2439610b1596017...|                   []|[2648418, 701619,...|      []|       0|            0.0|         0.0|        0|
|d4df4f686c00cd6d7...|                   []|[6659199, 1433344...|      []|       0|            0.0|         0.0|        0|
|7ff050e310c1db5

                                                                                

In [72]:
#Ex4.5 Off-line evaluation

##########  TF-IDF  ########

#Caluclating recommendation for Sampled users training data

#Making the dataframe format compatible with CBRS function
df_UserRecord = df_training_data_userprofile_tfidf.join(df_training_data,df_training_data_userprofile_tfidf.user_hash_id == df_training_data.user_hash_id , how="inner")\
                .select(df_training_data.user_hash_id,df_training_data_userprofile_tfidf.sum_tf_idf_vector,df_training_data.training_data.alias("user_library"))


In [73]:
#fetching top 50 recommendations
recommendation_with_tf_idf=tf_idf_CBRS(df_UserRecord,50)

                                                                                ]

In [74]:
#Formating dataframe to the format accepted by function CalculatePreceisonAndRecall
df_tfidf_evaluationMetrics = recommendation_with_tf_idf.join(df_test_data,recommendation_with_tf_idf.user_hash_id==df_test_data.user_hash_id,how="inner")\
.select(df_test_data.user_hash_id,df_test_data.test_data.alias("test_set_user_library"),recommendation_with_tf_idf.recommended_library.alias("top_recommendations"))


In [None]:
#When K=5
df_tfidf_evaluation_Kis5 = calculatePrecisionAndRecallMetrics(df_tfidf_evaluationMetrics,5)
df_tfidf_evaluation_Kis5 = df_tfidf_evaluation_Kis5.withColumn("MRR_Value",CalculateMRRValueUDF("top_recommendations","HitsList"))
df_tfidf_evaluation_Kis5.show()

[Stage 1060:(0 + 16) / 40][Stage 1062:> (0 + 0) / 2][Stage 1064:> (0 + 0) / 2]

In [None]:
#When K=10
df_tfidf_evaluation_Kis10 = calculatePrecisionAndRecallMetrics(df_tfidf_evaluationMetrics,10)
df_tfidf_evaluation_Kis10 = df_tfidf_evaluation_Kis10.withColumn("MRR_Value",CalculateMRRValueUDF("top_recommendations","HitsList"))
df_tfidf_evaluation_Kis10.show()

In [None]:
#When K=30
df_tfidf_evaluation_Kis30 = calculatePrecisionAndRecallMetrics(df_tfidf_evaluationMetrics,30)
df_tfidf_evaluation_Kis30 = df_tfidf_evaluation_Kis30.withColumn("MRR_Value",CalculateMRRValueUDF("top_recommendations","HitsList"))
df_tfidf_evaluation_Kis30.show()

In [66]:
#Metrics Average when K=5 for the 20 users

df_lda_evaluation_Kis5.agg({'Precision_Value': 'avg','Recall_Value': 'avg','MRR_Value': 'avg'}).show()

df_tfidf_evaluation_Kis5.agg({'Precision_Value': 'avg','Recall_Value': 'avg','MRR_Value': 'avg'}).show()



+--------------+--------------------+-----------------+
|avg(MRR_Value)|avg(Precision_Value)|avg(Recall_Value)|
+--------------+--------------------+-----------------+
|           0.0|                 0.0|              0.0|
+--------------+--------------------+-----------------+



                                                                                

In [None]:
#Metrics Average when K=10 for the 20 users

df_lda_evaluation_Kis10.agg({'Precision_Value': 'avg','Recall_Value': 'avg','MRR_Value': 'avg'}).show()

df_tfidf_evaluation_Kis10.agg({'Precision_Value': 'avg','Recall_Value': 'avg','MRR_Value': 'avg'}).show()

In [None]:
#Metrics Average when K=30 for the 20 users

df_lda_evaluation_Kis10.agg({'Precision_Value': 'avg','Recall_Value': 'avg','MRR_Value': 'avg'}).show()

df_tfidf_evaluation_Kis10.agg({'Precision_Value': 'avg','Recall_Value': 'avg','MRR_Value': 'avg'}).show()

#NOTE

Was not able to find the offline evaluation metrics for all the values of @K due to the memory issue/Processor throtling of my laptop for tf-idf however tried running on small dataset and it did work.