In [1]:
#Import all required packages
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.window import Window
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer,HashingTF
from pyspark.ml import Pipeline
import re
from nltk.stem import PorterStemmer
from pyspark.ml.linalg import Vectors
from pyspark.mllib.clustering import LDA
from pyspark.ml.feature import IDF

In [2]:
# Initialise Spark Session
spark = SparkSession.builder.appName("Experiment4").config("spark.sql.broadcastTimeout", "36000").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

21/07/23 14:24:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
#PREPARING DATAFRAMES FOR DATSETS

#Authors Dataframe
#df_authors = spark.read.csv("Datasets/authors.csv", sep = ",", header = True, quote = '"')

#PaperCsv dataframe
papersCsvSchema = StructType([
    StructField("paper_id",StringType(),False),
    StructField("type",StringType(),False),
    StructField("journal",StringType(),False),
    StructField("book_title",StringType(),False),
    StructField("series",StringType(),False),
    StructField("publisher",StringType(),False),
    StructField("pages",StringType(),False),
    StructField("volume",StringType(),False),
    StructField("number",StringType(),False),
    StructField("year",StringType(),False),
    StructField("month",StringType(),False),
    StructField("postedat",StringType(),False),
    StructField("address",StringType(),False),
    StructField("title",StringType(),False),
    StructField("abstract",StringType(),False),
])
df_paperCsv = spark.read.csv("Datasets/papers.csv", sep = ",", header = False, schema = papersCsvSchema, quote = '"')

#UserLibrary dataframe
userLibrarySchema = StructType([
    StructField("user_hash_id",StringType(),False),
    StructField("user_library",StringType(),False)
])
df_userLibrary = spark.read.csv("Datasets/users_libraries.txt", sep = ";", header = False, schema = userLibrarySchema)
df_userLibrary = df_userLibrary.selectExpr("user_hash_id","split(user_library,',') AS user_library")

In [4]:
#Cleaning and Tokenizing the data
def phraseTokenization(x):
    rawPhrase = x[13] + " " + x[14] #concatenating title and abstract
    rawPhrase = rawPhrase.replace("-","") #removing - from phrase
    rawPhrase = rawPhrase.replace("_","") #removing _ from phrase
    rawPhrase = rawPhrase.strip() #removing any trailing or leading whitespaces
    
    #spliting phrase based on non-alphaNumeric characters
    phraseArray = re.split('[^a-zA-Z0-9]+',rawPhrase) 
    
    #remove words with less than 3 char
    phraseArrayFilteredWords = [i for i in phraseArray if len(i) >= 3]
    
    return (x[0],list(phraseArrayFilteredWords))


df_tokenize = df_paperCsv.na.fill(value="").rdd.map(phraseTokenization).toDF()

#Removing StopWords using ML
swRemover = StopWordsRemover(inputCol="_2", outputCol="cleaned_terms")
df_cleanedData = swRemover.transform(df_tokenize)
df_cleanedData = df_cleanedData.selectExpr("_1 AS paper_id","cleaned_terms")

#Stemming using Porter stemmer Algo
ps =  PorterStemmer()

def stemmingTerms(x):
    stemmedWords = []
    for word in x:
        rootWord = ps.stem(word)
        stemmedWords.append(rootWord)
    return stemmedWords

df_cleanedData = df_cleanedData.rdd.mapValues(stemmingTerms).toDF().selectExpr("_1 AS paper_id","_2 AS cleaned_terms")

                                                                                

In [5]:
# Find the count of papers in which the term is present
df_paperCount = df_cleanedData.selectExpr("paper_id","explode(cleaned_terms) AS terms").distinct().groupBy("terms").count().withColumnRenamed("count", "paper_count")

#10 percent of total papers present in file
noOfDistinctPapers_df = int(df_cleanedData.select(countDistinct("paper_id")).collect()[0][0])
tenPercentOfTotalPapers = int(noOfDistinctPapers_df/10)

# remove words appear in more than 10% of the papers and keep only the words that appear in at least 20 papers 
df_filterdTerms = df_paperCount.filter((df_paperCount["paper_count"]<=tenPercentOfTotalPapers) & (df_paperCount["paper_count"]>=20))

#Fetch top 1000 terms 
top1000Terms = df_filterdTerms.orderBy(col("paper_count").desc()).limit(1000)

                                                                                

In [6]:
#associate unique integer values to each term
df_termsWithUniqueIndex = top1000Terms.withColumn("unique_index",row_number().over(Window.orderBy("paper_count"))).selectExpr("terms","unique_index-1 AS unique_index")

#Collect all terms in a list
terms_collection =  [row.terms for row in df_termsWithUniqueIndex.collect()]

# Generating Termfrequency Vector for each paper
df_cleanedDataExplode = df_cleanedData.selectExpr("paper_id","explode(cleaned_terms) AS terms")

#Getting the Unique_index of term 
df_cleanedDataJoinIndex = df_cleanedDataExplode.join(df_termsWithUniqueIndex,df_cleanedDataExplode.terms == df_termsWithUniqueIndex.terms , how = "inner").select(df_cleanedDataExplode.paper_id,df_cleanedDataExplode.terms,df_termsWithUniqueIndex.unique_index)

#Getting the term_frequency in each paper
df_cleanedDataJoinIndex = df_cleanedDataJoinIndex.groupBy("paper_id","unique_index").count().withColumnRenamed("count", "term_frquency")

#Creating a sparseVector respresentation for each paper
rdd_CleanedDataReducedByPaperId = df_cleanedDataJoinIndex.rdd.map(lambda x: (x[0], [(x[1], x[2])])).reduceByKey(lambda a, b: a + b)
rdd_CleanedDataReducedByPaperId = rdd_CleanedDataReducedByPaperId.map(lambda x: (x[0],Vectors.sparse(1000,x[1])))

df_CleanedDataSparseVector = rdd_CleanedDataReducedByPaperId.toDF().selectExpr("_1 AS paper_id","_2 AS term_frequency_vector")

21/07/23 14:25:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/07/23 14:26:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/07/23 14:27:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
21/07/23 14:27:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
21/07/23 14:27:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
21/07/23 14:27:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
21/07/23 14:27:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
21/07/23 14:27:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatc

In [7]:
df_CleanedDataSparseVector.take(1)

[Row(paper_id='498902', term_frequency_vector=SparseVector(1000, {33: 3.0, 47: 1.0, 79: 1.0, 97: 1.0, 138: 1.0, 170: 6.0, 354: 1.0, 368: 1.0, 394: 1.0, 482: 2.0, 491: 1.0, 541: 1.0, 550: 1.0, 566: 1.0, 581: 1.0, 596: 1.0, 622: 1.0, 632: 1.0, 663: 1.0, 670: 1.0, 720: 2.0, 723: 1.0, 762: 1.0, 764: 1.0, 773: 1.0, 797: 1.0, 820: 1.0, 826: 1.0, 837: 2.0, 843: 1.0, 879: 1.0, 881: 1.0, 890: 1.0, 892: 1.0, 894: 1.0, 928: 1.0, 937: 1.0, 946: 2.0, 949: 3.0, 952: 1.0, 962: 1.0, 965: 1.0, 984: 1.0, 985: 4.0, 990: 1.0, 992: 1.0}))]

In [8]:
#Ex 3.2

#TF-IDF Representation for each paper

idf = IDF(inputCol="term_frequency_vector", outputCol="tf_idf_vector")
tf_idf_model = idf.fit(df_CleanedDataSparseVector)
df_rescaledCleanedData = tf_idf_model.transform(df_CleanedDataSparseVector)
df_rescaledCleanedData = df_rescaledCleanedData.select("paper_id", "tf_idf_vector")

                                                                                

In [9]:
#TF-IDF Vector for papers
df_rescaledCleanedData.take(1)

[Row(paper_id='498902', tf_idf_vector=SparseVector(1000, {33: 13.8439, 47: 4.596, 79: 4.5494, 97: 4.5325, 138: 4.4866, 170: 26.6474, 354: 4.1522, 368: 4.1302, 394: 4.0846, 482: 7.8051, 491: 3.8889, 541: 3.7983, 550: 3.7834, 566: 3.7571, 581: 3.7152, 596: 3.6986, 622: 3.6484, 632: 3.6256, 663: 3.5658, 670: 3.5395, 720: 6.7871, 723: 3.3875, 762: 3.2929, 764: 3.2917, 773: 3.2717, 797: 3.2131, 820: 3.1488, 826: 3.1306, 837: 6.153, 843: 3.0651, 879: 2.9588, 881: 2.9516, 890: 2.9076, 892: 2.9043, 894: 2.8955, 928: 2.7393, 937: 2.6945, 946: 5.2459, 949: 7.8223, 952: 2.5913, 962: 2.5231, 965: 2.5148, 984: 2.3717, 985: 9.4584, 990: 2.328, 992: 2.3177}))]

In [10]:
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA

# Latent Direchlet Allocation

num_topics = 40

# Transform data into LDA supported format
df_lda_format = df_CleanedDataSparseVector.selectExpr("paper_id AS id","term_frequency_vector AS features")

lda = LDA(k=40)
lda_model = lda.fit(df_lda_format)
df_lda_paper_topic_model = lda_model.transform(df_lda_format)

21/07/23 14:27:54 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/07/23 14:27:54 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [11]:
#LDA Vector for Papers
df_lda_paper_topic_model.show()

+--------+--------------------+--------------------+
|      id|            features|   topicDistribution|
+--------+--------------------+--------------------+
|  498902|(1000,[33,47,79,9...|[4.03650087749653...|
| 1287740|(1000,[79,238,260...|[5.29822334292367...|
| 1727709|(1000,[24,55,80,1...|[4.62375661378785...|
| 1857331|(1000,[61,62,169,...|[0.22429080108466...|
|  201593|(1000,[38,90,104,...|[2.73423551399914...|
| 2090908|(1000,[177,280,44...|[0.23517238339902...|
|   23055|(1000,[24,56,225,...|[4.10161835414327...|
|  383220|(1000,[93,104,114...|[5.65153260076279...|
| 9106608|(1000,[8,27,70,89...|[2.11897089729369...|
|  460407|(1000,[45,101,139...|[2.67666537728423...|
| 2707871|(1000,[0,1,2,4,7,...|[2.14922757491267...|
| 2798913|(1000,[1,55,56,65...|[4.10161835414327...|
|  423550|(1000,[32,54,232,...|[4.03650087749653...|
| 6500865|(1000,[31,140,194...|[5.52864108584840...|
| 2945717|(1000,[217,229,24...|[0.08378920976069...|
|  674581|(1000,[3,26,154,1...|[4.986489678452

In [12]:
#User Profiling

#1)  produces a user profile for each user as the summation of the TF-IDF vectors of the papers that appear in the user’s library

df_userLibrary_explode = df_userLibrary.selectExpr("user_hash_id","explode(user_library) AS paper_id")
df_userJoined_TfIdf = df_userLibrary_explode.join(df_rescaledCleanedData,df_userLibrary_explode.paper_id == df_rescaledCleanedData.paper_id, how="inner").select(df_userLibrary_explode.user_hash_id,df_userLibrary_explode.paper_id,df_rescaledCleanedData.tf_idf_vector)

In [13]:
import collections

# Adding 2 sparse vectors
def addSparseVectors(v1, v2):
    values = collections.defaultdict(float) # Initialize Dictionary with default value 0.0
    
    # Add values from v1 SparseVector
    for i in range(v1.indices.size):
        values[v1.indices[i]] += v1.values[i]
    # Add values from v2 SParseVector
    for i in range(v2.indices.size):
        values[v2.indices[i]] += v2.values[i]
    return Vectors.sparse(v1.size, dict(values))

#final Df : summation of TF-IDF vector for each userlibrary
df_userprofile_tfidf = df_userJoined_TfIdf.selectExpr("user_hash_id","tf_idf_vector").rdd.reduceByKey(lambda x,y: addSparseVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_tf_idf_vector")

                                                                                

In [14]:
df_userprofile_tfidf.take(1)

                                                                                

[Row(user_hash_id='6931f7f79678cf72aae416ff7cb43bb1', sum_tf_idf_vector=SparseVector(1000, {0: 13.9931, 1: 23.287, 3: 9.3123, 4: 37.249, 5: 4.6555, 6: 4.6536, 7: 32.5708, 10: 9.2996, 11: 4.6492, 12: 23.2428, 13: 4.6467, 14: 41.8145, 18: 9.2734, 20: 4.633, 22: 4.6305, 23: 41.6692, 24: 4.6299, 26: 97.1893, 27: 9.2561, 28: 37.0147, 29: 4.6256, 31: 4.6238, 33: 4.6146, 34: 13.8384, 35: 4.6122, 36: 13.8293, 37: 23.0429, 38: 13.8257, 39: 9.2147, 41: 4.6044, 42: 9.2087, 47: 18.3839, 48: 9.192, 51: 13.7737, 52: 4.5889, 53: 4.5889, 55: 13.756, 56: 4.583, 57: 9.1601, 60: 13.7331, 62: 13.7243, 65: 4.5707, 67: 9.1298, 68: 4.5643, 70: 13.6809, 72: 173.1171, 74: 95.6219, 75: 36.4183, 76: 9.1034, 77: 4.5511, 78: 4.55, 79: 13.6483, 80: 9.0989, 81: 9.0977, 84: 4.5466, 86: 40.8787, 87: 18.166, 88: 4.5415, 89: 4.5409, 92: 27.2389, 93: 72.5921, 94: 31.7512, 95: 4.5353, 99: 18.1279, 100: 9.0628, 102: 9.0484, 104: 4.5236, 105: 9.045, 107: 9.0362, 110: 4.5153, 111: 13.5444, 113: 4.5148, 115: 9.023, 116: 9.023

In [15]:
#2  LDA-based profiles for each user as the summation of the paper-topics vectors of the papers 

df_userJoined_LDA = df_userLibrary_explode.join(df_lda_paper_topic_model,df_userLibrary_explode.paper_id == df_lda_paper_topic_model.id, how="inner").select(df_userLibrary_explode.user_hash_id,df_userLibrary_explode.paper_id,df_lda_paper_topic_model.topicDistribution)

In [16]:
#function to add 2 LDA dense vectors
def addLDAPaperTopicVectors(v1,v2):
    return (v1+v2)
    
df_userprofile_lda = df_userJoined_LDA.selectExpr("user_hash_id","topicDistribution").rdd.reduceByKey(lambda x,y: addLDAPaperTopicVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_lda_vector")

                                                                                

In [17]:
df_userprofile_lda.take(1)

[Row(user_hash_id='6931f7f79678cf72aae416ff7cb43bb1', sum_lda_vector=DenseVector([0.1754, 1.0947, 0.1784, 0.3062, 0.1359, 3.9344, 0.4657, 0.1238, 0.5484, 0.3686, 0.4487, 0.4528, 0.6498, 0.1356, 0.464, 0.2493, 0.7315, 0.3416, 0.381, 1.4015, 0.3333, 0.9788, 0.4322, 0.1196, 0.1184, 16.3355, 0.5066, 0.929, 0.9196, 0.5298, 0.9098, 6.4886, 0.1819, 0.933, 0.119, 0.1212, 9.7864, 0.3584, 15.1234, 0.1883]))]

In [18]:
#Ex 3.5 
# 1: Randomly selects n users
fraction_of_users = 0.5
df_sampleUsers = df_userLibrary.sample(withReplacement=False, fraction=fraction_of_users)


In [19]:
#Ex3.5
#2 Divide sampled data into raining set and test set for each user
import math
import random

def divideList(masterList,trainingSetFraction):
    size_masterList = len(masterList)
    size_TrainingList = int(math.ceil(trainingSetFraction * size_masterList))
    trainingList = masterList[:size_TrainingList]
    testList = masterList[size_TrainingList:]
    return [trainingList,testList]

df_divide_train_test_data = df_sampleUsers.rdd.map(lambda x: (x[0],divideList(x[1],0.8))).toDF()
df_divide_train_test_data = df_divide_train_test_data.select(df_divide_train_test_data._1.alias("user_hash_id"),df_divide_train_test_data._2[0].alias("training_data"),df_divide_train_test_data._2[1].alias("test_data"))

df_training_data = df_divide_train_test_data.selectExpr("user_hash_id","training_data")
df_test_data = df_divide_train_test_data.selectExpr("user_hash_id","test_data")


In [20]:
df_training_data.show()

+--------------------+--------------------+
|        user_hash_id|       training_data|
+--------------------+--------------------+
|f05bcffe7951de9e5...|[1158654, 478707,...|
|ca4f1ba4094011d9a...|            [278019]|
|d1d41a15201915503...|[6610569, 6493797...|
|b656009a6efdc8b1a...|[771870, 181369, ...|
|cf9c7f356092c34be...|             [90558]|
|d85f7d83f27b3f533...|[7610843, 3633347...|
|10fdfaf945d5c27ad...|           [2010550]|
|7e070a9da96672e05...|           [1071959]|
|3b715ebaf1f8f81a1...|[4119394, 3378798...|
|488fb15e8c77f8054...|[1523301, 5281566...|
|c6b59086a0bbac141...|[2230995, 3050075...|
|0ad6516296d95068c...|[2734645, 1218426...|
|f3c28e50db4ce8ad8...|[2856540, 2994495...|
|38fe6373389d12b5b...|[7276116, 255799,...|
|60a321bf89d186c88...|   [1391145, 467189]|
|b36c3189bb1457cd0...|[2270229, 6417010...|
|7c3219ff9046172ce...|[1089965, 1089982...|
|7c0081293b3988065...|[3453059, 3007833...|
|4c8912d1b04471cf5...|[3579579, 1931121...|
|1291485dbe1a86857...|[4521968, 

In [21]:
df_test_data.show()

+--------------------+--------------------+
|        user_hash_id|           test_data|
+--------------------+--------------------+
|f05bcffe7951de9e5...|[1453872, 671699,...|
|ca4f1ba4094011d9a...|                  []|
|d1d41a15201915503...|[7465494, 7329626...|
|b656009a6efdc8b1a...|[1129585, 404679,...|
|cf9c7f356092c34be...|                  []|
|d85f7d83f27b3f533...|[3585, 1458475, 1...|
|10fdfaf945d5c27ad...|                  []|
|7e070a9da96672e05...|                  []|
|3b715ebaf1f8f81a1...|[3399487, 3150261...|
|488fb15e8c77f8054...|            [352724]|
|c6b59086a0bbac141...|  [1429875, 2406965]|
|0ad6516296d95068c...|   [179911, 1127804]|
|f3c28e50db4ce8ad8...|[2881619, 2913739...|
|38fe6373389d12b5b...|[10064582, 740518...|
|60a321bf89d186c88...|                  []|
|b36c3189bb1457cd0...|               [151]|
|7c3219ff9046172ce...|[2301235, 1089946...|
|7c0081293b3988065...|[1274492, 2765646...|
|4c8912d1b04471cf5...|[2050637, 100088,...|
|1291485dbe1a86857...|  [1442986

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 643, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


In [22]:
#Ex 3.5

#Creating User profiles for Training data set
df_training_data_explode = df_training_data.selectExpr("user_hash_id","explode(training_data) AS training_data_paper_id")

#a- user profile using sampled users data over Tf-Idf vector

df_training_data_TfIdf = df_training_data_explode.join(df_rescaledCleanedData,df_training_data_explode.training_data_paper_id == df_rescaledCleanedData.paper_id, how="inner").select(df_training_data_explode.user_hash_id,df_training_data_explode.training_data_paper_id,df_rescaledCleanedData.tf_idf_vector)
df_training_data_userprofile_tfidf = df_training_data_TfIdf.selectExpr("user_hash_id","tf_idf_vector").rdd.reduceByKey(lambda x,y: addSparseVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_tf_idf_vector")

                                                                                

In [23]:
#b)- User profile using sampled users data over LDA vector

df_training_data_LDA = df_training_data_explode.join(df_lda_paper_topic_model,df_training_data_explode.training_data_paper_id == df_lda_paper_topic_model.id, how="inner").select(df_training_data_explode.user_hash_id,df_training_data_explode.training_data_paper_id,df_lda_paper_topic_model.topicDistribution)
def addLDAPaperTopicVectors(v1,v2):
    return (v1+v2)
    
df_training_data_userprofile_lda = df_training_data_LDA.selectExpr("user_hash_id","topicDistribution").rdd.reduceByKey(lambda x,y: addLDAPaperTopicVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_lda_vector")

                                                                                

In [24]:
df_training_data_userprofile_lda.take(1)

[Row(user_hash_id='f0fe2969ce44deace9595fabd7c12bdf', sum_lda_vector=DenseVector([23.6102, 1.6307, 12.4924, 11.9339, 2.5502, 3.2708, 40.4139, 11.0825, 8.5738, 5.9339, 7.6117, 1.6016, 2.4992, 6.6816, 25.7266, 8.4814, 14.6735, 7.1139, 6.0141, 5.8428, 14.739, 4.3305, 37.0701, 5.0813, 5.5494, 1.3507, 6.554, 4.8049, 4.6017, 4.5837, 11.1863, 4.1229, 4.1246, 7.6146, 5.3945, 3.5363, 1.3626, 5.0872, 1.393, 4.7739]))]

In [25]:
#Ex 4.2 Funtion to calculate cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

# UDF for calculating cosine similarity

#def cos_similarity(a,b):
#    cosSimValue = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
#    return cosSimValue

def calculateCosineSimilarity(userVector,paperVector):
    cosineSimilarityValue = float(cosine_similarity([userVector],[paperVector])[0,0])
    return cosineSimilarityValue


In [41]:
#Ex 4.3 
#a)

#Function for Content Based Recommendation on TF-IDF User Profile
def tf_idf_CBRS(userId,numberOfRecommendation):
    
    #Fetch the records for the Particular User along with User Library
    df_UserRecord = df_userLibrary.filter(df_userLibrary.user_hash_id == userId)
    
    #Fetching the User TF-IDF 
    df_UserRecord = df_UserRecord.join(df_userprofile_tfidf, df_UserRecord.user_hash_id == df_userprofile_tfidf.user_hash_id, how="inner").select(df_UserRecord.user_hash_id,df_userprofile_tfidf.sum_tf_idf_vector,df_UserRecord.user_library)
    
    #Fetching the Papers along with their TF-IDF excluding the papers which are already present in user Library
    list_userPaperLibrary = list(df_UserRecord.collect()[0][2])
    df_DeltaPapers = df_rescaledCleanedData.filter(~df_rescaledCleanedData.paper_id.isin(list_userPaperLibrary))
    
    #Repartioning the data
    df_UserRecord = df_UserRecord.repartition(40)
    df_DeltaPapers = df_DeltaPapers.repartition(40)
    
    #Cross Join User with each paper
    df_User_XJoin_Papers = df_UserRecord.selectExpr("user_hash_id","sum_tf_idf_vector AS user_tf_idf_vector").crossJoin(df_DeltaPapers)
    
    #Calculate the Cosine Similarity
    df_cosine_similarity_result = df_User_XJoin_Papers.rdd.map(lambda x: (x[0],x[2],calculateCosineSimilarity(x[1],x[3]))).toDF(schema=['user_hash_id', 'paper_id', 'cosine_similarity_value'])
    
    #Fetch the Top K recommendations for the User
    df_top_k_recommendations = df_cosine_similarity_result.orderBy(col("cosine_similarity_value").desc()).limit(numberOfRecommendation)
    df_top_k_recommendations = df_top_k_recommendations.groupBy("user_hash_id").agg(collect_list("paper_id").alias("recommended_library"),collect_list("cosine_similarity_value").alias("cosine_similarity_recommended_library"))
    
    return df_top_k_recommendations

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)


In [42]:
#Ex4.3 b)

#Function for Content Based Recommendation on LDA User Profile
def lda_CBRS(userId,numberOfRecommendation):
    
    #Fetch the records for the Particular User along with User Library
    df_UserRecord = df_userLibrary.filter(df_userLibrary.user_hash_id == userId)
    
    #Fetching the User LDA Vector 
    df_UserRecord = df_UserRecord.join(df_userprofile_lda, df_UserRecord.user_hash_id == df_userprofile_lda.user_hash_id, how="inner").select(df_UserRecord.user_hash_id,df_userprofile_lda.sum_lda_vector,df_UserRecord.user_library)
    
    #Fetching the Papers along with their LDA excluding the papers which are already present in user Library
    list_userPaperLibrary = list(df_UserRecord.collect()[0][2])
    df_DeltaPapers = df_lda_paper_topic_model.filter(~df_lda_paper_topic_model.id.isin(list_userPaperLibrary)).selectExpr("id AS paper_id","topicDistribution")
    
    #Repartioning the data
    df_UserRecord = df_UserRecord.repartition(40)
    df_DeltaPapers = df_DeltaPapers.repartition(40)
    
    #Cross Join User with each paper
    df_User_XJoin_Papers = df_UserRecord.selectExpr("user_hash_id","sum_lda_vector AS user_lda_vector").crossJoin(df_DeltaPapers)
    
    #Calculate the Cosine Similarity
    df_cosine_similarity_result = df_User_XJoin_Papers.rdd.map(lambda x: (x[0],x[2],calculateCosineSimilarity(x[1],x[3]))).toDF(schema=['user_hash_id', 'paper_id', 'cosine_similarity_value'])
    
    #Fetch the Top K recommendations for the User
    df_top_k_recommendations = df_cosine_similarity_result.orderBy(col("cosine_similarity_value").desc()).limit(numberOfRecommendation)
    #list_top_k_recommendations = list(df_top_k_recommendations.selectExpr("paper_id").toPandas()['paper_id'])
    df_top_k_recommendations = df_top_k_recommendations.groupBy("user_hash_id").agg(collect_list("paper_id").alias("recommended_library"),collect_list("cosine_similarity_value").alias("cosine_similarity_recommended_library"))
    
    return df_top_k_recommendations

In [43]:
#Ex 4.3 c) Top K recommendations for User = 1eac022a97d683eace8815545ce3153f

user_hash_id = '1eac022a97d683eace8815545ce3153f'
k = 10 #No Of Recommendation

#Recommendation using TF-IDF Vector
recommendation_with_tf_idf=tf_idf_CBRS(user_hash_id,k)


                                                                                

In [49]:
#Top 10 Recommendation for User:1eac022a97d683eace8815545ce3153f with TF-IDF 
recommendation_with_tf_idf.show(truncate=False)



+--------------------------------+------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_hash_id                    |recommended_library                                                                       |cosine_similarity_recommended_library                                                                                                                                                                 |
+--------------------------------+------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1eac022a97d683eace881554

                                                                                

In [45]:
## Recommendation using LDA vector
recommendation_with_lda=lda_CBRS(user_hash_id,k)


                                                                                

In [50]:
#Top 10 Recommendation for User:1eac022a97d683eace8815545ce3153f with LDA
recommendation_with_lda.show(truncate=False)



+--------------------------------+------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_hash_id                    |recommended_library                                                                 |cosine_similarity_recommended_library                                                                                                                                                                 |
+--------------------------------+------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1eac022a97d683eace8815545ce3153f|[1667053,

                                                                                