In [1]:
#Import all required packages
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.window import Window
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer,HashingTF
from pyspark.ml import Pipeline
import re
from nltk.stem import PorterStemmer
from pyspark.ml.linalg import Vectors
from pyspark.mllib.clustering import LDA
from pyspark.ml.feature import IDF

In [2]:
# Initialise Spark Session
spark = SparkSession.builder.appName("Experiment3").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [3]:
#PREPARING DATAFRAMES FOR DATSETS

#Authors Dataframe
#df_authors = spark.read.csv("Datasets/authors.csv", sep = ",", header = True, quote = '"')

#PaperCsv dataframe
papersCsvSchema = StructType([
    StructField("paper_id",StringType(),False),
    StructField("type",StringType(),False),
    StructField("journal",StringType(),False),
    StructField("book_title",StringType(),False),
    StructField("series",StringType(),False),
    StructField("publisher",StringType(),False),
    StructField("pages",StringType(),False),
    StructField("volume",StringType(),False),
    StructField("number",StringType(),False),
    StructField("year",StringType(),False),
    StructField("month",StringType(),False),
    StructField("postedat",StringType(),False),
    StructField("address",StringType(),False),
    StructField("title",StringType(),False),
    StructField("abstract",StringType(),False),
])
df_paperCsv = spark.read.csv("Datasets/papers.csv", sep = ",", header = False, schema = papersCsvSchema, quote = '"')

#UserLibrary dataframe
userLibrarySchema = StructType([
    StructField("user_hash_id",StringType(),False),
    StructField("user_library",StringType(),False)
])
df_userLibrary = spark.read.csv("Datasets/users_libraries.txt", sep = ";", header = False, schema = userLibrarySchema)
df_userLibrary = df_userLibrary.selectExpr("user_hash_id","split(user_library,',') AS user_library")


##Keywords dataframe
#
#keywordsSchema = StructType([
#    StructField("paper_id",StringType(),False),
#    StructField("keyword",StringType(),False)
#])
#
#df_keywords = spark.read.csv("Datasets/keywords.csv", sep = ",", header = True, schema = keywordsSchema, quote = '"')
#
##Terms Dataframe
#terms_df = spark.read.csv("Datasets/terms.txt", header = True)
#
##Stopword Broadcast
#stopWords = sc.textFile("Datasets/stopwords_en.txt")
#stopWordsBroadcast = sc.broadcast(stopWords.collect())
#
##Papers Terms Dataframe
#def parse_papers_count(line):
#
#    if not line:
#        return dict()
#    papers_count_raw = line.split(' ')
#    papers_count = dict()
#    for pcRaw in papers_count_raw:
#        paper, count = pcRaw.split(':')
#        papers_count[paper] = int(count)
#    return papers_count
#
#papers_vocab = spark.read.format("csv").option("header", "true").load("Datasets/papers_terms.txt").rdd
#papers_vocab = papers_vocab.map(lambda x: (x[0], parse_papers_count(x[1]))).toDF().selectExpr('_1 AS paper_id','_2 AS term_count')    


In [4]:
#Ex 3.1: 1,2,3,4,5,6

#Cleaning and Tokenizing the data
def phraseTokenization(x):
    rawPhrase = x[13] + " " + x[14] #concatenating title and abstract
    rawPhrase = rawPhrase.replace("-","") #removing - from phrase
    rawPhrase = rawPhrase.replace("_","") #removing _ from phrase
    rawPhrase = rawPhrase.strip() #removing any trailing or leading whitespaces
    
    #spliting phrase based on non-alphaNumeric characters
    phraseArray = re.split('[^a-zA-Z0-9]+',rawPhrase) 
    
    #remove words with less than 3 char
    phraseArrayFilteredWords = [i for i in phraseArray if len(i) >= 3]
    
    return (x[0],list(phraseArrayFilteredWords))


df_tokenize = df_paperCsv.na.fill(value="").rdd.map(phraseTokenization).toDF()

#Removing StopWords using ML
swRemover = StopWordsRemover(inputCol="_2", outputCol="cleaned_terms")
df_cleanedData = swRemover.transform(df_tokenize)
df_cleanedData = df_cleanedData.selectExpr("_1 AS paper_id","cleaned_terms")

#Stemming using Porter stemmer Algo
ps =  PorterStemmer()

def stemmingTerms(x):
    stemmedWords = []
    for word in x:
        rootWord = ps.stem(word)
        stemmedWords.append(rootWord)
    return stemmedWords

df_cleanedData = df_cleanedData.rdd.mapValues(stemmingTerms).toDF().selectExpr("_1 AS paper_id","_2 AS cleaned_terms")

In [5]:
#Ex 3.1 - 7,8
# Find the count of papers in which the term is present
df_paperCount = df_cleanedData.selectExpr("paper_id","explode(cleaned_terms) AS terms").distinct().groupBy("terms").count().withColumnRenamed("count", "paper_count")

#10 percent of total papers present in file
noOfDistinctPapers_df = int(df_cleanedData.select(countDistinct("paper_id")).collect()[0][0])
tenPercentOfTotalPapers = int(noOfDistinctPapers_df/10)

# remove words appear in more than 10% of the papers and keep only the words that appear in at least 20 papers 
df_filterdTerms = df_paperCount.filter((df_paperCount["paper_count"]<=tenPercentOfTotalPapers) & (df_paperCount["paper_count"]>=20))

#Fetch top 1000 terms 
top1000Terms = df_filterdTerms.orderBy(col("paper_count").desc()).limit(1000)

In [6]:
#Ex 3.1-1,2,3

#associate unique integer values to each term
df_termsWithUniqueIndex = top1000Terms.withColumn("unique_index",row_number().over(Window.orderBy("paper_count"))).selectExpr("terms","unique_index-1 AS unique_index")

#Collect all terms in a list
terms_collection =  [row.terms for row in df_termsWithUniqueIndex.collect()]

# Generating Termfrequency Vector for each paper
df_cleanedDataExplode = df_cleanedData.selectExpr("paper_id","explode(cleaned_terms) AS terms")

#Getting the Unique_index of term 
df_cleanedDataJoinIndex = df_cleanedDataExplode.join(df_termsWithUniqueIndex,df_cleanedDataExplode.terms == df_termsWithUniqueIndex.terms , how = "inner").select(df_cleanedDataExplode.paper_id,df_cleanedDataExplode.terms,df_termsWithUniqueIndex.unique_index)

#Getting the term_frequency in each paper
df_cleanedDataJoinIndex = df_cleanedDataJoinIndex.groupBy("paper_id","unique_index").count().withColumnRenamed("count", "term_frquency")

#Creating a sparseVector respresentation for each paper
rdd_CleanedDataReducedByPaperId = df_cleanedDataJoinIndex.rdd.map(lambda x: (x[0], [(x[1], x[2])])).reduceByKey(lambda a, b: a + b)
rdd_CleanedDataReducedByPaperId = rdd_CleanedDataReducedByPaperId.map(lambda x: (x[0],Vectors.sparse(1000,x[1])))

df_CleanedDataSparseVector = rdd_CleanedDataReducedByPaperId.toDF().selectExpr("_1 AS paper_id","_2 AS term_frequency_vector")

In [7]:
df_CleanedDataSparseVector.take(1)

[Row(paper_id='498902', term_frequency_vector=SparseVector(1000, {33: 3.0, 47: 1.0, 79: 1.0, 97: 1.0, 138: 1.0, 170: 6.0, 354: 1.0, 368: 1.0, 394: 1.0, 482: 2.0, 491: 1.0, 541: 1.0, 550: 1.0, 566: 1.0, 581: 1.0, 596: 1.0, 622: 1.0, 632: 1.0, 663: 1.0, 670: 1.0, 720: 2.0, 723: 1.0, 762: 1.0, 764: 1.0, 773: 1.0, 797: 1.0, 820: 1.0, 826: 1.0, 837: 2.0, 843: 1.0, 879: 1.0, 881: 1.0, 890: 1.0, 892: 1.0, 894: 1.0, 928: 1.0, 937: 1.0, 946: 2.0, 949: 3.0, 952: 1.0, 962: 1.0, 965: 1.0, 984: 1.0, 985: 4.0, 990: 1.0, 992: 1.0}))]

In [8]:
#Ex 3.2

#TF-IDF Representation for each paper

idf = IDF(inputCol="term_frequency_vector", outputCol="tf_idf_vector")
tf_idf_model = idf.fit(df_CleanedDataSparseVector)
df_rescaledCleanedData = tf_idf_model.transform(df_CleanedDataSparseVector)
df_rescaledCleanedData = df_rescaledCleanedData.select("paper_id", "tf_idf_vector")

In [9]:
df_rescaledCleanedData.show()

+--------+--------------------+
|paper_id|       tf_idf_vector|
+--------+--------------------+
|  498902|(1000,[33,47,79,9...|
|  201593|(1000,[38,90,104,...|
| 1727709|(1000,[24,55,80,1...|
|10101645|(1000,[244,246,29...|
|10886724|(1000,[66,96,147,...|
|12197482|(1000,[28,39,131,...|
| 1287740|(1000,[79,238,260...|
|  168969|(1000,[0,174,196,...|
| 1857331|(1000,[61,62,169,...|
| 1866283|(1000,[134,214,23...|
| 2090908|(1000,[177,280,44...|
|   23055|(1000,[24,56,225,...|
| 2323621|(1000,[119,162,26...|
| 2594134|(1000,[69,76,90,1...|
| 2707871|(1000,[0,1,2,4,7,...|
| 2798913|(1000,[1,55,56,65...|
| 2945717|(1000,[217,229,24...|
|  306396|(1000,[89,150,202...|
| 3299623|(1000,[157,160,19...|
|  383220|(1000,[93,104,114...|
+--------+--------------------+
only showing top 20 rows



In [10]:
#Ex 3.3

#from pyspark.mllib.linalg import Vectors as MLlibVectors
#
## Latent Direchlet Allocation
#
#num_topics = 40
#
## Transform data into LDA supported format
#rdd_lda_format = df_CleanedDataSparseVector.rdd.mapValues(MLlibVectors.fromML).map(lambda x: [int(x[0]),x[1]])
#
##Train the LDA Model
#lda_model = LDA.train(rdd_lda_format, k=num_topics)

from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA

# Latent Direchlet Allocation

num_topics = 40

# Transform data into LDA supported format
df_lda_format = df_CleanedDataSparseVector.selectExpr("paper_id AS id","term_frequency_vector AS features")

lda = LDA(k=40)
lda_model = lda.fit(df_lda_format)

In [11]:
df_lda_format.show()

+--------+--------------------+
|      id|            features|
+--------+--------------------+
|  498902|(1000,[33,47,79,9...|
|  201593|(1000,[38,90,104,...|
| 1727709|(1000,[24,55,80,1...|
|10101645|(1000,[244,246,29...|
|10886724|(1000,[66,96,147,...|
|12197482|(1000,[28,39,131,...|
| 1287740|(1000,[79,238,260...|
|  168969|(1000,[0,174,196,...|
| 1857331|(1000,[61,62,169,...|
| 1866283|(1000,[134,214,23...|
| 2090908|(1000,[177,280,44...|
|   23055|(1000,[24,56,225,...|
| 2323621|(1000,[119,162,26...|
| 2594134|(1000,[69,76,90,1...|
| 2707871|(1000,[0,1,2,4,7,...|
| 2798913|(1000,[1,55,56,65...|
| 2945717|(1000,[217,229,24...|
|  306396|(1000,[89,150,202...|
| 3299623|(1000,[157,160,19...|
|  383220|(1000,[93,104,114...|
+--------+--------------------+
only showing top 20 rows



In [12]:
#Ex 3.3
##Find the top 5 term of each extracted Topic

#topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = 5))
#
#def Get_TopicTerms(topic):
#    termsId_array = topic[0] 
#    terms_array = []
#    
#    #Finding the term from the their corresponding Unique index
#    for i in range(5):
#        term = terms_collection[termsId_array[i]]
#        terms_array.append(term)
#    return terms_array
#
#topics_Final = topicIndices.map(Get_TopicTerms).collect()

#Find the top 5 term of each extracted Topic
topicIndices = lda_model.describeTopics(maxTermsPerTopic = 5)

def Get_TopicTerms(topic):
    termsId_array = topic[1] 
    terms_array = []
    
    #Finding the term from the their corresponding Unique index
    for i in range(5):
        term = terms_collection[termsId_array[i]]
        terms_array.append(term)
    return (topic[0],terms_array)

topics_Final = topicIndices.rdd.map(Get_TopicTerms).toDF().selectExpr("_1 AS topic","_2 AS top_5_terms")

df_lda_paper_topic_model = lda_model.transform(df_lda_format) # preparing it for Ex 3.5

In [13]:
topics_Final.show()

+-----+--------------------+
|topic|         top_5_terms|
+-----+--------------------+
|    0|[program, behavio...|
|    1|[technolog, digit...|
|    2|[group, children,...|
|    3|[chang, climat, t...|
|    4|[protein, simul, ...|
|    5|[electron, atom, ...|
|    6|[protein, interac...|
|    7|[theori, mathemat...|
|    8|[gene, express, n...|
|    9|[time, mine, extr...|
|   10|[social, knowledg...|
|   11|[diseas, patient,...|
|   12|[estim, statist, ...|
|   13|[read, sequenc, m...|
|   14|[evolut, anim, re...|
|   15|[tool, set, avail...|
|   16|[scienc, environm...|
|   17|[measur, phase, l...|
|   18|[genom, dna, sequ...|
|   19|[memori, state, q...|
+-----+--------------------+
only showing top 20 rows



In [14]:
#Ex3.4

#User Profiling

#1)  produces a user profile for each user as the summation of the TF-IDF vectors of the papers that appear in the user’s library

df_userLibrary_explode = df_userLibrary.selectExpr("user_hash_id","explode(user_library) AS paper_id")
df_userJoined_TfIdf = df_userLibrary_explode.join(df_rescaledCleanedData,df_userLibrary_explode.paper_id == df_rescaledCleanedData.paper_id, how="inner").select(df_userLibrary_explode.user_hash_id,df_userLibrary_explode.paper_id,df_rescaledCleanedData.tf_idf_vector)

In [15]:
import collections

# Adding 2 sparse vectors
def addSparseVectors(v1, v2):
    values = collections.defaultdict(float) # Initialize Dictionary with default value 0.0
    
    # Add values from v1 SparseVector
    for i in range(v1.indices.size):
        values[v1.indices[i]] += v1.values[i]
    # Add values from v2 SParseVector
    for i in range(v2.indices.size):
        values[v2.indices[i]] += v2.values[i]
    return Vectors.sparse(v1.size, dict(values))

#final Df : summation of TF-IDF vector for each userlibrary
df_userprofile_tfidf = df_userJoined_TfIdf.selectExpr("user_hash_id","tf_idf_vector").rdd.reduceByKey(lambda x,y: addSparseVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_tf_idf_vector")

In [16]:
df_userprofile_tfidf.take(1)

[Row(user_hash_id='6931f7f79678cf72aae416ff7cb43bb1', sum_tf_idf_vector=SparseVector(1000, {0: 13.9931, 1: 23.287, 3: 9.3123, 4: 37.249, 5: 4.6555, 6: 4.6536, 7: 32.5708, 10: 9.2996, 11: 4.6492, 12: 23.2428, 13: 4.6467, 14: 41.8145, 18: 9.2734, 20: 4.633, 22: 4.6305, 23: 41.6692, 24: 4.6299, 26: 97.1893, 27: 9.2561, 28: 37.0147, 29: 4.6256, 31: 4.6238, 33: 4.6146, 34: 13.8384, 35: 4.6122, 36: 13.8293, 37: 23.0429, 38: 13.8257, 39: 9.2147, 41: 4.6044, 42: 9.2087, 47: 18.3839, 48: 9.192, 51: 13.7737, 52: 4.5889, 53: 4.5889, 55: 13.756, 56: 4.583, 57: 9.1601, 60: 13.7331, 62: 13.7243, 65: 4.5707, 67: 9.1298, 68: 4.5643, 70: 13.6809, 72: 173.1171, 74: 95.6219, 75: 36.4183, 76: 9.1034, 77: 4.5511, 78: 4.55, 79: 13.6483, 80: 9.0989, 81: 9.0977, 84: 4.5466, 86: 40.8787, 87: 18.166, 88: 4.5415, 89: 4.5409, 92: 27.2389, 93: 72.5921, 94: 31.7512, 95: 4.5353, 99: 18.1279, 100: 9.0628, 102: 9.0484, 104: 4.5236, 105: 9.045, 107: 9.0362, 110: 4.5153, 111: 13.5444, 113: 4.5148, 115: 9.023, 116: 9.023

In [17]:
#2  LDA-based profiles for each user as the summation of the paper-topics vectors of the papers 

df_userJoined_LDA = df_userLibrary_explode.join(df_lda_paper_topic_model,df_userLibrary_explode.paper_id == df_lda_paper_topic_model.id, how="inner").select(df_userLibrary_explode.user_hash_id,df_userLibrary_explode.paper_id,df_lda_paper_topic_model.topicDistribution)

In [18]:
#function to add 2 LDA dense vectors
def addLDAPaperTopicVectors(v1,v2):
    return (v1+v2)
    
df_userprofile_lda = df_userJoined_LDA.selectExpr("user_hash_id","topicDistribution").rdd.reduceByKey(lambda x,y: addLDAPaperTopicVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_lda_vector")

In [19]:
df_userprofile_lda.take(1)

[Row(user_hash_id='6931f7f79678cf72aae416ff7cb43bb1', sum_lda_vector=DenseVector([0.1391, 1.7382, 0.3115, 0.2975, 0.5515, 0.2997, 0.5655, 0.1219, 4.5555, 0.1737, 0.2066, 9.4012, 0.3132, 4.2154, 1.0348, 0.2011, 0.386, 0.4549, 6.7044, 0.2093, 0.5311, 0.232, 0.9089, 0.5781, 0.1228, 0.1498, 4.3661, 0.3744, 1.9279, 9.1491, 0.1199, 0.1232, 0.9832, 0.6777, 0.1578, 0.4714, 0.1201, 14.4807, 0.28, 0.3647]))]

In [20]:
#Ex 3.5 
# 1: Randomly selects n users
fraction_of_users = 0.5
df_sampleUsers = df_userLibrary.sample(withReplacement=False, fraction=fraction_of_users)


In [21]:
df_sampleUsers.show()

+--------------------+--------------------+
|        user_hash_id|        user_library|
+--------------------+--------------------+
|d1d41a15201915503...|[6610569, 6493797...|
|f2f77383828ea6d39...|[943458, 238121, ...|
|9c883d02115400f7b...|[3509971, 3509965...|
|cf9c7f356092c34be...|             [90558]|
|0f5cbb39410a9278f...|           [9344598]|
|d85f7d83f27b3f533...|[7610843, 3633347...|
|586c867a0688250ac...|[464760, 466011, ...|
|589b870a611c25fa9...|[1283233, 1305474...|
|90f1a3e6fcdbf9bc5...|[115945, 11733005...|
|3b715ebaf1f8f81a1...|[4119394, 3378798...|
|0ad6516296d95068c...|[2734645, 1218426...|
|26b170a77a1a910b3...|[666773, 299199, ...|
|ed571b13a83199c9c...|[11056916, 12799962]|
|e17a1c14ffca94104...|[607999, 758852, ...|
|7c3219ff9046172ce...|[1089965, 1089982...|
|8a1d54402eddebee0...|[3408594, 3408584...|
|51656fa1c9a7e0412...|[927126, 967059, ...|
|bbcd9dae3160ddcb9...|[898519, 6929229,...|
|0fe2197fba44d2f76...|[2679179, 3251044...|
|8d29b0f1541d8b46a...|[2324796, 

In [22]:
#Ex3.5
#2 Divide sampled data into raining set and test set for each user
import math
import random

def divideList(masterList,trainingSetFraction):
    size_masterList = len(masterList)
    size_TrainingList = int(math.ceil(trainingSetFraction * size_masterList))
    trainingList = masterList[:size_TrainingList]
    testList = masterList[size_TrainingList:]
    return [trainingList,testList]

df_divide_train_test_data = df_sampleUsers.rdd.map(lambda x: (x[0],divideList(x[1],0.8))).toDF()
df_divide_train_test_data = df_divide_train_test_data.select(df_divide_train_test_data._1.alias("user_hash_id"),df_divide_train_test_data._2[0].alias("training_data"),df_divide_train_test_data._2[1].alias("test_data"))

df_training_data = df_divide_train_test_data.selectExpr("user_hash_id","training_data")
df_test_data = df_divide_train_test_data.selectExpr("user_hash_id","test_data")


In [23]:
df_training_data.show()

+--------------------+--------------------+
|        user_hash_id|       training_data|
+--------------------+--------------------+
|d1d41a15201915503...|[6610569, 6493797...|
|f2f77383828ea6d39...|[943458, 238121, ...|
|9c883d02115400f7b...|[3509971, 3509965...|
|cf9c7f356092c34be...|             [90558]|
|0f5cbb39410a9278f...|           [9344598]|
|d85f7d83f27b3f533...|[7610843, 3633347...|
|586c867a0688250ac...|[464760, 466011, ...|
|589b870a611c25fa9...|[1283233, 1305474...|
|90f1a3e6fcdbf9bc5...|[115945, 11733005...|
|3b715ebaf1f8f81a1...|[4119394, 3378798...|
|0ad6516296d95068c...|[2734645, 1218426...|
|26b170a77a1a910b3...|[666773, 299199, ...|
|ed571b13a83199c9c...|[11056916, 12799962]|
|e17a1c14ffca94104...|[607999, 758852, ...|
|7c3219ff9046172ce...|[1089965, 1089982...|
|8a1d54402eddebee0...|[3408594, 3408584...|
|51656fa1c9a7e0412...|[927126, 967059, ...|
|bbcd9dae3160ddcb9...|[898519, 6929229,...|
|0fe2197fba44d2f76...|[2679179, 3251044...|
|8d29b0f1541d8b46a...|[2324796, 

In [24]:
df_test_data.show()

+--------------------+--------------------+
|        user_hash_id|           test_data|
+--------------------+--------------------+
|d1d41a15201915503...|[7465494, 7329626...|
|f2f77383828ea6d39...|                  []|
|9c883d02115400f7b...|           [3878624]|
|cf9c7f356092c34be...|                  []|
|0f5cbb39410a9278f...|                  []|
|d85f7d83f27b3f533...|[3585, 1458475, 1...|
|586c867a0688250ac...|                  []|
|589b870a611c25fa9...|            [967275]|
|90f1a3e6fcdbf9bc5...|           [8310458]|
|3b715ebaf1f8f81a1...|[3399487, 3150261...|
|0ad6516296d95068c...|   [179911, 1127804]|
|26b170a77a1a910b3...|                  []|
|ed571b13a83199c9c...|                  []|
|e17a1c14ffca94104...|                  []|
|7c3219ff9046172ce...|[2301235, 1089946...|
|8a1d54402eddebee0...|                  []|
|51656fa1c9a7e0412...|[1118827, 5923518...|
|bbcd9dae3160ddcb9...|  [7151534, 3463357]|
|0fe2197fba44d2f76...|           [2868588]|
|8d29b0f1541d8b46a...|          

In [25]:
#Ex 3.5

#Creating User profiles for Training data set
df_training_data_explode = df_training_data.selectExpr("user_hash_id","explode(training_data) AS training_data_paper_id")

#a- user profile using sampled users data over Tf-Idf vector

df_training_data_TfIdf = df_training_data_explode.join(df_rescaledCleanedData,df_training_data_explode.training_data_paper_id == df_rescaledCleanedData.paper_id, how="inner").select(df_training_data_explode.user_hash_id,df_training_data_explode.training_data_paper_id,df_rescaledCleanedData.tf_idf_vector)
df_training_data_userprofile_tfidf = df_training_data_TfIdf.selectExpr("user_hash_id","tf_idf_vector").rdd.reduceByKey(lambda x,y: addSparseVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_tf_idf_vector")

In [26]:
df_training_data_userprofile_tfidf.take(1)

[Row(user_hash_id='f0fe2969ce44deace9595fabd7c12bdf', sum_tf_idf_vector=SparseVector(1000, {1: 9.3148, 2: 23.2806, 8: 4.6505, 9: 9.3009, 10: 9.2996, 11: 46.4919, 12: 4.6486, 13: 46.4668, 14: 18.5842, 15: 9.2896, 16: 23.2209, 17: 4.6379, 18: 4.6367, 20: 4.633, 21: 23.1557, 22: 9.261, 23: 4.6299, 24: 18.5196, 27: 4.6281, 28: 4.6268, 30: 46.2438, 31: 9.2475, 32: 9.2378, 33: 18.4585, 34: 18.4512, 35: 4.6122, 37: 9.2171, 38: 4.6086, 40: 46.0556, 41: 4.6044, 42: 9.2087, 44: 4.602, 45: 41.4069, 47: 18.3839, 48: 4.596, 49: 50.5297, 50: 9.1825, 51: 4.5912, 53: 4.5889, 54: 18.3531, 55: 18.3413, 56: 59.5786, 57: 9.1601, 58: 9.1589, 59: 13.7348, 60: 13.7331, 61: 9.1519, 62: 9.1495, 64: 4.5719, 65: 9.1414, 66: 13.7069, 68: 13.693, 69: 4.562, 71: 4.5574, 72: 18.2228, 73: 18.2137, 76: 4.5517, 78: 22.75, 80: 4.5494, 81: 4.5489, 82: 31.8381, 83: 4.5477, 85: 54.5524, 87: 4.5415, 88: 27.2491, 89: 9.0819, 92: 9.0796, 95: 4.5353, 96: 4.5331, 99: 13.5959, 102: 31.6692, 103: 4.5236, 104: 9.0472, 105: 9.045, 

In [27]:
#b)- User profile using sampled users data over LDA vector

df_training_data_LDA = df_training_data_explode.join(df_lda_paper_topic_model,df_training_data_explode.training_data_paper_id == df_lda_paper_topic_model.id, how="inner").select(df_training_data_explode.user_hash_id,df_training_data_explode.training_data_paper_id,df_lda_paper_topic_model.topicDistribution)
def addLDAPaperTopicVectors(v1,v2):
    return (v1+v2)
    
df_training_data_userprofile_lda = df_training_data_LDA.selectExpr("user_hash_id","topicDistribution").rdd.reduceByKey(lambda x,y: addLDAPaperTopicVectors(x,y)).toDF().selectExpr("_1 AS user_hash_id","_2 AS sum_lda_vector")

In [28]:
df_training_data_userprofile_lda.take(1)

[Row(user_hash_id='f0fe2969ce44deace9595fabd7c12bdf', sum_lda_vector=DenseVector([8.2345, 8.2489, 5.7265, 4.4824, 3.654, 2.924, 3.2137, 12.2184, 2.3684, 8.9382, 9.0248, 1.6631, 23.8189, 5.6961, 1.843, 17.5763, 2.4335, 7.6437, 5.256, 3.4809, 28.3198, 5.2124, 3.634, 3.107, 8.8775, 35.1362, 2.7627, 4.9445, 5.115, 2.829, 20.41, 26.3352, 3.1768, 4.9443, 2.825, 6.6206, 12.1022, 1.5842, 24.7411, 3.8774]))]