In [36]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
import csv
from collections import Counter
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [2]:
# Build (or reuse) the session first and take the context from it.
# A bare SparkContext() raises if a context is already running (for example when
# this cell is executed a second time), and an app name given to the builder is
# not applied to a context that was created beforehand.
spark = SparkSession.builder.appName('Experiment1').getOrCreate()
sc = spark.sparkContext

In [3]:
# Each input line has the form "<user_hash_id>;<id>,<id>,...".
# Result: (user_hash_id, [paper ids as ints]) per line.
userLibraryRdd = (
    sc.textFile("Datasets/users_libraries.txt")
    .map(lambda rawLine: rawLine.split(";"))
    .map(lambda fields: (fields[0], [int(paperId) for paperId in fields[1].split(",")]))
)

In [4]:
def processPaperCsv(line):
    """Parse one CSV line of the papers file.

    NUL characters are stripped before parsing. The line is then read as a
    single comma-separated, minimally quoted CSV record.

    Returns:
        tuple of (field 0, field 14) of the record, both as strings.
    """
    cleanedLine = line.replace("\0", "")
    fields = next(csv.reader([cleanedLine], delimiter=',', quoting=csv.QUOTE_MINIMAL))
    paperId, abstract = fields[0], fields[14]
    return paperId, abstract

# (paper_id as int, list of abstract tokens) for every paper whose abstract is
# neither empty nor a single space.
paperCsvRdd = (
    sc.textFile("Datasets/papers.csv")
    .map(processPaperCsv)
    .filter(lambda paper: paper[1] not in ("", " "))
    .map(lambda paper: (int(paper[0]), paper[1].split(" ")))
)

In [5]:
# Read the stop-word file (each line becomes one element), collect it on the
# driver and broadcast the resulting list for use inside RDD functions.
stopWords = sc.textFile("Datasets/stopwords_en.txt")
stopWordList = stopWords.collect()
stopWordsBroadcast = sc.broadcast(stopWordList)

In [6]:
def removeStopWords(wordList, stopWords=None):
    """Return a new list with stop words and blank tokens removed.

    Args:
        wordList: iterable of word strings; it is not modified.
        stopWords: optional collection of words to drop. When omitted, the
            contents of ``stopWordsBroadcast`` are used.

    Returns:
        list of the remaining words in their original order, duplicates kept.
        The tokens "" and " " are always dropped.
    """
    if stopWords is None:
        stopWords = stopWordsBroadcast.value
    # Build a set once per call: membership tests are then constant time, whereas
    # testing against the broadcast list scans it for every word.
    excluded = set(stopWords)
    excluded.update(("", " "))
    # A single filtering pass replaces the repeated list.remove() calls, each of
    # which rescanned the copied list.
    return [word for word in wordList if word not in excluded]

# Build (user_hash_id, [all abstract tokens of the papers in that user's library]):
#   1. one (paper_id, user_hash_id) pair per library entry,
#   2. join with the abstracts on paper_id and keep the (user, tokens) part,
#   3. flatten to one token per record and regroup the tokens per user.
userLibraryJoinPaperRdd = (
    userLibraryRdd
    .flatMapValues(lambda paperIds: paperIds)
    .map(lambda userAndPaper: (userAndPaper[1], userAndPaper[0]))
    .join(paperCsvRdd)
    .map(lambda joined: (joined[1][0], joined[1][1]))
    .flatMapValues(lambda tokens: tokens)
    .groupByKey()
    .mapValues(list)
)

# Same records with stop words and blank tokens removed from each user's list.
userLibraryJoinPaperWithoutStopWordsRdd = userLibraryJoinPaperRdd.mapValues(removeStopWords)

In [7]:
def findTopMostFrequentWords(x, topN=10):
    """Return the most frequent words of ``x``, most frequent first.

    Args:
        x: iterable of hashable words.
        topN: maximum number of words to return (default 10).

    Returns:
        list of at most ``topN`` distinct words ordered by descending count.
        Words with equal counts keep the order in which they first appear.
    """
    wordCounts = Counter(x)
    return [word for word, _ in wordCounts.most_common(topN)]

# (user_hash_id, most frequent words) for every user.
frequentlyOccuringWordList = userLibraryJoinPaperWithoutStopWordsRdd.mapValues(
    findTopMostFrequentWords
)

In [None]:
#frequentlyOccuringWordList.take(1)

In [None]:
def CreateCsvLine(data):
    """Format one ``(user_id, words)`` record as a single CSV line.

    Fields are written with the ``csv`` module, so a word that contains a comma
    or a double quote is quoted instead of silently splitting into extra
    columns. Plain words are written exactly as before.

    Args:
        data: pair of (user id, iterable of words); each word is converted
            with ``str``.

    Returns:
        the CSV line without a trailing line terminator. A record with no
        words yields just the user id.
    """
    import io  # local import: io is not among the notebook's top-level imports

    buffer = io.StringIO()
    # No line terminator here: the caller writes one returned string per line.
    writer = csv.writer(buffer, lineterminator="")
    writer.writerow([data[0], *(str(d) for d in data[1])])
    return buffer.getvalue()

# Turn every (user, words) record into a CSV line, then write the lines as text
# files under the output directory.
frequentlyOccuringWordListFile = frequentlyOccuringWordList.map(CreateCsvLine)
frequentlyOccuringWordListFile.saveAsTextFile(
    "Datasets/Top10WordsForEachUser_RDD"
)

In [None]:
#frequentlyOccuringWordListFile.take(4)

In [8]:
#Ex1.4

# One (user_hash_id, paper_id) record per library entry, i.e. per rating.
userLibrayFMVRdd = userLibraryRdd.flatMapValues(lambda paperIds: paperIds)

#a
noOfDistinctUsers = userLibrayFMVRdd.keys().distinct().count()
noOfDistinctItems = userLibrayFMVRdd.values().distinct().count()
# Every record is one rating, so the record count is the number of ratings.
noOfRatings = userLibrayFMVRdd.count()

for label, value in (
    ("Number of (distinct) user:", noOfDistinctUsers),
    ("Number of (distinct) items:", noOfDistinctItems),
    ("Number of ratings:", noOfRatings),
):
    print(label, value)

Number of (distinct) user: 28416
Number of (distinct) items: 172079
Number of ratings: 828481


In [9]:
#b,c,d,e

# Number of ratings (library size) of each user record.
ratingsList = userLibraryRdd.map(lambda userLibrary: len(userLibrary[1]))

minNoOfRatingUserHasGiven = ratingsList.min()
maxNoOfRatingUserHasGiven = ratingsList.max()
# The average reuses the totals computed in part (a).
avgNumberOfRatingUserGave = noOfRatings/noOfDistinctUsers
# RDD.stdev() is the population standard deviation (sampleStdev() is the sample one).
standardDeviationOfRating = ratingsList.stdev()

In [10]:
# Report the per-user rating statistics, one "label value" line each.
userRatingSummary = (
    ("Min number of ratings a user has given:", minNoOfRatingUserHasGiven),
    ("Max number of ratings a user has given:", maxNoOfRatingUserHasGiven),
    ("Average number of ratings of users:", avgNumberOfRatingUserGave),
    ("Standard deviation for ratings of users:", standardDeviationOfRating),
)
for label, value in userRatingSummary:
    print(label, value)

Min number of ratings a user has given: 1
Max number of ratings a user has given: 1922
Average number of ratings of users: 29.155440596846848
Standard deviation for ratings of users: 81.1751761366871


In [11]:
# (paper_id, number of user libraries containing that paper).
userLibraryReduceByPaperIdRdd = (
    userLibrayFMVRdd
    .map(lambda userAndPaper: (userAndPaper[1], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [12]:
#f,g,h,i

# Number of ratings received by each paper.
ratingsListByPaperId = userLibraryReduceByPaperIdRdd.map(lambda paperAndCount: paperAndCount[1])

minNoOfRatingItemHasReceived = ratingsListByPaperId.min()
maxNoOfRatingItemHasReceived = ratingsListByPaperId.max()
# The average reuses the totals computed in part (a).
avgNumberOfRatingOfItems = noOfRatings/noOfDistinctItems
# RDD.stdev() is the population standard deviation.
standardDeviationOfRItem = ratingsListByPaperId.stdev()

In [13]:
# Report the per-item rating statistics, one "label value" line each.
itemRatingSummary = (
    ("Min number of ratings an item has received:", minNoOfRatingItemHasReceived),
    ("Max number of ratings an item has received:", maxNoOfRatingItemHasReceived),
    ("Average number of ratings of items:", avgNumberOfRatingOfItems),
    ("Standard deviation for ratings of items:", standardDeviationOfRItem),
)
for label, value in itemRatingSummary:
    print(label, value)

Min number of ratings an item has received: 3
Max number of ratings an item has received: 924
Average number of ratings of items: 4.81453867119172
Standard deviation for ratings of items: 5.477802292314525


In [14]:
# Both columns of the ";"-separated file are read as strings.
userLibrarySchema = StructType([
    StructField(fieldName, StringType(), False)
    for fieldName in ("user_hash_id", "user_library")
])

# Read the file, then turn the comma-separated library string into an array column.
df_userLibrary = (
    spark.read.csv(
        "Datasets/users_libraries.txt",
        sep=";",
        header=False,
        schema=userLibrarySchema,
    )
    .selectExpr("user_hash_id", "split(user_library,',') AS user_library")
)

In [15]:
#df_userLibrary.show()

In [16]:
# Column names of papers.csv in file order; every column is read as a string.
papersCsvColumns = [
    "paper_id", "type", "journal", "book_title", "series",
    "publisher", "pages", "volume", "number", "year",
    "month", "postedat", "address", "title", "abstract",
]
papersCsvSchema = StructType([
    StructField(columnName, StringType(), False) for columnName in papersCsvColumns
])

# Keep only the id and the abstract; double quotes are removed from the abstract
# before it is split on spaces, and rows without an abstract are dropped.
df_paperCsv = (
    spark.read.csv(
        "Datasets/papers.csv",
        sep=",",
        header=False,
        schema=papersCsvSchema,
        quote='"',
    )
    .selectExpr("paper_id", "split(replace(abstract,'\"',''),' ') AS abstract")
    .na.drop(subset=["abstract"])
)

In [17]:
#df_paperCsv.show()

In [18]:
# One (user_hash_id, paper_id) row per library entry.
userLibraryExplode = df_userLibrary.select(
    df_userLibrary.user_hash_id,
    explode(df_userLibrary.user_library).alias("paper_id"),
)

# Attach each paper's abstract tokens to the users who have that paper.
paperIdsMatch = df_paperCsv.paper_id == userLibraryExplode.paper_id
df_userLibraryJoinPaperCsv = (
    df_paperCsv
    .join(userLibraryExplode, paperIdsMatch, how="inner")
    .select(
        userLibraryExplode.user_hash_id,
        userLibraryExplode.paper_id,
        df_paperCsv.abstract,
    )
)


In [19]:
# Flatten the abstract arrays: one (user_hash_id, word) row per token.
# The column keeps the name "abstract" although it now holds a single word.
df_userLibraryJoinPaperCsv = df_userLibraryJoinPaperCsv.select(
    df_userLibraryJoinPaperCsv.user_hash_id,
    explode(df_userLibraryJoinPaperCsv.abstract).alias("abstract"),
)

In [72]:
# Tokens that carry no content and are dropped in addition to the stop words.
useless_words = ['',' ','"']

# Keep only rows whose word is neither a stop word nor one of the useless tokens.
abstractWord = col("abstract")
df_userLibraryJoinPaperCsvWithoutStopWords = (
    df_userLibraryJoinPaperCsv
    .filter(~abstractWord.isin(stopWordsBroadcast.value))
    .filter(~abstractWord.isin(useless_words))
)

In [73]:
# Occurrences of each word per user, stored in column "word_count".
df_userLibraryJoinPaperCsvWithoutStopWordsCount = (
    df_userLibraryJoinPaperCsvWithoutStopWords
    .groupBy("user_hash_id", "abstract")
    .count()
    .withColumnRenamed("count", "word_count")
)

In [74]:
# Per-user ordering of words, most frequent first. The word itself is a secondary
# sort key: each (user, word) pair occurs once in the counted frame, so no two
# rows of a user compare equal. Without it, words with the same count tie, which
# lets a ranking cut-off keep extra words or pick among them arbitrarily.
userWords_window = (
    Window
    .partitionBy(df_userLibraryJoinPaperCsvWithoutStopWordsCount.user_hash_id)
    .orderBy(col("word_count").desc(), col("abstract").asc())
)


In [75]:
# Number each user's words 1, 2, 3, ... in window order. row_number() gives every
# row a distinct position, whereas rank() gives words with equal counts the same
# rank, so a cut-off on the rank could keep more than the intended number of words.
df_userLibraryJoinPaperCsvWithoutStopWordsRank = (
    df_userLibraryJoinPaperCsvWithoutStopWordsCount
    .withColumn("word_rank", row_number().over(userWords_window))
)

In [76]:
# Keep the rows whose word_rank is 10 or lower.
df_topFrequentWordsPerUser = df_userLibraryJoinPaperCsvWithoutStopWordsRank.where(
    col("word_rank") <= 10
)

In [78]:
# One row per user with the retained words gathered into "abstract_word_list".
df_groupedTop10FrequentWordsPerUser = (
    df_topFrequentWordsPerUser
    .groupBy("user_hash_id")
    .agg(collect_list("abstract").alias("abstract_word_list"))
)

In [None]:
# Persist the per-user word lists using the writer's default format and save mode.
df_groupedTop10FrequentWordsPerUser.write.save(
    path="Datasets/Top10WordsForEachUser_DF"
)