In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
import csv
from collections import Counter
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import countDistinct
import time

In [2]:
# Create (or reuse) the SparkSession first and take the SparkContext from it.
# Calling SparkContext() directly raises when this cell is re-run on a live
# kernel, and a context that already exists keeps its own application name,
# so appName('Experiment1') would not take effect if the context came first.
spark = SparkSession.builder.appName('Experiment1').getOrCreate()
sc = spark.sparkContext

In [3]:
## Ex 1.2
# Wall-clock timer for this cell.
start_time = time.time()

#UserLibrary RDD
# Every line is split on ";" into a user hash and a comma-separated list of
# paper ids; the result is one (user_hash_id, [int paper_id, ...]) pair per line.
userLibraryRdd = (
    sc.textFile("Datasets/users_libraries.txt")
    .map(lambda rawLine: rawLine.split(";"))
    .map(lambda fields: (fields[0], [int(paperId) for paperId in fields[1].split(",")]))
)

#print("UserLibrary RDD Start...")
#userLibraryRdd.take(10)
#print("UserLibrary RDD End...")

#PaperCsv RDD
def processPaperCsv(line):
    """Extract the paper id and the abstract from one line of papers.csv.

    The line is parsed as a single CSV record (comma delimiter, minimal
    quoting) after NUL characters have been stripped from it.

    Args:
        line: One raw text line of the papers CSV file.

    Returns:
        A ``(paper_id, abstract)`` tuple of strings taken from columns 0 and 14.
        A record with fewer than 15 columns yields an empty abstract (and an
        empty id when the line itself is blank) instead of raising IndexError,
        so callers can drop it with the same "empty abstract" filter they
        already apply.
    """
    abstractIndex = 14
    # Strip NUL characters before handing the line to the csv module.
    cleanedLine = line.replace("\0", "")
    fields = next(csv.reader([cleanedLine], delimiter=',', quoting=csv.QUOTE_MINIMAL), [])
    if len(fields) <= abstractIndex:
        # Malformed or truncated record: there is no abstract column to read.
        return (fields[0] if fields else ""), ""
    return fields[0], fields[abstractIndex]

# Parse every CSV line, keep only the papers whose abstract is neither empty
# nor a single blank, and turn each record into
# (paper_id as int, [abstract tokens split on single spaces]).
paperCsvRdd = (
    sc.textFile("Datasets/papers.csv")
    .map(processPaperCsv)
    .filter(lambda paper: paper[1] not in ("", " "))
    .map(lambda paper: (int(paper[0]), paper[1].split(" ")))
)

#print("Paper Csv RDD Start...")
#paperCsvRdd.take(10)
#print("Paper Csv RDD End...")

#Stopword Broadcast
# The stop words are collected to the driver as a plain list (one entry per
# line of the file) and that list is what gets broadcast.
stopWords = sc.textFile("Datasets/stopwords_en.txt")
stopWordsBroadcast = sc.broadcast(stopWords.collect())

#print("Stop words Start...")
#stopWordsBroadcast.value
#print("Stop words End...")

# collect() above is the only RDD action in this cell; the other RDDs are
# merely defined, so the time reported here excludes reading and parsing them.
end_time = time.time()

print("Execution Time taken(in seconds) to create RDD's:", end_time-start_time)

Execution Time taken(in seconds) to create RDD's: 2.0260565280914307


In [4]:
##Ex1.3
# Start of the wall-clock measurement for this cell; the matching end_time is
# taken after the result has been written to disk.
start_time = time.time()

#Removing stopWords
def removeStopWords(wordList):
    """Return a new list holding ``wordList`` without stop words and blanks.

    A token is dropped when it appears in ``stopWordsBroadcast.value`` or is
    the empty string or a single space. The remaining tokens keep their
    original order and multiplicity, and ``wordList`` itself is not modified.

    Args:
        wordList: Iterable of word tokens (strings).

    Returns:
        A list of the tokens that are neither stop words nor blank.
    """
    # Build the lookup set once per call. Testing membership against the
    # broadcast list and calling list.remove() for every hit made the original
    # implementation quadratic in the number of words of a user.
    unwantedWords = set(stopWordsBroadcast.value)
    unwantedWords.update(("", " "))
    return [word for word in wordList if word not in unwantedWords]

# Build, for every user, the list of all abstract words of the papers in that
# user's library.
userLibraryJoinPaperRdd = (
    userLibraryRdd
    .flatMapValues(lambda paperIds: paperIds)                  # (user, paper_id)
    .map(lambda userPaper: (userPaper[1], userPaper[0]))       # (paper_id, user)
    .join(paperCsvRdd)                                         # (paper_id, (user, words))
    .map(lambda joined: (joined[1][0], joined[1][1]))          # (user, words)
    .flatMapValues(lambda words: words)                        # (user, word)
    .groupByKey()
    .mapValues(list)                                           # (user, [word, ...])
)
# Same pairs with stop words and blank tokens filtered out of each word list.
userLibraryJoinPaperWithoutStopWordsRdd = userLibraryJoinPaperRdd.mapValues(removeStopWords)

#Finding top 10 most frequent words
def findTopMostFrequentWords(x, topN=10):
    """Return the most frequent words of ``x``, most common first.

    Args:
        x: Iterable of hashable word tokens.
        topN: Maximum number of words to return. Defaults to 10, which is the
            value that was previously hard-coded.

    Returns:
        A list of at most ``topN`` distinct words ordered by descending
        frequency; the counts themselves are discarded.
    """
    return [word for word, _ in Counter(x).most_common(topN)]

# Reduce every user's stop-word-free word list to that user's most frequent words.
frequentlyOccuringWordList = userLibraryJoinPaperWithoutStopWordsRdd.mapValues(findTopMostFrequentWords)

#writing data to file
def CreateCsvLine(data):
    """Format one ``(user_hash_id, words)`` pair as a comma-separated line.

    The user id comes first, followed by a comma and then the words (each
    converted with ``str``) joined by commas. Words are not quoted or escaped.
    """
    userHashId, words = data[0], data[1]
    wordsPart = ",".join(map(str, words))
    return "".join([userHashId, ",", wordsPart])

# Render every (user, top words) pair as one text line.
frequentlyOccuringWordListFile = frequentlyOccuringWordList.map(CreateCsvLine)
# saveAsTextFile is the only action in this cell, so the join, the stop-word
# removal and the counting defined above all execute here.
# NOTE(review): the write is expected to fail when the output directory already
# exists, so re-running this cell presumably requires removing it first — confirm.
frequentlyOccuringWordListFile.saveAsTextFile("Outputs/Top10WordsForEachUser_RDD")

#print("Top 10 most frequent words of each user_hash_id Start...")
#frequentlyOccuringWordListFile.take(10)
#print("Top 10 most frequent words of each user_hash_id End...")

# Stop the timer only after the output has been written.
end_time = time.time()

print("Execution Time taken(in seconds) for Joining collections and writing to file:", end_time-start_time)

Execution Time taken(in seconds) for Joining collections and writing to file: 3594.6526041030884


In [5]:
#Ex1.4
start_time = time.time()

#a
# One (user_hash_id, paper_id) pair per library entry, i.e. per rating.
userLibrayFMVRdd = userLibraryRdd.flatMapValues(lambda paperIds: paperIds)

noOfDistinctUsers = userLibrayFMVRdd.keys().distinct().count()
noOfDistinctItems = userLibrayFMVRdd.values().distinct().count()
noOfRatings = userLibrayFMVRdd.values().count()

#b,c,d,e
# Library size of every user.
ratingsList = userLibraryRdd.map(lambda userEntry: len(userEntry[1]))

# min()/max() are used directly; sorting by library size and taking the first
# element would give the same values.
minNoOfRatingUserHasGiven = ratingsList.min()
maxNoOfRatingUserHasGiven = ratingsList.max()
avgNumberOfRatingUserGave = noOfRatings/noOfDistinctUsers
# RDD.stdev() is the population standard deviation.
standardDeviationOfRating = ratingsList.stdev()

#f,g,h,i
# Number of libraries each paper appears in: (paper_id, count).
userLibraryReduceByPaperIdRdd = (
    userLibrayFMVRdd
    .map(lambda userPaper: (userPaper[1], 1))
    .reduceByKey(lambda left, right: left + right)
)

ratingsListByPaperId = userLibraryReduceByPaperIdRdd.values()
minNoOfRatingItemHasReceived = ratingsListByPaperId.min()
maxNoOfRatingItemHasReceived = ratingsListByPaperId.max()
avgNumberOfRatingOfItems = noOfRatings/noOfDistinctItems
standardDeviationOfRItem = ratingsListByPaperId.stdev()

# Report: one "<label> <value>" line per statistic, in the fixed order below.
print("\n".join(
    f"{label} {value!s}"
    for label, value in (
        ("Number of (distinct) user:", noOfDistinctUsers),
        ("Number of (distinct) items:", noOfDistinctItems),
        ("Number of ratings:", noOfRatings),
        ("Min number of ratings a user has given:", minNoOfRatingUserHasGiven),
        ("Max number of ratings a user has given:", maxNoOfRatingUserHasGiven),
        ("Average number of ratings of users:", avgNumberOfRatingUserGave),
        ("Standard deviation for ratings of users:", standardDeviationOfRating),
        ("Min number of ratings an item has received:", minNoOfRatingItemHasReceived),
        ("Max number of ratings an item has received:", maxNoOfRatingItemHasReceived),
        ("Average number of ratings of items:", avgNumberOfRatingOfItems),
        ("Standard deviation for ratings of items:", standardDeviationOfRItem),
    )
))

end_time = time.time()

print("Execution Time taken(in seconds) for Basic Analysis for Recommender Systems:", end_time-start_time)

Number of (distinct) user: 28416
Number of (distinct) items: 172079
Number of ratings: 828481
Min number of ratings a user has given: 1
Max number of ratings a user has given: 1922
Average number of ratings of users: 29.155440596846848
Standard deviation for ratings of users: 81.1751761366871
Min number of ratings an item has received: 3
Max number of ratings an item has received: 924
Average number of ratings of items: 4.81453867119172
Standard deviation for ratings of items: 5.477802292314525
Execution Time taken(in seconds) for Basic Analysis for Recommender Systems: 9.11683988571167


In [6]:
#Ex1.5
start_time = time.time()

#UserLibrary dataframe
# Two string columns separated by ";"; the second one is split on "," into an
# array of paper ids (kept as strings).
userLibrarySchema = StructType([
    StructField(fieldName, StringType(), False)
    for fieldName in ("user_hash_id", "user_library")
])
df_userLibrary = (
    spark.read
    .csv("Datasets/users_libraries.txt", sep = ";", header = False, schema = userLibrarySchema)
    .selectExpr("user_hash_id","split(user_library,',') AS user_library")
)

#print("UserLibrary Dataframe Start...")
#df_userLibrary.take(10)
#print("UserLibrary Dataframe End...")

#PaperCsv dataframe
# Fifteen string columns, declared in file order; only paper_id and abstract
# are kept after loading.
papersCsvSchema = StructType([
    StructField(fieldName, StringType(), False)
    for fieldName in (
        "paper_id", "type", "journal", "book_title", "series",
        "publisher", "pages", "volume", "number", "year",
        "month", "postedat", "address", "title", "abstract",
    )
])
# Double quotes are stripped from the abstract before it is split on single
# spaces into an array of words; rows whose abstract is null are dropped.
df_paperCsv = (
    spark.read
    .csv("Datasets/papers.csv", sep = ",", header = False, schema = papersCsvSchema, quote = '"')
    .selectExpr("paper_id","split(replace(abstract,'\"',''),' ') AS abstract")
    .na.drop(subset=["abstract"])
)

#print("Paper Csv Dataframe Start...")
#df_paperCsv.take(10)
#print("Paper Csv Dataframe End...")

end_time = time.time()

print("Execution Time taken(in seconds) to create Dataframes:", end_time-start_time)

Execution Time taken(in seconds) to create Dataframes: 2.894200325012207


In [7]:
#Ex1.6
start_time = time.time()

#Explode UserLibrary
# One row per (user_hash_id, paper_id) library entry.
userLibraryExplode = df_userLibrary.select(df_userLibrary.user_hash_id,explode(df_userLibrary.user_library).alias("paper_id"))

#Join UserLibrary and PaperCsv
df_userLibraryJoinPaperCsv = df_paperCsv.join(userLibraryExplode,df_paperCsv.paper_id == userLibraryExplode.paper_id, how="inner").select(userLibraryExplode.user_hash_id,userLibraryExplode.paper_id,df_paperCsv.abstract)

#Removing stop words
# One row per (user_hash_id, word); the column keeps the name "abstract".
df_userLibraryJoinPaperCsv = df_userLibraryJoinPaperCsv.select(df_userLibraryJoinPaperCsv.user_hash_id,explode(df_userLibraryJoinPaperCsv.abstract).alias("abstract"))
useless_words = ['',' ','"']
df_userLibraryJoinPaperCsvWithoutStopWords = df_userLibraryJoinPaperCsv[~df_userLibraryJoinPaperCsv["abstract"].isin(stopWordsBroadcast.value)]
df_userLibraryJoinPaperCsvWithoutStopWords = df_userLibraryJoinPaperCsvWithoutStopWords[~df_userLibraryJoinPaperCsvWithoutStopWords["abstract"].isin(useless_words)]

#Finding top 10 most frequent words
df_userLibraryJoinPaperCsvWithoutStopWordsCount = df_userLibraryJoinPaperCsvWithoutStopWords.groupBy("user_hash_id","abstract").count().withColumnRenamed("count", "word_count")
# Order by descending count and break ties alphabetically so that the
# numbering below is deterministic.
userWords_window = Window.partitionBy(df_userLibraryJoinPaperCsvWithoutStopWordsCount.user_hash_id).orderBy(col("word_count").desc(), col("abstract").asc())
# row_number() instead of rank(): rank() assigns the same rank to words with
# equal counts, so a "rank < 11" filter could keep more than ten words per user.
df_userLibraryJoinPaperCsvWithoutStopWordsRank = df_userLibraryJoinPaperCsvWithoutStopWordsCount.withColumn("word_rank",row_number().over(userWords_window))
df_topFrequentWordsPerUser = df_userLibraryJoinPaperCsvWithoutStopWordsRank.filter(df_userLibraryJoinPaperCsvWithoutStopWordsRank["word_rank"]<=10)
# alias() names the aggregated column directly rather than renaming the
# generated "collect_list(abstract)" column afterwards.
df_groupedTop10FrequentWordsPerUser = df_topFrequentWordsPerUser.groupBy("user_hash_id").agg(collect_list("abstract").alias("abstract_word_list"))

#writing top 10 most frequent words of each user to file
# No format is given, so the session's default data source format is used.
# NOTE(review): with the default save mode the write is expected to fail if the
# target directory already exists, e.g. when this cell is re-run — confirm.
df_groupedTop10FrequentWordsPerUser.write.save("Outputs/Top10WordsForEachUser_DF")

#print("Top 10 most frequent words of each user_hash_id Start...")
#df_groupedTop10FrequentWordsPerUser.take(10)
#print("Top 10 most frequent words of each user_hash_id End...")

end_time = time.time()

print("Execution Time taken(in seconds) for Joining collections and writing to file", end_time-start_time)

Execution Time taken(in seconds) for Joining collections and writing to file 319.82534885406494


In [8]:
#Ex1.6
# NOTE(review): this label repeats the previous cell's. The cell computes the
# same statistics as the Ex1.4 RDD cell, this time with DataFrames, so it is
# presumably the next exercise — confirm.
start_time = time.time()

#a
noOfDistinctUsers_df = str(userLibraryExplode.select(countDistinct("user_hash_id")).collect()[0][0])
noOfDistinctItems_df = str(userLibraryExplode.select(countDistinct("paper_id")).collect()[0][0])
noOfRatings_df = userLibraryExplode.count()

#b,c,d,e
# Library size per user, then summarised with describe(): one row per
# statistic, with the statistic's name in the "summary" column.
ratingsList_df = userLibraryExplode.groupBy("user_hash_id").count().withColumnRenamed("count","no_of_items")
ratingsList_df = ratingsList_df.describe("no_of_items")

# Every result of this cell carries the _df suffix. Four of them used to be
# stored under the un-suffixed names of the RDD cell and silently replaced its
# numeric results with strings.
minNoOfRatingUserHasGiven_df = str(ratingsList_df.filter("summary == 'min'").collect()[0][1])
maxNoOfRatingUserHasGiven_df = str(ratingsList_df.filter("summary == 'max'").collect()[0][1])
avgNumberOfRatingUserGave_df = int(noOfRatings_df)/int(noOfDistinctUsers_df)
# describe() reports the sample standard deviation, so this value differs
# slightly from the population figure returned by RDD.stdev().
standardDeviationOfRating_df = str(ratingsList_df.filter("summary == 'stddev'").collect()[0][1])

#f,g,h,i
# Number of libraries each paper appears in, summarised the same way.
ratingsListByPaperId_df = userLibraryExplode.groupBy("paper_id").count().withColumnRenamed("count","no_of_ratings")
ratingsListByPaperId_df = ratingsListByPaperId_df.describe("no_of_ratings")

minNoOfRatingItemHasReceived_df = str(ratingsListByPaperId_df.filter("summary == 'min'").collect()[0][1])
maxNoOfRatingItemHasReceived_df = str(ratingsListByPaperId_df.filter("summary == 'max'").collect()[0][1])
avgNumberOfRatingOfItems_df = int(noOfRatings_df)/int(noOfDistinctItems_df)
standardDeviationOfRItem_df = str(ratingsListByPaperId_df.filter("summary == 'stddev'").collect()[0][1])

print("Number of (distinct) user:" ,noOfDistinctUsers_df)
print("Number of (distinct) items:" ,noOfDistinctItems_df)
print("Number of ratings:" ,noOfRatings_df)
print("Min number of ratings a user has given:",minNoOfRatingUserHasGiven_df)
print("Max number of ratings a user has given:",maxNoOfRatingUserHasGiven_df)
print("Average number of ratings of users:",avgNumberOfRatingUserGave_df)
print("Standard deviation for ratings of users:",standardDeviationOfRating_df)
print("Min number of ratings an item has received:",minNoOfRatingItemHasReceived_df)
print("Max number of ratings an item has received:",maxNoOfRatingItemHasReceived_df)
print("Average number of ratings of items:",avgNumberOfRatingOfItems_df)
print("Standard deviation for ratings of items:",standardDeviationOfRItem_df)

end_time = time.time()

print("Execution Time taken(in seconds) for Basic Analysis for Recommender Systems:", end_time-start_time)

Number of (distinct) user: 28416
Number of (distinct) items: 172079
Number of ratings: 828481
Min number of ratings a user has given: 1
Max number of ratings a user has given: 1922
Average number of ratings of users: 29.155440596846848
Standard deviation for ratings of users: 81.17660451011605
Min number of ratings an item has received: 3
Max number of ratings an item has received: 924
Average number of ratings of items: 4.81453867119172
Standard deviation for ratings of items: 5.477818208917284
Execution Time taken(in seconds) for Basic Analysis for Recommender Systems: 15.048475742340088
