In [20]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
import csv
from collections import Counter
from pyspark.sql.types import *

In [2]:
sc = SparkContext()
spark = SparkSession.builder.appName('Experiment1').getOrCreate()

In [3]:
userLibraryRdd = sc.textFile("Datasets/users_libraries.txt")
userLibraryRdd = userLibraryRdd.map(lambda line: line.split(";")).map(lambda line: (line[0],list(map(int,line[1].split(",")))))

In [4]:
def processPaperCsv(line):
    paperCsv = csv.reader([line.replace("\0", "")], delimiter=',', quoting=csv.QUOTE_MINIMAL)
    paperCsvList = next(paperCsv)
    return paperCsvList[0], paperCsvList[14]

paperCsvRdd = sc.textFile("Datasets/papers.csv")
paperCsvRdd = paperCsvRdd.map(processPaperCsv).filter(lambda x: (x[1] != "" and x[1] != " ")).map(lambda x: (int(x[0]),x[1].split(" ")))

In [5]:
stopWords = sc.textFile("Datasets/stopwords_en.txt")
stopWordsBroadcast = sc.broadcast(stopWords.collect())

In [6]:
def removeStopWords(wordList):
    abstractwordsList = wordList.copy()
    for a in wordList:
        if ((a in stopWordsBroadcast.value) or a == "" or a == " "):
            abstractwordsList.remove(a)
    return abstractwordsList

userLibraryJoinPaperRdd = userLibraryRdd.flatMapValues(lambda x: x).map(lambda x: (x[1],x[0])).join(paperCsvRdd)
userLibraryJoinPaperRdd = userLibraryJoinPaperRdd.map(lambda x: (x[1][0],x[1][1]))
userLibraryJoinPaperRdd = userLibraryJoinPaperRdd.flatMapValues(lambda x:x).groupByKey().mapValues(list)
userLibraryJoinPaperWithoutStopWordsRdd = userLibraryJoinPaperRdd.mapValues(removeStopWords)

In [7]:
def findTopMostFrequentWords(x):
    CounterList = Counter(x)
    topTenMostFrequentWordWithCount = CounterList.most_common(10)
    topTenMostFrequentWord = [word for word, word_count in topTenMostFrequentWordWithCount]
    return topTenMostFrequentWord

frequentlyOccuringWordList = userLibraryJoinPaperWithoutStopWordsRdd.mapValues(findTopMostFrequentWords)

In [8]:
#frequentlyOccuringWordList.take(1)

In [9]:
def CreateCsvLine(data):
    csvLineData = data[0] + "," + (','.join(str(d) for d in data[1]))
    return csvLineData

frequentlyOccuringWordListFile = frequentlyOccuringWordList.map(CreateCsvLine)
frequentlyOccuringWordListFile.saveAsTextFile("Datasets/Top10MostFrequentWordsForEachUser")

In [10]:
#frequentlyOccuringWordListFile.take(4)

In [11]:
#Ex1.4

userLibrayFMVRdd =userLibraryRdd.flatMapValues(lambda x:x)

#a
noOfDistinctUsers = userLibrayFMVRdd.keys().distinct().count()
noOfDistinctItems = userLibrayFMVRdd.values().distinct().count()
noOfRatings = userLibrayFMVRdd.values().count()

print("Number of (distinct) user:" ,noOfDistinctUsers)
print("Number of (distinct) items:" ,noOfDistinctItems)
print("Number of ratings:" ,noOfRatings)

Number of (distinct) user: 28416
Number of (distinct) items: 172079
Number of ratings: 828481


In [12]:
#b,c,d,e

ratingsList = userLibraryRdd.map(lambda x: (x[0],len(x[1]))).map(lambda x: x[1])
minNoOfRatingUserHasGiven = ratingsList.min()
#minNoOfRatingUserHasGiven = userLibraryRdd.map(lambda x: (x[0],len(x[1]))).sortBy(lambda x: x[1], ascending=True).map(lambda x: x[1]).first()
maxNoOfRatingUserHasGiven = ratingsList.max()
#maxNoOfRatingUserHasGiven = userLibraryRdd.map(lambda x: (x[0],len(x[1]))).sortBy(lambda x: x[1], ascending=False).map(lambda x: x[1]).first()
avgNumberOfRatingUserGave = noOfRatings/noOfDistinctUsers
standardDeviationOfRating = ratingsList.stdev()

In [13]:
print("Min number of ratings a user has given:",minNoOfRatingUserHasGiven)
print("Max number of ratings a user has given:",maxNoOfRatingUserHasGiven)
print("Average number of ratings of users:",avgNumberOfRatingUserGave)
print("Standard deviation for ratings of users:",standardDeviationOfRating)

Min number of ratings a user has given: 1
Max number of ratings a user has given: 1922
Average number of ratings of users: 29.155440596846848
Standard deviation for ratings of users: 81.1751761366871


In [14]:
userLibraryReduceByPaperIdRdd = userLibrayFMVRdd.map(lambda x: (x[1],1)).reduceByKey(lambda x,y: x+y)

In [15]:
#f,g,h,i

ratingsListByPaperId = userLibraryReduceByPaperIdRdd.map(lambda x: x[1])
minNoOfRatingItemHasReceived = ratingsListByPaperId.min()
maxNoOfRatingItemHasReceived = ratingsListByPaperId.max()
avgNumberOfRatingOfItems = noOfRatings/noOfDistinctItems
standardDeviationOfRItem = ratingsListByPaperId.stdev()

In [16]:
print("Min number of ratings an item has received:",minNoOfRatingItemHasReceived)
print("Max number of ratings an item has received:",maxNoOfRatingItemHasReceived)
print("Average number of ratings of items:",avgNumberOfRatingOfItems)
print("Standard deviation for ratings of items:",standardDeviationOfRItem)

Min number of ratings an item has received: 3
Max number of ratings an item has received: 924
Average number of ratings of items: 4.81453867119172
Standard deviation for ratings of items: 5.477802292314525


In [26]:
userLibrarySchema = StructType([
    StructField("user_hash_id",StringType(),False),
    StructField("user_library",StringType(),False)
])
df_userLibrary = spark.read.csv("Datasets/users_libraries.txt", sep = ";", header = False, schema = userLibrarySchema)
df_userLibrary = df_userLibrary.selectExpr("user_hash_id","split(user_library,',') AS user_library")

In [27]:
df_userLibrary.show()

+--------------------+--------------------+
|        user_hash_id|        user_library|
+--------------------+--------------------+
|28d3f81251d94b097...|[3929762, 503574,...|
|d0c9aaa788153daea...|[2080631, 6343346...|
|f05bcffe7951de9e5...|[1158654, 478707,...|
|ca4f1ba4094011d9a...|            [278019]|
|d1d41a15201915503...|[6610569, 6493797...|
|f2f77383828ea6d39...|[943458, 238121, ...|
|9c883d02115400f7b...|[3509971, 3509965...|
|b656009a6efdc8b1a...|[771870, 181369, ...|
|cf9c7f356092c34be...|             [90558]|
|0f5cbb39410a9278f...|           [9344598]|
|d85f7d83f27b3f533...|[7610843, 3633347...|
|586c867a0688250ac...|[464760, 466011, ...|
|10fdfaf945d5c27ad...|           [2010550]|
|589b870a611c25fa9...|[1283233, 1305474...|
|90f1a3e6fcdbf9bc5...|[115945, 11733005...|
|7e070a9da96672e05...|           [1071959]|
|3b715ebaf1f8f81a1...|[4119394, 3378798...|
|488fb15e8c77f8054...|[1523301, 5281566...|
|3fdf355e59949c79d...|[7077220, 1289842...|
|c6b59086a0bbac141...|[2230995, 

In [38]:
papersCsvSchema = StructType([
    StructField("paper_id",StringType(),False),
    StructField("type",StringType(),False),
    StructField("journal",StringType(),False),
    StructField("book_title",StringType(),False),
    StructField("series",StringType(),False),
    StructField("publisher",StringType(),False),
    StructField("pages",StringType(),False),
    StructField("volume",StringType(),False),
    StructField("number",StringType(),False),
    StructField("year",StringType(),False),
    StructField("month",StringType(),False),
    StructField("postedat",StringType(),False),
    StructField("address",StringType(),False),
    StructField("title",StringType(),False),
    StructField("abstract",StringType(),False),
])
df_paperCsv = spark.read.csv("Datasets/papers.csv", sep = ",", header = False, schema = papersCsvSchema, quote = '"')
df_paperCsv = df_paperCsv.selectExpr("paper_id","split(replace(abstract,'\"',''),' ') AS abstract")
df_paperCsv = df_paperCsv.na.drop(subset=["abstract"])

In [42]:
df_paperCsv.explode("abstract")

AttributeError: 'DataFrame' object has no attribute 'explode'

In [40]:
df_paperCsv.filter("abstract")

AttributeError: 'DataFrame' object has no attribute 'map'