In [1]:
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkFiles
import pandas as pd
from pyspark.sql.functions import mean

In [2]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

Load ratings

In [3]:
from pyspark import SparkFiles
spark = init_spark()
sp = None
import pandas as pd
chunksize = 100000

sp = spark.read.csv("ratings.csv", header=True)



#for chunk in pd.read_csv("ratings.csv", chunksize=chunksize):
#    sp = spark.createDataFrame(chunk)
#    break

In [4]:
print("number of rating rows:",sp.count())

number of rating rows: 5976479


In [5]:
ratings = sp.groupBy('rating').count()
ratings.show()

+------+-------+
|rating|  count|
+------+-------+
|     3|1370916|
|     5|1983093|
|     1| 124195|
|     4|2139018|
|     2| 359257|
+------+-------+



get unique users

In [6]:
users = sp.select('user_id').distinct().collect()
print("Number of Users for considered ratings: ",len(users))

Number of Users for considered ratings:  53424


get books info

In [7]:
bk = spark.read.csv("books.csv", header=True)
print("Total Number of Books",bk.count())
bk1 = bk.select('book_id','authors','original_publication_year','original_title','language_code')
bk2 = bk.select('average_rating','ratings_count','ratings_1','ratings_2','ratings_3','ratings_4','ratings_5')
bk1.show(1)
bk2.show(1)
#Need for trim null data

Total Number of Books 10000
+-------+---------------+-------------------------+----------------+-------------+
|book_id|        authors|original_publication_year|  original_title|language_code|
+-------+---------------+-------------------------+----------------+-------------+
|      1|Suzanne Collins|                   2008.0|The Hunger Games|          eng|
+-------+---------------+-------------------------+----------------+-------------+
only showing top 1 row

+--------------+-------------+---------+---------+---------+---------+---------+
|average_rating|ratings_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|
+--------------+-------------+---------+---------+---------+---------+---------+
|          4.34|      4780653|    66715|   127936|   560092|  1481305|  2706317|
+--------------+-------------+---------+---------+---------+---------+---------+
only showing top 1 row



get unique books from ratings

In [8]:
books = sp.select('book_id').distinct().collect()
print("Number of Books for considered ratings: ",len(books))

Number of Books for considered ratings:  10000


In [9]:
tags = spark.read.csv("book_tags.csv", header=True)
tags = tags.orderBy('count', ascending =False)
print("number of book-tag pairs: ",tags.count())
tags.show(5)
tags = tags[tags['count']>=10]
print("number of book-tag pairs with more than 10 votes: ",tags.count())


tag = tags.groupBy('tag_id').count()
print("Number of tags: ",tag.count())
tag = tag.orderBy('count',ascending=False)
print("Most used tags")
tag.show(5)
tag = tag[tag['count']>=500]
print("Tags that has been used more than 100 times: (Ignore the other tags)")
print(tag.count())
#print(tag.select(mean("count")).collect())

number of book-tag pairs:  999912
+-----------------+------+-----+
|goodreads_book_id|tag_id|count|
+-----------------+------+-----+
|           109515| 30574| 9998|
|           153136| 30574| 9995|
|         18584855| 30574|99921|
|         13536860|  1642|  999|
|         24480276|  8717|  999|
+-----------------+------+-----+
only showing top 5 rows

number of book-tag pairs with more than 10 votes:  650077
Number of tags:  13883
Most used tags
+------+-----+
|tag_id|count|
+------+-----+
| 30574| 9971|
|  8717| 9689|
| 11557| 9644|
| 22743| 9453|
|  5207| 9134|
+------+-----+
only showing top 5 rows

Tags that has been used more than 100 times: (Ignore the other tags)
235


In [10]:
bk_merge = bk.join(sp, on=['book_id'], how='inner')

In [11]:
bk_ = bk_merge.select('book_id','authors','original_publication_year','original_title','language_code','average_rating',
                           'ratings_count','ratings_1','ratings_2','ratings_3','ratings_4','ratings_5')
print(bk_.count())
    
bk_ = bk_.distinct()
print("book_id intersection:",bk_.count())
print("We have ratings data for all 10 000 books")

5976479
book_id intersection: 10000
We have ratings data for all 10 000 books


In [12]:
year = bk_.groupBy('original_publication_year').count()
year = year.orderBy('original_publication_year')
print("Oldest books;")
year.show(10)
year = year.orderBy('original_publication_year',ascending=False)
print("Newest books;")
year.show(10)

year = year.orderBy('count',ascending = False)
year.show(10)
print("Unique book publication years:", year.count())

Oldest books;
+-------------------------+-----+
|original_publication_year|count|
+-------------------------+-----+
|                     null|   21|
|                    -17.0|    1|
|                  -1750.0|    1|
|                   -300.0|    1|
|                   -330.0|    1|
|                   -335.0|    1|
|                   -350.0|    2|
|                   -380.0|    1|
|                   -385.0|    2|
|                   -390.0|    1|
+-------------------------+-----+
only showing top 10 rows

Newest books;
+-------------------------+-----+
|original_publication_year|count|
+-------------------------+-----+
|                    975.0|    1|
|                    800.0|    1|
|                      8.0|    1|
|                    609.0|    1|
|                    397.0|    1|
|                   2017.0|   11|
|                   2016.0|  198|
|                   2015.0|  306|
|                   2014.0|  437|
|                   2013.0|  518|
+-------------------------+-

Books dataset needs to be cleaned..

In [13]:
train,test,validate = sp.randomSplit([0.8, 0.1, 0.1],2)
print("Train dataset size: ",train.count())
print("Test dataset size: ",test.count())
print("validation dataset size: ",validate.count())
train.show(10)
test.show(10)

Train dataset size:  4780407
Test dataset size:  598004
validation dataset size:  598068
+-------+-------+------+
|user_id|book_id|rating|
+-------+-------+------+
|      1|     10|     4|
|      1|    103|     3|
|      1|   1041|     5|
|      1|     11|     5|
|      1|    119|     3|
|      1|     13|     4|
|      1|    136|     5|
|      1|    138|     2|
|      1|    148|     3|
|      1|    150|     3|
+-------+-------+------+
only showing top 10 rows

+-------+-------+------+
|user_id|book_id|rating|
+-------+-------+------+
|      1|    111|     3|
|      1|   2002|     5|
|      1|   4614|     1|
|      1|    492|     2|
|      1|     70|     5|
|     10|   3638|     3|
|     10|   4363|     4|
|     10|     63|     4|
|    100|     10|     3|
|    100|     11|     3|
+-------+-------+------+
only showing top 10 rows

