# Dado el dataset de rating de goodreads (1.1GB) https://www.kaggle.com/bahramjannesarr/goodreads-book-datasets-10m determinar:

### 1.- Rating promedio de todos los libros

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = SparkSession\
    .builder\
    .appName("GoodReads")\
    .getOrCreate()

In [2]:
# Imports necesarios
from os import listdir
from os.path import isfile, join

# Obtén la ruta relativa de los archivos del dataset.
mypath="books/"
onlyfiles = [mypath+f for f in listdir(mypath) if isfile(join(mypath, f))]

# Crea un DF vacío o no podrá concatenar.
booksDF=spark.createDataFrame([], StructType([]))

# Lee cada csv uno a uno y concatenalo con nuestro dataframe.
# Es importante hacerlo así en lugar de leer directamente la carpeta porque nuestros archivos tienen las columnas desordenadas
# Y algunos archivos tienen incluso columnas extra, resultando en datos desordenados porque todos intentan usar el mismo esquema.
for i in onlyfiles:
    df_aux = (spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("quote", "\"")
    .option("escape","\"")
    .option("multiLine","true").load(i))
    booksDF=booksDF.unionByName(df_aux, allowMissingColumns=True)

# Las columnas tienen un orden un poco extraño pero no importa.

In [3]:
# Sólo para comprobar que todo está bien.
booksDF.limit(5).toPandas()

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3,Description,Count of text reviews
0,1,Harry Potter and the Half-Blood Prince (Harry ...,1:9896,652,4:556485,total:2298124,16,9,Scholastic Inc.,28062,2006,eng,J.K. Rowling,4.57,2:25317,5:1546466,,3:159960,,
1,2,Harry Potter and the Order of the Phoenix (Har...,1:12455,870,4:604283,total:2358637,1,9,Scholastic Inc.,29770,2004,eng,J.K. Rowling,4.5,2:37005,5:1493113,0439358078,3:211781,,
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,1:108202,309,4:1513191,total:6587388,1,11,Scholastic Inc,75911,2003,eng,J.K. Rowling,4.47,2:130310,5:4268227,,3:567458,,
3,4,Harry Potter and the Chamber of Secrets (Harry...,1:11896,352,4:706082,total:2560657,1,11,Scholastic,244,2003,eng,J.K. Rowling,4.42,2:49353,5:1504505,0439554896,3:288821,,
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,1:10128,435,4:630534,total:2610317,1,5,Scholastic Inc.,37093,2004,eng,J.K. Rowling,4.57,2:24849,5:1749958,043965548X,3:194848,,


In [4]:
# Calcular rating promedio de todos los libros (lo que se me pedía, vaya)

booksDF.select(F.avg(F.col("Rating"))).show()

+-----------------+
|      avg(Rating)|
+-----------------+
|2.894242694467414|
+-----------------+



### 2.- Rating promedio de los libros por autor

In [5]:
# No me pide que lo ordene pero por qué no
booksDF.groupBy(F.col("Authors")).agg(F.mean(F.col("Rating"))).orderBy(F.col("avg(Rating)").desc()).show(10,False)

+-------------------+-----------+
|Authors            |avg(Rating)|
+-------------------+-----------+
|Tom          Taylor|5.0        |
|Alexandra Fisher   |5.0        |
|C.F. Gutch         |5.0        |
|Sabrina Brancato   |5.0        |
|Phil West          |5.0        |
|Stephen G. Driggers|5.0        |
|Bernd Vlay         |5.0        |
|Jürgen Höller      |5.0        |
|Avrohom Barash     |5.0        |
|Barbara A. Ganim   |5.0        |
+-------------------+-----------+
only showing top 10 rows



### 3.- Rating promedio de los libros por Publisher

In [6]:
# No me pide que lo ordene pero por qué no
booksDF.groupBy(F.col("Publisher")).agg(F.mean(F.col("Rating"))).orderBy(F.col("avg(Rating)").desc()).show(10,False)

+-----------------------------------------------------+-----------+
|Publisher                                            |avg(Rating)|
+-----------------------------------------------------+-----------+
|Rock Reef Publishing House                           |5.0        |
|L.T.P. Publications                                  |5.0        |
|Susan Chapman Melanson                               |5.0        |
|Browngrotta Arts                                     |5.0        |
|Basic Trauma Life Support International, Incorporated|5.0        |
|National Assn of Office and Industrial Properties    |5.0        |
|City of Manchester Art Galleries                     |5.0        |
|Paris audiovisuel                                    |5.0        |
|Lumen Christi Pr                                     |5.0        |
|Indigo Reef Publishing Inc                           |5.0        |
+-----------------------------------------------------+-----------+
only showing top 10 rows



### 4.- Número promedio de páginas de todos los libros

In [7]:
booksDF.select(F.avg(F.col("pagesNumber"))).show()

+------------------+
|  avg(pagesNumber)|
+------------------+
|276.55165080445977|
+------------------+



### 5.- Número promedio de páginas de todos los libros por autor

In [8]:
# No me pide que lo ordene pero por qué no
booksDF.groupBy(F.col("Authors")).agg(F.mean(F.col("pagesNumber"))).orderBy(F.col("avg(pagesNumber)").desc()).show(10,False)

+------------------------------------+------------------+
|Authors                             |avg(pagesNumber)  |
+------------------------------------+------------------+
|Sandy Redburn                       |1807321.6         |
|A.B. Murphy                         |751507.3333333334 |
|John B. Hare                        |500000.0          |
|Logos Research Systems              |100000.0          |
|Progressive Management              |35428.4375        |
|Timothy McVeigh                     |33133.0           |
|Robert H. Wozniak                   |22100.0           |
|Veterans Affairs Department Research|16153.0           |
|World Spaceflight News              |13942.333333333334|
|Keith Crook                         |9999.0            |
+------------------------------------+------------------+
only showing top 10 rows



### 6.- Número promedio de páginas de todos los libros por Publisher

In [9]:
# No me pide que lo ordene pero por qué no
booksDF.groupBy(F.col("Publisher")).agg(F.mean(F.col("pagesNumber"))).orderBy(F.col("avg(pagesNumber)").desc()).show(10,False)

+-----------------------------------------------------------+------------------+
|Publisher                                                  |avg(pagesNumber)  |
+-----------------------------------------------------------+------------------+
|Crafty Secrets Publications                                |1807321.6         |
|Sacred-texts.com                                           |500000.0          |
|Department of Russian Language and Literature University of|322128.5714285714 |
|Logos Research Systems                                     |100000.0          |
|Encyclopedia Britannica, Incorporated                      |32642.0           |
|Progressive Management                                     |19106.3625        |
|Still Waters Revival Books                                 |10080.142857142857|
|P. Shalom Publications, Incorporated                       |8539.0            |
|Hendrickson Publishers, Inc. (Peabody, MA)                 |6448.0            |
|IEEE/EMB                   

### 7.- Número promedio de libros publicados por autor

In [10]:
booksDF.groupBy(F.col("Authors")).agg(F.count("*").alias("conteoLibros")).agg(F.mean(F.col("conteoLibros")).alias("PromedioAutores")).show()

+------------------+
|   PromedioAutores|
+------------------+
|2.7400268625729134|
+------------------+



### 8.- Ordenar los libros de mayor a menor (Top 15) por número de ratings dados por usuarios (excluir aquellos valores sin rating)

In [11]:
booksDF.where(F.col("Rating") != 0).orderBy(F.col("CountsOfReview").desc()).select(F.col("Id"),F.col("Name"),F.col("Rating"),F.col("CountsOfReview"))\
.show(15,False)

+-------+---------------------------------------------------------+------+--------------+
|Id     |Name                                                     |Rating|CountsOfReview|
+-------+---------------------------------------------------------+------+--------------+
|2767052|The Hunger Games (The Hunger Games, #1)                  |4.33  |154447        |
|41865  |Twilight (Twilight, #1)                                  |3.59  |94850         |
|19063  |The Book Thief                                           |4.37  |87685         |
|4667024|The Help                                                 |4.46  |76040         |
|3      |Harry Potter and the Sorcerer's Stone (Harry Potter, #1) |4.47  |75911         |
|3636   |The Giver (The Giver, #1)                                |4.13  |57034         |
|43641  |Water for Elephants                                      |4.09  |52918         |
|2429135|The Girl with the Dragon Tattoo (Millennium, #1)         |4.14  |52225         |
|136251 |H

### 9.- Obtener Top 5 de ratings más frecuentes otorgados por usuarios

In [12]:
booksDF.groupBy(F.col("Rating")).agg(F.count("*").alias("Conteo")).orderBy(F.col("Conteo").desc()).show(5)

+------+------+
|Rating|Conteo|
+------+------+
|   0.0|451783|
|   4.0|151979|
|   3.0| 87288|
|   5.0| 79827|
|   3.5| 45222|
+------+------+
only showing top 5 rows

