In [1]:
import os

import findspark
findspark.init()

from dotenv import load_dotenv
import matplotlib.pyplot as plt
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, FloatType, StringType, StructField, StructType
from pyspark_dist_explore import hist
from pyspark.sql import types

import helpers as H

%matplotlib inline

conf = SparkConf()
conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")

sc = SparkContext.getOrCreate(conf=conf)
sc.setCheckpointDir("temp_graphframes/")
spark = SparkSession(sc)

spark.udf.registerJavaFunction(
    "jaro_winkler_sim",
    "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
    types.DoubleType(),
)    
spark.udf.registerJavaFunction(
    "Dmetaphone", "uk.gov.moj.dash.linkage.DoubleMetaphone", types.StringType()
)
    
load_dotenv()

GOODREADS_BOOKS_PATH = os.getenv("GOODREADS_BOOKS_PATH")
SPL_INVENTORY_PATH = os.getenv("SPL_INVENTORY_PATH")

In [2]:
book_schema = StructType([
    StructField("bookID", StringType(), True),
    StructField("title", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("average_rating", FloatType(), True),
    StructField("isbn", StringType(), True),
    StructField("isbn13", StringType(), True),
    StructField("language_code", StringType(), True),
    StructField("num_pages", IntegerType(), True),    
    StructField("ratings_count", IntegerType(), True),    
    StructField("text_reviews_count", IntegerType(), True),
    StructField("publication_date", StringType(), True),
    StructField("publicater", StringType(), True),            
])
goodreads_df = spark.read.schema(book_schema).option("header", "true").csv(GOODREADS_BOOKS_PATH)
# Replace publication date with a datetime
goodreads_df = goodreads_df.withColumn(
    "publication_date", 
    F.to_timestamp(goodreads_df.publication_date, "M/d/yyyy"),
)

spl_df = spark.read.option("header", "true").csv(SPL_INVENTORY_PATH)

In [3]:
spl_auth_df = (
    spl_df
    .select(
        F.monotonically_increasing_id().alias("unique_id"),
        spl_df.Author,
    )
    .filter(spl_df.Author.isNotNull())
    .drop_duplicates(["Author"])
    .sort(spl_df.Author)
)
spl_auth_df.show(20, False)
spl_auth_count_df = spl_auth_df.count()
print(spl_auth_count_df)

+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|unique_id  |Author                                                                                                                                                                                    |
+-----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|25769820997| "" by George Pullen Jackson."                                                                                                                                                            |
|8589937466 | ""681 Station Road                                                                                                                                                                    

In [4]:
spl_auth_df.sample(False, fraction=1.0 * 20 / spl_auth_count_df).limit(20).show(20, False)

+-----------+------------------------------------------+
|unique_id  |Author                                    |
+-----------+------------------------------------------+
|234268     |Adams, Edward Dean, 1846-1931             |
|25769998992|Bell, Harold K.                           |
|25769945116|Chen, Congzhou, 1918-2000.                |
|8590158285 |Codona (Musical group)                    |
|8590104499 |Davidowitz, Steven                        |
|241726     |Ewing, Alex C.                            |
|25769857408|Farber, Samuel, 1939-                     |
|17179932801|Free Public Library of Jersey City        |
|27505      |Gonzales, Mark, 1975-                     |
|8590143091 |Imamura, Kikuko.                          |
|8589959168 |Jansen, Robert B., 1922-                  |
|17179933658|Mancuso, Joseph                           |
|17179887777|McMichael, James, 1939-                   |
|17179997376|Reinarz, Jonathan                         |
|8589952527 |Roche, Catherine (

In [7]:
def format_spl_author(author):
    parts = author.split(", ")
    if len(parts) == 2:
        return parts[1] + " " + parts[0]
    elif len(parts) == 3:
        #assert any(char.isdigit() for char in parts[2])
        return parts[1] + " " + parts[0]    
    else:
        return author
        
format_spl_author_udf = F.udf(format_spl_author)

In [11]:
spl_auth_formatted_df = spl_auth_df.withColumn("author_formatted", format_spl_author_udf(spl_auth_df.Author))
spl_auth_formatted_df.sample(False, fraction=1.0 * 20 / spl_auth_count_df).limit(20).show(20, False)

+-----------+-------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+
|unique_id  |Author                                                                                           |author_formatted                                                                                 |
+-----------+-------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------+
|23750      |Barkow, Henriette                                                                                |Henriette Barkow                                                                                 |
|8589962188 |Chang, T. Susan                                                                                  |T. Susan Chang                                   

In [13]:
goodreads_authors_df = goodreads_df.withColumn(
    "authors", 
    F.explode(F.split(goodreads_df.authors, "/")),
)
goodreads_authors_df.show(20)

+------+--------------------+--------------------+--------------+----------+-------------+-------------+---------+-------------+------------------+-------------------+------------------+
|bookID|               title|             authors|average_rating|      isbn|       isbn13|language_code|num_pages|ratings_count|text_reviews_count|   publication_date|        publicater|
+------+--------------------+--------------------+--------------+----------+-------------+-------------+---------+-------------+------------------+-------------------+------------------+
|     1|Harry Potter and ...|        J.K. Rowling|          4.57|0439785960|9780439785969|          eng|      652|      2095690|             27591|2006-09-16 00:00:00|   Scholastic Inc.|
|     1|Harry Potter and ...|       Mary GrandPré|          4.57|0439785960|9780439785969|          eng|      652|      2095690|             27591|2006-09-16 00:00:00|   Scholastic Inc.|
|     2|Harry Potter and ...|        J.K. Rowling|          4.49|