# Dedupe publishers

This notebook attempts to deduplicate the `Publisher` column of the SPL inventory dataset.

In [1]:
import os

import findspark
findspark.init()

from dotenv import load_dotenv
import matplotlib.pyplot as plt
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, FloatType, StringType, StructField, StructType
from pyspark_dist_explore import hist
from pyspark.sql import types

import helpers as H

%matplotlib inline

# Spark configuration: put the similarity-UDF jar on the driver classpath and
# on the list of jars shipped with the application.
conf = (
    SparkConf()
    .set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
    .set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
)

# Reuse a running SparkContext if there is one, otherwise start it with `conf`.
sc = SparkContext.getOrCreate(conf=conf)
sc.setCheckpointDir("temp_graphframes/")
spark = SparkSession(sc)

# Expose the JVM string-comparison functions from the jar as Spark SQL UDFs.
spark.udf.registerJavaFunction(
    name="jaro_winkler_sim",
    javaClassName="uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
    returnType=types.DoubleType(),
)
spark.udf.registerJavaFunction(
    name="Dmetaphone",
    javaClassName="uk.gov.moj.dash.linkage.DoubleMetaphone",
    returnType=types.StringType(),
)

# Pull settings from a local .env file into the process environment, then read
# the dataset locations from it (each is None when the variable is not set).
load_dotenv()

GOODREADS_BOOKS_PATH = os.environ.get("GOODREADS_BOOKS_PATH")
SPL_INVENTORY_PATH = os.environ.get("SPL_INVENTORY_PATH")

In [2]:
# Explicit schema for the Goodreads books CSV. All fields are nullable.
# NOTE: the last field was previously misspelled "publicater"; it is now
# "publisher". Update any later cell that still refers to the old name.
book_schema = StructType([
    StructField("bookID", StringType(), True),
    StructField("title", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("average_rating", FloatType(), True),
    StructField("isbn", StringType(), True),
    StructField("isbn13", StringType(), True),
    StructField("language_code", StringType(), True),
    StructField("num_pages", IntegerType(), True),
    StructField("ratings_count", IntegerType(), True),
    StructField("text_reviews_count", IntegerType(), True),
    StructField("publication_date", StringType(), True),
    StructField("publisher", StringType(), True),
])

# Read the Goodreads books with the schema above and, in the same expression,
# replace the raw "M/d/yyyy" publication date string with a parsed timestamp.
goodreads_df = (
    spark.read
    .schema(book_schema)
    .option("header", "true")
    .csv(GOODREADS_BOOKS_PATH)
    .withColumn(
        "publication_date",
        F.to_timestamp(F.col("publication_date"), "M/d/yyyy"),
    )
)

# SPL inventory: no schema is supplied, so column names come from the header row.
spl_df = spark.read.option("header", "true").csv(SPL_INVENTORY_PATH)

In [3]:
# Distinct, non-null publisher names from the SPL inventory, sorted by name.
# The unique_id is generated in the select, i.e. before the null filter and
# the de-duplication are applied.
spl_pub_df = (
    spl_df
    .select(
        F.monotonically_increasing_id().alias("unique_id"),
        spl_df["Publisher"],
    )
    .where(spl_df["Publisher"].isNotNull())
    .dropDuplicates(["Publisher"])
    .orderBy(spl_df["Publisher"])
)
# Print the first 20 rows without truncating long publisher strings.
spl_pub_df.show(n=20, truncate=False)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------+
|unique_id  |Publisher                                                                                                                             |
+-----------+--------------------------------------------------------------------------------------------------------------------------------------+
|25769979298| ""Ready track""                                                                                                                      |
|8589980192 | 107.7 cu. in. disp."                                                                                                                 |
|162537     | 110                                                                                                                                  |
|17180088828| 1746-1896. Edited by Horace C. Hovey."                                                      

In [4]:
# Preview about 20 randomly chosen publishers to eyeball data quality.
pub_count = spl_pub_df.count()
# Sampling without replacement needs a fraction in [0, 1]. Clamp it so a frame
# with fewer than 20 rows does not raise, and avoid dividing by zero when the
# frame is empty.
sample_fraction = min(1.0, 20.0 / pub_count) if pub_count else 0.0
# sample() only approximates the requested fraction, hence the limit(20) cap.
spl_pub_df.sample(False, fraction=sample_fraction).limit(20).show(20, False)

+-----------+----------------------------------------------------------------------------------------------+
|unique_id  |Publisher                                                                                     |
+-----------+----------------------------------------------------------------------------------------------+
|25770057374| Sylvie Blum."                                                                                |
|38070      |Austro Mechana Historic Recordings,                                                           |
|154445     |Book Division, Fairchild Publications,                                                        |
|4890       |Daemon Records,                                                                               |
|8589940706 |Editorial LIBSA,                                                                              |
|8590045755 |Educational Department, League of Nations Assoc., Inc.,                                       |
|8590216420 |G.C. M

In [5]:
# Row count of spl_pub_df, i.e. the number of distinct non-null Publisher values.
spl_pub_df.count()

96894

In [6]:
from splink import Splink

In [7]:
# Single comparison column: the publisher name, with term-frequency
# adjustments switched on.
publisher_comparison = dict(
    col_name="Publisher",
    term_frequency_adjustments=True,
)

# Splink settings for de-duplicating one dataset against itself.
# No blocking rules are configured.
settings = dict(
    link_type="dedupe_only",
    blocking_rules=[],
    comparison_columns=[publisher_comparison],
)

# linker = Splink(settings, spark=spark, df_or_dfs=spl_pub_df)
# df_e = linker.get_scored_comparisons()