
###### Mounting ADB with ADLS Gen2 (for reading the data)

In [0]:
#Show all scopes in Databricks
# Fetches the secret scopes via dbutils.secrets.listScopes() and renders the result.
scopes = dbutils.secrets.listScopes()
display(scopes)

In [0]:
#Show all secrets in scope
# Lists the entries registered under the "lms-scope" secret scope and renders them.
# NOTE(review): presumably this returns secret metadata (key names), not values — confirm.

secrets = dbutils.secrets.list("lms-scope")
display(secrets)

In [0]:
#Reading secrets from scope
# Pulls the three values used to build the OAuth configuration for the storage mount
# from the "lms-scope" secret scope:
#   appid              - read from key "lms-appid"    (used as the OAuth client id)
#   service_credential - read from key "lms-secretid" (used as the OAuth client secret)
#   directoryid        - read from key "lms-tenant"   (used in the token endpoint URL)

appid = dbutils.secrets.get(scope="lms-scope",key="lms-appid")
service_credential = dbutils.secrets.get(scope="lms-scope",key="lms-secretid")
directoryid = dbutils.secrets.get(scope="lms-scope",key="lms-tenant")

# The values are intentionally NOT displayed or printed: credentials must never be
# written to notebook output, where they can end up in exports, revisions or logs.

In [0]:
%python
configs = {"fs.azure.account.auth.type": "OAuth",
          "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
          "fs.azure.account.oauth2.client.id": appid,
          "fs.azure.account.oauth2.client.secret": service_credential,
          "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{directoryid}/oauth2/token"}

# Optionally, you can add <directory-name> to the source URI of your mount point.
dbutils.fs.mount(
  source = "abfss://new@lmsstorageaccount2025.dfs.core.windows.net/",
  mount_point = "/mnt/silver",
  extra_configs = configs)

In [0]:
#Show all data in silver
# Lists the entries at the root of the /mnt/silver mount. As the last expression of
# the cell, the returned list becomes the cell output.

dbutils.fs.ls("/mnt/silver")


#### Books Table

In [0]:
# Load the raw books CSV from the silver mount.
# The first line is treated as the header and column types are inferred from the data.

books = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/mnt/silver/books_table_raw.csv")
)

display(books)

In [0]:
#Data-type of each column
# Prints the schema of books; the column types are the ones inferred when the CSV was
# read with inferSchema enabled.

books.printSchema()

In [0]:
#Descriptive stats of book_price
# Restricts books to the book_price column, computes its summary statistics with
# describe() and prints the resulting table with show().

books.select('book_price').describe().show() 

In [0]:
#Renaming bk to BK in book_id values
# NOTE(review): this step is disabled (everything below is commented out). As written,
# withColumnRenamed would rename the book_id column itself to 'BK' rather than change
# the values inside it — confirm the intent before re-enabling.

# books = books.withColumnRenamed('book_id','BK')
# books.display(10)

In [0]:
# Count the NULL values in every column of books.
# Each column is mapped to 1 where it is NULL and 0 otherwise, then summed; the result
# is a single row holding one count per column.

# NOTE: importing `sum` from pyspark here shadows Python's built-in sum() for the
# cells that follow.
from pyspark.sql.functions import col, sum

books.select(
    *[
        sum(col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in books.columns
    ]
).display()

In [0]:
#Checking how many duplicate rows we have
# Only the last expression of a cell is rendered, so the unique-row count is printed
# explicitly. It is computed once and reused rather than running distinct().count()
# twice.

unique_row_count = books.distinct().count()  # Count of unique rows
print(f"Unique rows: {unique_row_count}")

books.count() - unique_row_count  # Number of duplicate rows

In [0]:
# book_id is expected to be unique: list every row whose book_id occurs more than once

from pyspark.sql.functions import col

# book_id values that appear on more than one row
duplicate_book_ids = (
    books.groupBy("book_id")
    .count()
    .where(col("count") > 1)
    .select("book_id")
)

# Inner-join back onto books to recover the full rows for those book_id values
duplicate_rows = books.join(duplicate_book_ids, "book_id", "inner")

# Render the offending rows
duplicate_rows.display(truncate=False)

In [0]:
# Spot check: show every row for a single book_id (BK023) to inspect its duplicates

books.where(col("book_id") == "BK023").display()

In [0]:
# Keep a single row per book_id.
# NOTE(review): which of the duplicated rows survives is not controlled here —
# confirm that keeping any one of them is acceptable.
books = books.drop_duplicates(["book_id"])

# Repeat the BK023 spot check to confirm the duplicates are gone
books.where(col("book_id") == "BK023").display()

In [0]:
# Render books after the book_id de-duplication step
books.display()

#32 duplicate rows have been removed
# NOTE(review): the figure above is a hand-recorded observation, not computed in this
# cell — re-check it if the input file changes.

In [0]:
# Lower-case the values of the author and publisher columns

from pyspark.sql.functions import lower

for text_column in ('author', 'publisher'):
    books = books.withColumn(text_column, lower(col(text_column)))

books.display()


#### Book Copies Table

In [0]:
# List the contents of the /mnt/silver mount and render the listing
display(dbutils.fs.ls('/mnt/silver/'))

In [0]:
# Load the raw book copies CSV from the silver mount.
# The first line is treated as the header and column types are inferred from the data.

book_copies = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/mnt/silver/book_copies_table_raw.csv')
)

book_copies.display()