In [2]:
# importing necessary libraries 

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, LongType, DoubleType, StringType

# connection strings 
CATALOG_URI = "http://nessie:19120/api/v1"  # Nessie Server URI
WAREHOUSE = "s3://warehouse/"               # Minio Address to Write to
STORAGE_URI = "http://172.18.0.3:9000"     # Minio IP address from docker inspect

In [3]:
# connecting with pyspark 

conf = (
    pyspark.SparkConf()
        .setAppName('billing')
        # Include necessary packages
        .set("spark.sql.debug.maxToStringFields", "100000")
        .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,software.amazon.awssdk:bundle:2.24.8,software.amazon.awssdk:url-connection-client:2.24.8')
        # Enable Iceberg and Nessie extensions
    # org.postgresql:postgresql:42.7.3,
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        # Configure Nessie catalog
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', CATALOG_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        # Set Minio as the S3 endpoint for Iceberg storage
        .set('spark.sql.catalog.nessie.s3.endpoint', STORAGE_URI)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)


# creating the connection 
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Session Started")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-591f59ea-ac60-43e3-9173-a97db8e8a2ab;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.0 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.77.1 in central
	found software.amazon.awssdk#bundle;2.24.8 in central
	found software.amazon.awssdk#url-connection-client;2.24.8 in central
	found software.amazon.awssdk#utils;2.24.8 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found software.amazon.awssdk#annotations;2.24.8 in central
	found org.slf4j#slf4j-

Spark Session Started


In [4]:
# confirm the connection, if it shows the tables, then its working 

spark.sql("SHOW TABLES IN nessie").show()

+--------------------+--------------------+-----------+
|           namespace|           tableName|isTemporary|
+--------------------+--------------------+-----------+
|EnergyConsumption...|EnergyConsumption...|      false|
|             billing|    billing_data_raw|      false|
|fact_billing_and_...|fact_billing_and_...|      false|
|fact_billing_and_...|fact_billing_and_...|      false|
|fact_billing_and_...|fact_billing_and_...|      false|
|fact_billing_and_...|fact_billing_and_...|      false|
|fact_billing_and_...|fact_billing_and_...|      false|
|fact_billing_and_...|fact_billing_and_...|      false|
|fact_network_and_...|fact_network_and_...|      false|
|fact_network_and_...|fact_network_and_...|      false|
|       fault_tickets|fault_tickets_dat...|      false|
|        feedermaster|feedermaster_data...|      false|
|       feedervoltage|feedervoltage_dat...|      false|
|           pmtmaster|  pmtmaster_data_raw|      false|
|        power_report|power_report_data...|     

In [5]:
# datapath of all tables 

billing_data = spark.read.table("nessie.billing.billing_data_raw")
fault_tickets = spark.read.table("nessie.fault_tickets.fault_tickets_data_raw")
energy_consumption = spark.read.table("nessie.EnergyConsumptionFeederwise.EnergyConsumptionFeederwise_data_raw")
feeder_master = spark.read.table("nessie.feedermaster.feedermaster_data_raw")
voltage = spark.read.table("nessie.feedervoltage.feedervoltage_data_raw")
pmt_master = spark.read.table("nessie.pmtmaster.pmtmaster_data_raw")
power_report = spark.read.table("nessie.power_report.power_report_data_raw")
recovery_data = spark.read.table("nessie.recovery.recovery_data_raw")

# confirm if the paths are correct 
billing_data.head()
fault_tickets.head()
energy_consumption.head()
feeder_master.head()
voltage.head()
pmt_master.head()
power_report.head()
recovery_data.head()

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

Row(Bank Account Number=1, Contract Account (Partner-Independent Data)=400000001080, Document Date='01.01.2024', Changed On='00000000', Document Type='PY', Reference Specifications from Contract=30000008, Posting date in the document='30.12.2023', IBC='NORTH NAZIMABAD', Rate Category='Commercial with Fixed Charges TOD', Region='C10', Cycle Day=11, Ord-Ind-PSC (OIP)='ORD', Tariff='A2-', Value Date='30.12.2023', Bank Number of Other Bank Key=2106, Reference specifications for bank details='S1', Bank clearing account='AYC015', Cheque Number=None, Additional info=420016572054, Payment Lot='P2401011646M', Item number in a payment lot=2656, Resetting type=None, Reset Document=None, Calendar Year/Month='DEC 2023', No. of Stubs=1, Cash Amount=52091.0, Cheque Amount=0.0, Total Amount=52091.0, Auxillary Cash Amount=0.0, Auxillary Cheque Amount=0.0, Total Auxillary Amount=0.0, Overall Total=52091.0, SD Suspense Amount=None, Cash Suspense Amount=None)

In [None]:
# additional libraries
from pyspark.sql.functions import to_date, split, col, min, max, when
from pyspark.sql.functions import *

# cleaning the data


class InitClean:
    def __init__(self, path):
        self.dataframe = path

    def drop_duplicates(self):
        """
        returns a new DataFrame with entire duplicate rows removed.
        """
        self.dataframe = self.dataframe.dropDuplicates()
        return self

    def replace_garbage(self):
        """
        changes the column type to allow null even
        if there were no null values previously.
        """
        invalid_values = ['[NULL]', '-', 'Not assigned', 'NULL', 'null']
        self.dataframe = self.dataframe.replace(invalid_values, None)
        return self

    def convert_to_date(self, columns: list):
        """
        converts the column type to date, however lists the values as yyyy-mm-dd,
        date operations can work on this type. To change the visualisation format
        use date_format(to_date(col(column_name), "dd.MM.yyyy"), "dd-MM-yyyy")
        although, date_format changes the data type to string but it can be used
        for visualization only.
        """
        for column in columns:
            self.dataframe = self.dataframe.withColumn(
                column, to_date(col(column)), "dd.MM.yyyy")

        for column in columns:
            data_type = self.dataframe.schema[column].dataType
            if isinstance(data_type, DateType):
                print(
                    f"Text to date conversion successful for column: {column}")

        return self

    def break_date_time(self, columns: list):
        """
        withColumn returns a new DataFrame by adding a column
        or replacing the existing column that has the same name.
        """
        for column in columns:
            if column in self.dataframe.columns:
                self.dataframe = self.dataframe \
                    .withColumn(f"{column}_Date", split(col(column), " ").getItem(0))
                self.convert_to_date([f"{column}_Date"])
                self.dataframe = self.dataframe \
                    .withColumn(f"{column}_Time", concat(
                        split(split(col(column), " ").getItem(
                            1), ":").getItem(0), lit(":"),
                        split(split(col(column), " ").getItem(
                            1), ":").getItem(1), lit(":"),
                        regexp_replace(split(split(col(column), " ").getItem(
                            1), ":").getItem(2), "\\s.*", ""),
                        lit(" "),
                        regexp_replace(col(column), ".*\\s([AP]M).*", "$1")
                    ))
        return self

    def drop_empty_columns(self):
        """
        drops columns that are entirely null or empty across all rows.

        Returns:
        self: modified DataFrame with empty columns removed
        """
        # get columns with all null values
        empty_columns = [
            column for column in self.dataframe.columns
            if self.dataframe.filter(col(column).isNotNull()).count() == 0
        ]

        # drop the identified empty columns
        if empty_columns:
            print(f"Dropping empty columns: {empty_columns}")
            self.dataframe = self.dataframe.drop(*empty_columns)

        return self

    def adjust_column_types(self):
        """
        automatically adjusts column data types based on the min and max values in each column.
        handles NULL values appropriately and maintains data consistency.
        """
        for column in self.dataframe.columns:
            try:
                # get current data type
                current_type = self.dataframe.schema[column].dataType

                # skip if column is StringType
                if isinstance(current_type, StringType):
                    continue

                # first ensure nulls are handled properly
                df_clean = self.dataframe.withColumn(
                    column,
                    when(col(column).isin(
                        ['[NULL]', 'NULL', 'null', '-']), None)
                    .otherwise(col(column))
                )

                # for non-double columns, determine the best type
                stats = df_clean.select(
                    col(column).cast(DoubleType()).alias(column)
                ).agg(
                    min(column).alias("min_val"),
                    max(column).alias("max_val")
                ).collect()[0]

                min_val, max_val = stats["min_val"], stats["max_val"]

                # skip if both values are None
                if min_val is None and max_val is None:
                    continue

                # determine appropriate type based on value range
                if all(x is not None for x in [min_val, max_val]):
                    if all(isinstance(x, (int, float)) for x in [min_val, max_val]):
                        if min_val >= -9223372036854775808 and max_val <= 9223372036854775807:
                            new_type = LongType()
                        else:
                            new_type = DoubleType()

                        # apply the new type
                        self.dataframe = df_clean.withColumn(
                            column,
                            col(column).cast(new_type)
                        )

            except Exception as e:
                print(f"Could not process column {column}: {str(e)}")
                continue

        return self

    def get_dataframe(self):
        """
        returns the cleaned dataframe.
        """
        return self.dataframe


In [None]:
# driver code 

# Functions: 
# drop_duplicates, replace_garbage, convert_to_date, break_date_time, drop_empty_columns, adjust_column_types

# Databases 
# billing_data, fault_tickets, energy_consumption_feeder_wise, feeder_master, feeder_voltage, pmt_master, power_report, recovery_data 

# Billing Table
billing_cleaned = InitClean(billing_data) \
    .drop_duplicates() \
    .convert_to_date(["Posting date in the document", "Due Date (Print Doc)", "Issue Date (Print Doc's Posting Date)"]) \
    .replace_garbage() \
    .drop_empty_columns() \
    .adjust_column_types()

# Energy Comsumption Feeder Wise 
energy_con_fw_cleaned = InitClean(energy_consumption) \
    .drop_duplicates() \
    .convert_to_date(["Sentout Date"]) \
    .replace_garbage() \
    .adjust_column_types() \
    .drop_empty_columns()

# Cleaned Fault Tickets  
# convert_to_time: columns that are being broken into date and time, and AreaSecureTAT, TrueCallerTAT
# checks for if a column is empty or null then skip 
fault_tickets_clean = InitClean(fault_tickets)\
    .drop_duplicates() \
    .replace_garbage() \
    .drop_empty_columns() \
    .adjust_column_types() \
    .break_date_time(["Ticket Created at", "Notification Time", "CompletedAt", "AreaSecuredTime", "TrueCallerTime"]) \

# Cleaned Feeder Master Data
feeder_mas_cleaned = InitClean(feeder_master) \
    .drop_duplicates() \
    .replace_garbage() \
    .drop_empty_columns() \
    .adjust_column_types()

# Cleaned Voltage Data
# Need to convert apply convert_to_time to Time table 
voltage_cleaned = InitClean(voltage) \
    .drop_duplicates() \
    .replace_garbage() \
    .drop_empty_columns() \
    .adjust_column_types()

# Cleaned PMT Master Data
pmt_mast_cleaned = InitClean(pmt_master) \
    .drop_duplicates() \
    .replace_garbage() \
    .drop_empty_columns() \
    .adjust_column_types()

# Cleaned Power Report Data 
# need to apply convert_to_time to Time column 
power_rep_cleaned = InitClean(power_report) \
    .drop_duplicates() \
    .replace_garbage() \
    .drop_empty_columns() \
    .adjust_column_types()

# Cleaned Recovery Data
recovery_cleaned = InitClean(recovery_data) \
    .drop_duplicates() \
    .replace_garbage() \
    .drop_empty_columns() \
    .adjust_column_types()

# to check the results
cleaned_billing.dtypes
cleand_ecfw.dtypes
fault_tickets.dtypes

In [21]:
# Stop SparkSession
# spark.stop()
recovery_data.dtypes

[('Bank Account Number', 'int'),
 ('Contract Account (Partner-Independent Data)', 'bigint'),
 ('Document Date', 'string'),
 ('Changed On', 'string'),
 ('Document Type', 'string'),
 ('Reference Specifications from Contract', 'int'),
 ('Posting date in the document', 'string'),
 ('IBC', 'string'),
 ('Rate Category', 'string'),
 ('Region', 'string'),
 ('Cycle Day', 'int'),
 ('Ord-Ind-PSC (OIP)', 'string'),
 ('Tariff', 'string'),
 ('Value Date', 'string'),
 ('Bank Number of Other Bank Key', 'int'),
 ('Reference specifications for bank details', 'string'),
 ('Bank clearing account', 'string'),
 ('Cheque Number', 'int'),
 ('Additional info', 'bigint'),
 ('Payment Lot', 'string'),
 ('Item number in a payment lot', 'int'),
 ('Resetting type', 'int'),
 ('Reset Document', 'bigint'),
 ('Calendar Year/Month', 'string'),
 ('No. of Stubs', 'int'),
 ('Cash Amount', 'double'),
 ('Cheque Amount', 'double'),
 ('Total Amount', 'double'),
 ('Auxillary Cash Amount', 'double'),
 ('Auxillary Cheque Amount', 

In [None]:
# FUTURE WORK 


# from pyspark.sql.functions import col, to_date, regexp_replace
# from dateutil.parser import parse
# from pyspark.sql.types import DateType, StringType

# def auto_detect_date_columns(df):
#     """
#     Automatically detect potential date columns based on content
#     """
#     date_columns = []
#     for column, dtype in df.dtypes:
#         # Only check string columns
#         if dtype == 'string':
#             # Sample first few rows to check date pattern
#             sample = df.select(column).limit(10).rdd.flatMap(lambda x: x).collect()
            
#             for value in sample:
#                 if value and isinstance(value, str):
#                     try:
#                         # Try parsing the value as a date
#                         parsed_date = parse(value, fuzzy=False)
#                         date_columns.append(column)
#                         break
#                     except:
#                         continue
    
#     return date_columns

# def intelligent_date_conversion(df, metadata_table=None):
#     """
#     Intelligently convert date columns with metadata tracking
    
#     :param df: Input DataFrame
#     :param metadata_table: Optional tracking table for previous conversions
#     :return: Converted DataFrame
#     """
#     # Detect potential date columns
#     potential_date_columns = auto_detect_date_columns(df)
    
#     # If metadata table exists, filter out already converted columns
#     if metadata_table:
#         converted_columns = metadata_table.select('column_name').rdd.flatMap(lambda x: x).collect()
#         potential_date_columns = [col for col in potential_date_columns if col not in converted_columns]
    
#     # Attempt conversion for each potential date column
#     for column in potential_date_columns:
#         # Try multiple common date formats
#         date_formats = [
#             'dd.MM.yyyy', 
#             'MM.dd.yyyy', 
#             'yyyy.MM.dd', 
#             'dd-MM-yyyy', 
#             'MM-dd-yyyy', 
#             'yyyy-MM-dd'
#         ]
        
#         for date_format in date_formats:
#             try:
#                 # Attempt conversion with current format
#                 df = df.withColumn(
#                     column, 
#                     to_date(col(column), date_format)
#                 )
                
#                 # If successful, break out of format loop
#                 break
#             except:
#                 continue
    
#     return df




# Automatic Date Detection:

# Uses dateutil.parser to intelligently detect potential date columns
# Checks only string columns
# Samples first few rows to determine date-like content


# Intelligent Conversion:

# Tries multiple common date formats
# Handles different international date formats
# Skips already converted columns if metadata tracking is used


# Metadata Tracking:

# Optional tracking of already converted columns
# Prevents redundant conversions in future runs


# Additional Recommendations:

# Create a metadata table to track:

# Columns converted
# Conversion date
# Conversion format


# Add logging to track conversion attempts and successes
# Handle edge cases like:

# Columns with mixed date formats
# Partially invalid date columns



# Considerations:

# Performance overhead for large datasets
# Might need fine-tuning based on your specific data patterns
# Fuzzy parsing can sometimes misinterpret non-date strings



# FUTURE WORK 
#                           - NEED TO INCLUDE ERROR HANDLING/ TRY EXCEPT CONDITIONS 
#                           - NEED TO PERFORM UNIT TESTING AND CHECKS FOR EVERY CONDITION, EG: DATE CONVERSION