In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
from pyspark.sql import DataFrame

# Start a Spark session for this notebook, or reuse one that is already
# running; the application is registered under the name "Data_Validation".
spark = (
    SparkSession.builder
    .appName("Data_Validation")
    .getOrCreate()
)

In [2]:
# Root of the ADLS Gen2 "source" container that holds the data files to validate.
data_path = "abfss://source@adlssalesproject2448pp1.dfs.core.windows.net"

# The control file lives in the same container, so derive its location from
# data_path instead of repeating the container URI (keeps the two in sync).
control_file_path = f"{data_path}/CONTROL FILES.csv"

In [3]:
# Load the control file (CSV with a header row, column types inferred).
# control_df is bound to None up front so that a failed read leaves a
# well-defined value for the emptiness check that follows, instead of an
# unbound name (NameError) or a stale DataFrame from an earlier run.
control_df = None
try:
    control_df = spark.read.csv(control_file_path, header=True, inferSchema=True)
    # Show control data to check if it's read correctly
    control_df.show()
except Exception as e:
    # Discard any partially usable frame so downstream cells see the failure.
    control_df = None
    print(f"Error reading control file: {e}")


In [4]:
# Stop immediately when the control file was not loaded or has no data rows.
control_missing = control_df is None
if control_missing or len(control_df.head(1)) == 0:
    raise ValueError("Control file is empty or not read correctly")

# Map the control file's headers onto the names used for validation.
# NOTE(review): withColumnRenamed leaves the frame unchanged when the source
# column does not exist (e.g. a header with stray whitespace) - confirm the
# exact header spelling in the control file.
header_renames = {
    "File Name": "file_name",
    "Record count": "expected_row_count",
}
for old_header, new_header in header_renames.items():
    control_df = control_df.withColumnRenamed(old_header, new_header)
control_df.show()

In [5]:
# Inspect the control DataFrame: first its schema, then the exact column
# names as a Python list (the list repr makes stray whitespace visible).
control_df.printSchema()
control_columns = control_df.columns
print(control_columns)

In [6]:
# Compare the actual row count of every file listed in the control file with
# the expected count recorded there. Each outcome is stored as a
# (file_name, actual_count, status) tuple; actual_count is None on error.
validation_results = []

# Resolve the file-name column tolerantly: accept either the original header
# or its renamed form, ignoring surrounding whitespace. This avoids depending
# on one exact spelling such as a header with a trailing space.
file_name_col = next(
    (c for c in control_df.columns if c.strip() in ("file_name", "File Name")),
    None,
)
if file_name_col is None:
    raise KeyError(
        f"No file-name column found in control file columns: {control_df.columns}"
    )

for row in control_df.collect():
    file_name = row[file_name_col]

    try:
        # Parsed inside the try block so that a missing or non-numeric
        # expected count is reported for this file only, instead of
        # aborting validation of all remaining files.
        expected_row_count = int(row['expected_row_count'])

        file_path = f"{data_path}/{file_name}"
        df = spark.read.csv(file_path, header=True)
        row_count = df.count()

        if row_count == expected_row_count:
            status = "Row count Matched"
        else:
            status = "Row count NOT Matched"
        validation_results.append((file_name, row_count, status))

    except Exception as e:
        validation_results.append((file_name, None, f"ERROR: {str(e)}"))

In [7]:
# Dump the raw (file_name, actual_count, status) tuples gathered by the
# validation loop for a quick visual check.
print(validation_results)

In [8]:
# Turn the collected tuples into a DataFrame. The schema is given explicitly
# (as a DDL string) rather than inferred: inference fails when the list is
# empty or when every actual_count is None, i.e. when every file errored.
result_schema = "file_name string, actual_count long, status string"
result_df = spark.createDataFrame(validation_results, schema=result_schema)
result_df.show()

In [9]:
# Persist the full validation report as CSV with a header row.
# coalesce(1) reduces the result to a single partition before writing, and any
# previous output at this path is overwritten.
# NOTE(review): this writes to storage account "...pp2" while the inputs are
# read from "...pp1" - confirm this is intentional and not a typo.
output_path = "abfss://validation@adlssalesproject2448pp2.dfs.core.windows.net/Output.csv"
# The save mode is set once via .mode(); the duplicate mode= keyword was redundant.
result_df.coalesce(1).write.mode("overwrite").csv(output_path, header=True)

In [10]:
# Keep every row that is not a clean match: count mismatches and read errors.
# The column is referenced by its exact name ("status") so the filter does not
# depend on Spark's case-insensitive column resolution being enabled.
error_df = result_df.filter(col("status") != "Row count Matched")

In [11]:
# Write an error log only when at least one file failed validation.
error_row_total = error_df.count()
if error_row_total > 0:
    error_log_path = "abfss://validation@adlssalesproject2448pp2.dfs.core.windows.net/ErrorLog.csv"
    # Single partition, replacing whatever was previously written at this path.
    single_partition_errors = error_df.coalesce(1)
    single_partition_errors.write.mode("overwrite").csv(error_log_path, header=True)

In [12]:
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import pandas as pd

import os  # used to read the SMTP password from the environment

# Email an HTML report of the failed validations, if there are any.
# Sending is best-effort: any failure is printed rather than raised.
if 'error_df' in locals() and error_df.count() > 0:
    # Convert the Spark DataFrame to Pandas for easy manipulation
    error_pdf = error_df.toPandas()

    # Convert the DataFrame to an HTML table (or string)
    html_table = error_pdf.to_html(index=False)  # Use .to_string() if you prefer plain text

    # Email parameters
    sender_email = "potghanpramod13@gmail.com"
    receiver_email = "pppotghan@gmail.com"
    subject = "Error DataFrame Report"
    smtp_server = "smtp.gmail.com"  # Example: "smtp.gmail.com" for Gmail
    smtp_port = 587  # For Gmail, it's 587 for TLS
    # Never keep credentials in the notebook: the app password is supplied
    # through the SMTP_APP_PASSWORD environment variable instead.
    password = os.environ.get("SMTP_APP_PASSWORD")

    # Create the email body in HTML format
    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = receiver_email
    msg['Subject'] = subject

    # Add the DataFrame as part of the email body
    body = f"""
    <html>
        <body>
            <p>Hello Sir/Ma'am,</p>
            <p>Kindly find below details of errors in Data Validation Phase as you requested:</p>
            {html_table}
            <p>Regards,<br>Data Engineer Team</br></p>
        </body>
    </html>
    """
    msg.attach(MIMEText(body, 'html'))

    # Send the email
    if not password:
        print("Failed to send email. Error: SMTP_APP_PASSWORD environment variable is not set")
    else:
        try:
            # The context manager closes the connection even when starttls()
            # or login() raises; the timeout prevents an indefinite hang.
            with smtplib.SMTP(smtp_server, smtp_port, timeout=30) as server:
                server.starttls()  # Secure the connection
                server.login(sender_email, password)
                server.sendmail(sender_email, receiver_email, msg.as_string())
            print("Email sent successfully!")
        except Exception as e:
            print(f"Failed to send email. Error: {str(e)}")
else:
    print("No errors to report.")
