In [None]:
import pandas as pd
import random
import numpy as np

# Seed both the stdlib and the NumPy global generators with the same value so
# that every run of the notebook produces the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Function to generate rounded off numbers
def generate_rounded_off_numbers():
    """Return a two-decimal number and its nearest-integer form, as strings.

    Returns:
        A ``(source, destination)`` tuple: ``source`` is a value drawn from
        ``random.uniform(1, 10000)`` and rounded to two decimals,
        ``destination`` is that value passed through ``round()`` with no
        digits argument (an integer).
    """
    value = round(random.uniform(1, 10000), 2)
    return str(value), str(round(value))

# Function to generate decimal precision differences
def generate_decimal_precision_difference():
    """Return one number written with two different decimal precisions.

    A value is drawn from ``random.uniform(1, 10000)`` and rounded to a random
    number of decimals (1-5). The destination is the same value re-formatted
    with a fixed number of decimals (1-5), chosen at random among the
    formattings whose text differs from the source.

    Drawing the second precision independently (as was done before) could
    reproduce the source string exactly, yielding a row labelled as a
    precision difference that contained no difference at all.

    Returns:
        A ``(source, destination)`` tuple of strings that are never equal.
    """
    num = round(random.uniform(1, 10000), random.randint(1, 5))
    source = f"{num}"

    # The five candidates have 1..5 decimals, so they are pairwise distinct
    # and at most one of them can coincide with the source text.
    candidates = [f"{num:.{places}f}" for places in range(1, 6)]
    destination = random.choice([text for text in candidates if text != source])
    return source, destination

# Function to generate leading zero issues correctly
def generate_leading_zero_issue():
    """Return an integer zero-padded to eight digits and its unpadded form.

    The integer is drawn from ``random.randint(1, 999999)``, so it has at most
    six digits and the padded form always starts with at least two zeros.

    Returns:
        A ``(source, destination)`` tuple of strings: padded, then plain.
    """
    value = random.randint(1, 999999)
    padded = format(value, "08d")  # pad on the left with zeros to width 8
    return padded, str(value)

# Function to generate thousands separator differences
def generate_thousands_separator_difference():
    """Return an integer with comma thousands separators and without them.

    The integer is drawn from ``random.randint(1000, 9999999)``, so the
    separated form always contains at least one comma.

    Returns:
        A ``(source, destination)`` tuple of strings: separated, then plain.
    """
    value = random.randint(1000, 9999999)
    return format(value, ","), str(value)

# Function to generate negative vs positive numbers
def generate_negative_vs_positive():
    """Return an integer with a negative sign and the same integer without it.

    The magnitude is drawn from ``random.randint(1, 10000)``.

    Returns:
        A ``(source, destination)`` tuple of strings: negative, then positive.
    """
    magnitude = random.randint(1, 10000)
    return str(-magnitude), str(magnitude)

# Function to generate scientific notation differences with a broader range
def generate_scientific_notation_difference():
    """Return a float in default notation and in 4-decimal ``E`` notation.

    The value is drawn from ``random.uniform(1e-8, 1e8)``.

    NOTE(review): the draw is uniform over the interval, not over the
    exponent, so small magnitudes are very unlikely to appear. Confirm this
    is the intended distribution.

    Returns:
        A ``(source, destination)`` tuple of strings: the default string form
        of the float, then the same float formatted with ``.4E``.
    """
    value = random.uniform(1e-8, 1e8)
    exponent_form = format(value, ".4E")  # mantissa with four decimals
    return str(value), exponent_form

# Function to generate currency symbol differences with proper encoding and notation handling
def generate_currency_symbol_difference():
    """Return one amount with the currency symbol as prefix and as suffix.

    The amount is drawn from ``random.uniform(1, 10000)`` and rounded to two
    decimals; the symbol is picked at random from a fixed set of ten.

    Returns:
        A ``(source, destination)`` tuple of strings: ``"<symbol><amount>"``
        and ``"<amount> <symbol>"``, where the amount always shows two
        decimals and carries no thousands separator.
    """
    symbols = ('$', '€', '£', '¥', '₹', '₽', '₩', '₺', '₴', '₦')
    # Draw order (amount first, symbol second) is kept so that a seeded run
    # yields the same rows.
    amount = round(random.uniform(1, 10000), 2)
    symbol = random.choice(symbols)
    amount_text = f"{amount:.2f}"
    return symbol + amount_text, f"{amount_text} {symbol}"

# How many (source, destination) pairs to generate for every category
num_records = 1000

# Category label -> generator function. The insertion order is the generation
# order, and it matters: every generator draws from the same seeded RNG.
category_generators = {
    "Rounded Off Numbers": generate_rounded_off_numbers,
    "Decimal Precision Difference": generate_decimal_precision_difference,
    "Leading Zero Issue": generate_leading_zero_issue,
    "Thousands Separator Difference": generate_thousands_separator_difference,
    "Negative vs Positive": generate_negative_vs_positive,
    "Scientific Notation Difference": generate_scientific_notation_difference,
    "Currency Symbol Difference": generate_currency_symbol_difference,
}

# Generate all records of one category before moving on to the next
data_categories = {}
for category, generator in category_generators.items():
    data_categories[category] = [generator() for _ in range(num_records)]

# Build one labelled frame per category
df_list = []
for category, records in data_categories.items():
    df_temp = pd.DataFrame(records, columns=["Source", "Destination"])
    df_temp["Label"] = category
    df_list.append(df_temp)

# Stack the per-category frames into one frame with a fresh 0..n-1 index
df_number_discrepancies = pd.concat(df_list, ignore_index=True)

# Write the result as UTF-8 so the currency symbols survive the round trip
file_path = "Number_Based_Discrepancies_Fixed.csv"
df_number_discrepancies.to_csv(file_path, index=False, encoding="utf-8")

print(f"Dataset has been saved to {file_path}")


In [None]:
! pip install rapidfuzz

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import ArrayType, DoubleType, StringType
import re
from rapidfuzz import fuzz

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, IntegerType
import re

# Create the Spark session for this analysis, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("NumberDiscrepancyAnalysis")
    .getOrCreate()
)

# Define function to apply different logics for each discrepancy type
def _clean_numeric_text(value):
    """Reduce *value* to the characters that can form a float literal."""
    text = str(value)
    # Drop any e/E that is not an exponent marker (not directly after a
    # digit/point, or not directly before an optionally signed digit), so that
    # letters in words or currency codes cannot break the numeric parse.
    text = re.sub(r"(?<![\d.])[eE]|[eE](?![-+]?\d)", "", text)
    # Keep digits, the decimal point, the minus sign and exponent markers.
    return re.sub(r"[^\d.\-eE]", "", text)


def _decimal_places(cleaned):
    """Return the number of digits after the decimal point of the mantissa."""
    mantissa = re.split(r"[eE]", cleaned, maxsplit=1)[0]
    return len(mantissa.partition(".")[2])


def compute_discrepancy_scores(label, source, destination):
    """Score whether a source/destination pair matches its discrepancy label.

    Both values are stripped down to their numeric text and parsed as floats;
    the check that belongs to ``label`` is then applied.

    Args:
        label: Name of the discrepancy category the pair is claimed to be.
        source: Source value (converted with ``str()`` before cleaning).
        destination: Destination value (converted with ``str()`` as well).

    Returns:
        A list of seven integers, all 0 except possibly the slot of ``label``,
        which is 100 when the pair passes that category's check. Slots:
        0 rounded off, 1 decimal precision, 2 leading zero, 3 thousands
        separator, 4 negative vs positive, 5 scientific notation,
        6 currency symbol. All zeros are returned when either value is
        ``None``, when either value cannot be parsed as a number, or when
        ``label`` is not one of the seven categories.
    """
    scores = [0] * 7

    # Missing values cannot be scored; answer with all zeros instead of
    # relying on an exception from the regex call.
    if source is None or destination is None:
        return scores

    source_clean = _clean_numeric_text(source)
    destination_clean = _clean_numeric_text(destination)

    try:
        source_num = float(source_clean)
        destination_num = float(destination_clean)
    except ValueError:
        return scores  # not numeric after cleaning

    difference = abs(source_num - destination_num)
    magnitude = max(abs(source_num), abs(destination_num))

    if label == "Rounded Off Numbers":
        # Values written with an exponent are left to the scientific-notation
        # category.
        has_exponent = "e" in source_clean.lower() or "e" in destination_clean.lower()
        scores[0] = 100 if not has_exponent and difference < 1 else 0

    elif label == "Decimal Precision Difference":
        # The values must agree to within half a unit of the last decimal of
        # the coarser one. Comparing only the integer parts rejected pairs
        # such as 9.97 / 10.0 and accepted unrelated ones such as 12.1 / 12.9.
        coarse = min(_decimal_places(source_clean), _decimal_places(destination_clean))
        tolerance = 0.5 * 10.0 ** -coarse
        # Small relative slack absorbs binary floating-point error at ties.
        slack = 1e-12 * max(1.0, magnitude)
        scores[1] = 100 if difference <= tolerance + slack else 0

    elif label == "Leading Zero Issue":
        scores[2] = 100 if source_clean.lstrip("0") == destination_clean.lstrip("0") else 0

    elif label == "Thousands Separator Difference":
        # Commas were already removed by the cleaning step.
        scores[3] = 100 if source_clean == destination_clean else 0

    elif label == "Negative vs Positive":
        same_magnitude = abs(source_num) == abs(destination_num)
        scores[4] = 100 if same_magnitude and source_num != destination_num else 0

    elif label == "Scientific Notation Difference":
        # Relative tolerance: an exponent form with a 4-decimal mantissa is
        # off by at most 5e-5 relative, whereas a fixed absolute bound
        # rejects nearly every large value.
        scores[5] = 100 if difference <= 1e-4 * magnitude else 0

    elif label == "Currency Symbol Difference":
        scores[6] = 100 if source_clean == destination_clean else 0

    return scores

# Wrap the scoring function as a Spark UDF that returns an array of integers
discrepancy_udf = udf(compute_discrepancy_scores, ArrayType(IntegerType()))

# Read the generated dataset; the first row holds the column names
file_path = "/content/Number_Based_Discrepancies_Fixed.csv"
df_spark = spark.read.csv(file_path, header=True, inferSchema=True)

# Compute the full score array once per row into a temporary column
scores_column = "Discrepancy Scores"
df_spark = df_spark.withColumn(
    scores_column,
    discrepancy_udf(col("Label"), col("Source"), col("Destination")),
)

# Fan the array out into one named column per category; the position in this
# list is the index of the category inside the score array.
score_column_names = [
    "Rounded Off Score",
    "Decimal Precision Score",
    "Leading Zero Score",
    "Thousands Separator Score",
    "Negative vs Positive Score",
    "Scientific Notation Score",
    "Currency Symbol Score",
]
for position, column_name in enumerate(score_column_names):
    df_spark = df_spark.withColumn(column_name, col(scores_column)[position])

# The temporary array column is no longer needed
df_spark = df_spark.drop(scores_column)

# Collect to pandas and write the scored rows as a UTF-8 CSV file
csv_output_path = "Scored_Number_Based_Discrepancies.csv"
df_spark.toPandas().to_csv(csv_output_path, index=False, encoding="utf-8")

print(f"Scored dataset saved to: {csv_output_path}")
