In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs, regexp_replace, udf
from pyspark.sql.types import StringType

# Obtain (or reuse) the Spark session used by this notebook.
spark = (
    SparkSession.builder
    .appName("NumberCategorization")
    .getOrCreate()
)

# Load the CSV with schema inference off, so every column stays a string and
# the raw number formatting is preserved.
data_path = "/content/Number_Based_Discrepancies_Fixed.csv"  # Replace with actual file path
df = spark.read.csv(data_path, header=True, inferSchema=False)

# Characters removed (together with the comma) before the numeric cast.
currency_symbols = "$€£¥₹₩₽₦₺₨؋৳₴₲₪₵₡₫₭₮₯₰₱₲₳₤₶₸₷₠₣₧₾"  # Extended list of currency symbols

df = (
    df
    # Keep untouched copies of both columns before they are overwritten.
    .withColumn("Original_Source", col("Source"))
    .withColumn("Original_Destination", col("Destination"))
    # Strip currency symbols and commas in place, then cast to double.
    .withColumn("Source", regexp_replace(col("Source"), f"[{currency_symbols},]", "").cast("double"))
    .withColumn("Destination", regexp_replace(col("Destination"), f"[{currency_symbols},]", "").cast("double"))
    # Absolute gap between the two cleaned values.
    .withColumn("Difference", abs(col("Source") - col("Destination")))
)

# Define classification function
def classify_discrepancy(original_source, original_dest, source, dest, diff):
    """Label how two raw string renderings of a number differ.

    Checks run from most specific to least specific, and each one asks whether
    the two strings differ in that particular respect. The generic
    "Decimal Precision Difference" label is the fallback for any remaining
    pair of unequal strings.

    Args:
        original_source: Raw source text, or None for a null cell.
        original_dest: Raw destination text, or None for a null cell.
        source: Cleaned numeric source value (not used; kept so the UDF call
            signature stays unchanged).
        dest: Cleaned numeric destination value (not used; kept so the UDF
            call signature stays unchanged).
        diff: Absolute numeric difference of the cleaned values; may be None
            when the column is null.

    Returns:
        One of "Leading Zero Issue", "Thousands Separator Difference",
        "Scientific Notation Difference", "Currency Symbol Difference",
        "Decimal Precision Difference" or "Other". "Other" is returned when
        either input is None or when the two strings are identical.
    """
    import re  # Ensure regex module is available inside UDF
    import unicodedata

    # Null cells reach a Python UDF as None; there is nothing to compare.
    if original_source is None or original_dest is None:
        return "Other"
    # Identical text has no formatting discrepancy to classify.
    if original_source == original_dest:
        return "Other"

    def strip_zeros(text):
        # An all-zero string collapses to "", so fall back to a single "0".
        return text.lstrip("0") or "0"

    # Leading zeros are the culprit only if removing them makes the strings equal.
    if strip_zeros(original_source) == strip_zeros(original_dest):
        return "Leading Zero Issue"

    # Exactly one side uses a thousands separator.
    if ("," in original_source) != ("," in original_dest):
        return "Thousands Separator Difference"

    # Exactly one side is written in scientific notation. The pattern accepts
    # an optional sign, optional fraction, either exponent case and a signed
    # exponent (e.g. 1E5, 1.5e3, 1.5E-3).
    sci_pattern = re.compile(r"^[+-]?[0-9]+(\.[0-9]+)?[eE][+-]?[0-9]+$")
    source_is_sci = sci_pattern.match(original_source) is not None
    dest_is_sci = sci_pattern.match(original_dest) is not None
    if source_is_sci != dest_is_sci:
        return "Scientific Notation Difference"

    def currency_marks(text):
        # Unicode general category "Sc" is "Symbol, currency".
        return {ch for ch in text if unicodedata.category(ch) == "Sc"}

    # The currency symbols differ (or one side has none) while the cleaned
    # numbers are within 1 of each other. diff may be None, so guard it.
    if (
        currency_marks(original_source) != currency_marks(original_dest)
        and diff is not None
        and diff < 1
    ):
        return "Currency Symbol Difference"

    # NOTE(review): fallback for every remaining pair of unequal strings; it
    # does not verify that the numeric values are actually close.
    return "Decimal Precision Difference"

# Wrap the Python classifier as a Spark UDF that yields a string label.
classify_udf = udf(classify_discrepancy, StringType())

# Columns handed to the classifier, in the order of its parameters.
category_inputs = [
    col("Original_Source"),
    col("Original_Destination"),
    col("Source"),
    col("Destination"),
    col("Difference"),
]

# Attach the label as a new "Category" column.
df = df.withColumn("Category", classify_udf(*category_inputs))

# Preview the classified rows.
df.show()


In [None]:
# Print up to the first 7000 rows of the classified frame.
df.show(n=7000)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, abs, regexp_replace, udf
from pyspark.sql.types import StringType

# Obtain (or reuse) the Spark session used by this notebook.
spark = (
    SparkSession.builder
    .appName("NumberCategorization")
    .getOrCreate()
)

# Load the CSV with schema inference off, so every column stays a string and
# the raw number formatting is preserved.
data_path = "/content/Number_Based_Discrepancies_Fixed.csv"  # Replace with actual file path
df = spark.read.csv(data_path, header=True, inferSchema=False)

# Keep untouched copies of both columns for classification and later cleaning.
df = (
    df
    .withColumn("Original_Source", col("Source"))
    .withColumn("Original_Destination", col("Destination"))
)

# Define classification function
def classify_discrepancy(original_source, original_dest):
    """Label how two raw string renderings of a number differ.

    Checks run from most specific to least specific, and each one asks whether
    the two strings differ in that particular respect. The generic
    "Decimal Precision Difference" label is the fallback for any remaining
    pair of unequal strings.

    Args:
        original_source: Raw source text, or None for a null cell.
        original_dest: Raw destination text, or None for a null cell.

    Returns:
        One of "Leading Zero Issue", "Thousands Separator Difference",
        "Scientific Notation Difference", "Decimal Precision Difference" or
        "Other". "Other" is returned when either input is None or when the
        two strings are identical.
    """
    import re  # Ensure regex module is available inside UDF

    # Null cells reach a Python UDF as None; there is nothing to compare.
    if original_source is None or original_dest is None:
        return "Other"
    # Identical text has no formatting discrepancy to classify.
    if original_source == original_dest:
        return "Other"

    def strip_zeros(text):
        # An all-zero string collapses to "", so fall back to a single "0".
        return text.lstrip('0') or '0'

    # Leading zeros are the culprit only if removing them makes the strings equal.
    if strip_zeros(original_source) == strip_zeros(original_dest):
        return "Leading Zero Issue"

    # Exactly one side uses a thousands separator.
    if ("," in original_source) != ("," in original_dest):
        return "Thousands Separator Difference"

    # Exactly one side is written in scientific notation. The pattern accepts
    # an optional sign, optional fraction, either exponent case and a signed
    # exponent (e.g. 1E5, 1.5e3, 1.5E-3).
    sci_pattern = re.compile(r"^[+-]?[0-9]+(\.[0-9]+)?[eE][+-]?[0-9]+$")
    source_is_sci = sci_pattern.match(original_source) is not None
    dest_is_sci = sci_pattern.match(original_dest) is not None
    if source_is_sci != dest_is_sci:
        return "Scientific Notation Difference"

    # Add additional checks as necessary

    # NOTE(review): fallback for every remaining pair of unequal strings; it
    # does not verify that the numeric values are actually close.
    return "Decimal Precision Difference"

# Wrap the Python classifier as a Spark UDF that yields a string label.
classify_udf = udf(classify_discrepancy, StringType())

# Characters removed (together with the comma) before the numeric cast.
currency_symbols = "$€£¥₹₩₽₦₺₨؋৳₴₲₪₵₡₫₭₮₯₰₱₲₳₤₶₸₷₠₣₧₾"  # Extended list of currency symbols

df = (
    df
    # Classify from the raw strings first, before any numeric conversion.
    .withColumn("Category", classify_udf(col("Original_Source"), col("Original_Destination")))
    # Rebuild the numeric columns from the preserved raw text.
    .withColumn("Source", regexp_replace(col("Original_Source"), f"[{currency_symbols},]", "").cast("double"))
    .withColumn("Destination", regexp_replace(col("Original_Destination"), f"[{currency_symbols},]", "").cast("double"))
    # Absolute gap between the two cleaned values.
    .withColumn("Difference", abs(col("Source") - col("Destination")))
)

# Preview the result.
df.show()


In [None]:
# Print up to the first 7000 rows of the classified frame.
df.show(n=7000)

In [None]:
def leading_zero_check(value1, value2):
    """Return True when two strings are equal once leading zeros are ignored.

    Args:
        value1: First value as a string.
        value2: Second value as a string.

    Returns:
        bool: True if both strings are identical after their leading "0"
        characters are removed, otherwise False.
    """
    # An all-zero string strips down to "", so fall back to a single "0".
    stripped_value1 = value1.lstrip('0') or '0'
    stripped_value2 = value2.lstrip('0') or '0'
    return stripped_value1 == stripped_value2

# Example usage:
value1 = "123"
value2 = "123"
are_equal = leading_zero_check(value1, value2)
# True when stripping leading zeros (keeping a lone "0") changes either input.
had_leading_zeros = any(
    (text.lstrip("0") or "0") != text for text in (value1, value2)
)
print("Are equal after removing leading zeros:", are_equal)
print("Had leading zeros:", had_leading_zeros)


In [None]:
# Define the function as provided
def negative_check(value1, value2):
    """Report whether two values have the same magnitude, ignoring sign.

    Args:
        value1: First value; anything `float()` accepts (typically a string).
        value2: Second value; anything `float()` accepts (typically a string).

    Returns:
        bool: True when both inputs convert to floats with equal absolute
        values. False when they differ or when either input cannot be
        converted (non-numeric text, empty string, None).
    """
    # Use math.fabs rather than the bare name `abs`: this notebook also runs
    # `from pyspark.sql.functions import abs`, which rebinds `abs` at notebook
    # scope to the Spark column function.
    import math

    try:
        num1 = float(value1)
        num2 = float(value2)
    except (TypeError, ValueError):
        # ValueError: non-numeric text. TypeError: None or another type that
        # float() does not accept.
        return False
    return math.fabs(num1) == math.fabs(num2)

# Test cases to validate the function
# Each entry is a (value1, value2) pair of strings. The trailing comment gives
# the expected result, but nothing below asserts it; results are only printed.
test_cases = [
    ("-123", "123"),  # True: both numbers are the same magnitude, opposite signs
    ("456", "-456"),  # True: both numbers are the same magnitude, opposite signs
    ("789", "789"),   # True: both numbers are the same, same sign
    ("123.5", "-123.5"),  # True: same magnitude, opposite signs
    ("100", "200"),   # False: different magnitudes
    ("abc", "123"),   # False: non-numeric input
    ("-100", "-100"), # True: same numbers, same negative sign
    ("", ""),         # False: empty strings
    ("0", "-0"),      # True: zero is the same regardless of sign
    ("0.00001", "-0.00001")  # True: very small numbers, opposite signs
]

# Running the test cases
# NOTE: `value1`, `value2` and `result` are notebook-level names. After the
# loop they keep the values from the last pair, replacing any earlier bindings.
for value1, value2 in test_cases:
    result = negative_check(value1, value2)
    print(f"negative_check({value1}, {value2}) = {result}")



In [None]:
# Scratch check: convert a sign-flipped pair to floats and take the magnitude
# of the first one. The conversion reads this cell's own inputs, not
# `value1`/`value2` left over from another cell.
num1, num2 = "-123", "123"
num1 = float(num1)
num2 = float(num2)
# NOTE: `abs` is whatever the kernel currently binds. If the cell containing
# `from pyspark.sql.functions import abs` has run, this is the Spark column
# function rather than the builtin.
abs(num1)

In [None]:
# Quick check of abs() on a negative integer literal; the value is the cell output.
abs(-123)

In [None]:
# Sample negative integer used to demonstrate abs().
var = -94
magnitude = abs(var)
print('Absolute value of integer is:', magnitude)
