In [None]:
import pandas as pd
import random
import numpy as np

# Seed both random number generators so repeated runs produce the same dataset.
# The generator functions in this cell draw only from the stdlib `random` module;
# the NumPy seed only matters if NumPy-based randomness is used as well.
random.seed(42)
np.random.seed(42)

def generate_rounded_off_numbers():
    """Return a (Source, Destination) pair for the "rounded off" category.

    Source is a random amount in [1, 10000] rounded to two decimals; Destination
    is that same amount rounded to the nearest integer. Both are returned as text.
    """
    amount = round(random.uniform(1, 10000), 2)
    return str(amount), str(round(amount))

def generate_decimal_precision_difference():
    """Return a (Source, Destination) pair that differs only in decimal precision.

    Source is a random value in [1, 10000] rounded to 1-5 decimals; Destination is
    the same value re-formatted with an independently chosen 1-5 decimals.

    The two precisions are drawn independently, so they can coincide and yield
    identical texts. Such a draw carries no discrepancy at all, so it is discarded
    and the pair is sampled again until Source and Destination actually differ.

    Returns:
        tuple[str, str]: (source_text, destination_text), never equal.
    """
    while True:
        num = round(random.uniform(1, 10000), random.randint(1, 5))
        source_text = f"{num}"
        destination_text = f"{num:.{random.randint(1, 5)}f}"
        if source_text != destination_text:
            return source_text, destination_text

def generate_leading_zero_issue():
    """Return a (Source, Destination) pair that differs only by leading zeros.

    Source is a random integer in [1, 999999] left-padded with zeros to eight
    characters; Destination is the same integer without padding.
    """
    plain = str(random.randint(1, 999999))
    return plain.zfill(8), plain

def generate_thousands_separator_difference():
    """Return a (Source, Destination) pair that differs only by thousands separators.

    Source is a random integer in [1000, 9999999] written with comma grouping;
    Destination is the same integer written without grouping.
    """
    value = random.randint(1000, 9999999)
    grouped = format(value, ",")
    return grouped, f"{value}"

def generate_negative_vs_positive():
    """Return a (Source, Destination) pair with equal magnitude and opposite sign.

    A random integer in [1, 10000] is drawn; Source is its negation and
    Destination is the positive value, both as text.
    """
    magnitude = random.randint(1, 10000)
    return str(-magnitude), str(magnitude)

def generate_scientific_notation_difference():
    """Return a (Source, Destination) pair: plain float text vs. scientific notation.

    A float is drawn uniformly from [1e-8, 1e8]. Source is its default string
    form; Destination is the same value in uppercase exponent notation with a
    four-decimal mantissa (e.g. ``1.2346E+07``).
    """
    value = random.uniform(1e-8, 1e8)
    return str(value), format(value, ".4E")

def generate_currency_symbol_difference():
    """Return a (Source, Destination) pair that differs only in symbol placement.

    A random amount in [1, 10000] with two decimals and a random currency symbol
    are drawn. Source puts the symbol directly in front of the amount;
    Destination puts it after the amount, separated by a space. Neither text
    contains a thousands separator.
    """
    currency_symbols = ['$', '€', '£', '¥', '₹', '₽', '₩', '₺', '₴', '₦']
    amount = round(random.uniform(1, 10000), 2)
    chosen = random.choice(currency_symbols)
    amount_text = f"{amount:.2f}"
    return f"{chosen}{amount_text}", f"{amount_text} {chosen}"

# Rows to synthesize for every discrepancy category
num_records = 1000

# Build the (Source, Destination) pairs category by category. The order of the
# entries below fixes both the order of the random draws and the row order of
# the exported file.
data_categories = {
    label: [make_pair() for _ in range(num_records)]
    for label, make_pair in (
        ("Rounded Off Numbers", generate_rounded_off_numbers),
        ("Decimal Precision Difference", generate_decimal_precision_difference),
        ("Leading Zero Issue", generate_leading_zero_issue),
        ("Thousands Separator Difference", generate_thousands_separator_difference),
        ("Negative vs Positive", generate_negative_vs_positive),
        ("Scientific Notation Difference", generate_scientific_notation_difference),
        ("Currency Symbol Difference", generate_currency_symbol_difference),
    )
}

# Turn each category into a frame with Source / Destination / Label columns
df_list = []
for category, records in data_categories.items():
    df_temp = pd.DataFrame(records, columns=["Source", "Destination"])
    df_temp["Label"] = category
    df_list.append(df_temp)

# Stack the per-category frames under one fresh 0..N-1 index
df_number_discrepancies = pd.concat(df_list, ignore_index=True)

# Write the dataset as UTF-8 so the non-ASCII currency symbols are encoded explicitly
file_path = "Number_Based_Discrepancies_Fixed.csv"
df_number_discrepancies.to_csv(file_path, encoding="utf-8", index=False)

print(f"Dataset has been saved to {file_path}")


In [None]:
! pip install rapidfuzz

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import ArrayType, DoubleType, StringType
import re
from rapidfuzz import fuzz

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, IntegerType
import re

# Initialize Spark Session
# getOrCreate() hands back the already-active session when one exists, so the
# application name is only guaranteed to apply to a freshly created session.
spark = SparkSession.builder.appName("NumberDiscrepancyAnalysis").getOrCreate()

def compute_discrepancy_scores(label, source, destination):
    """
    Score one (source, destination) pair against the rule for its labelled category.

    Args:
        label: Discrepancy category name, e.g. "Rounded Off Numbers".
        source: Source value as text (other types are converted with ``str``).
        destination: Destination value as text (converted the same way).

    Returns:
        A list of seven ints in the fixed order
        [rounded off, decimal precision, leading zero, thousands separator,
        negative vs positive, scientific notation, currency symbol].
        Only the slot that belongs to ``label`` can be 100; every other slot is 0.
        All seven are 0 when a value is missing, is not parseable as a number,
        is not finite, or when ``label`` is not one of the known categories.
    """
    import math  # local import: the surrounding cell only imports `re`

    zero_scores = [0] * 7
    if source is None or destination is None:
        return zero_scores

    # Keep digits, sign, decimal point and the exponent marker in either case
    # (Python's own float formatting writes a lowercase "e").
    source_clean = re.sub(r"[^\d.\-eE]", "", str(source))
    destination_clean = re.sub(r"[^\d.\-eE]", "", str(destination))

    try:
        source_num = float(source_clean)
        destination_num = float(destination_clean)
    except ValueError:
        return zero_scores  # Not a number after cleaning
    if not (math.isfinite(source_num) and math.isfinite(destination_num)):
        return zero_scores  # Overflowed to infinity; nothing meaningful to compare

    scores = [0] * 7

    if label == "Rounded Off Numbers":
        # Exponent notation is excluded from this rule
        has_exponent = "E" in source_clean.upper() or "E" in destination_clean.upper()
        if not has_exponent and abs(source_num - destination_num) < 1:
            scores[0] = 100

    elif label == "Decimal Precision Difference":
        # Same integer part once both values are rounded to ten decimals
        source_whole = int(float(f"{source_num:.10f}"))
        destination_whole = int(float(f"{destination_num:.10f}"))
        if source_whole == destination_whole:
            scores[1] = 100

    elif label == "Leading Zero Issue":
        if source_clean.lstrip("0") == destination_clean.lstrip("0"):
            scores[2] = 100

    elif label == "Thousands Separator Difference":
        # The cleaning step has already removed the separators
        if source_clean == destination_clean:
            scores[3] = 100

    elif label == "Negative vs Positive":
        if abs(source_num) == abs(destination_num) and source_num != destination_num:
            scores[4] = 100

    elif label == "Scientific Notation Difference":
        # A four-decimal mantissa carries a relative rounding error of at most
        # 5e-5, so the comparison has to be relative: an absolute bound rejects
        # large values and accepts unrelated small ones.
        if math.isclose(source_num, destination_num, rel_tol=1e-4):
            scores[5] = 100

    elif label == "Currency Symbol Difference":
        if source_clean == destination_clean:
            scores[6] = 100

    return scores

# Wrap the scoring function as a Spark UDF that yields an array of integers
discrepancy_udf = udf(compute_discrepancy_scores, ArrayType(IntegerType()))

# Read the generated pairs; the first row supplies the column names
file_path = "/content/Number_Based_Discrepancies_Fixed.csv"
df_spark = spark.read.csv(file_path, header=True, inferSchema=True)

# Score every row according to its own label
df_spark = df_spark.withColumn(
    "Discrepancy Scores",
    discrepancy_udf(col("Label"), col("Source"), col("Destination")),
)

# Fan the score array out into one named column per category, then drop the array
score_column_names = (
    "Rounded Off Score",
    "Decimal Precision Score",
    "Leading Zero Score",
    "Thousands Separator Score",
    "Negative vs Positive Score",
    "Scientific Notation Score",
    "Currency Symbol Score",
)
for slot, score_column in enumerate(score_column_names):
    df_spark = df_spark.withColumn(score_column, col("Discrepancy Scores")[slot])
df_spark = df_spark.drop("Discrepancy Scores")

# Collect to pandas on the driver and write the scored rows as UTF-8 CSV
csv_output_path = "Scored_Number_Based_Discrepancies.csv"
df_spark.toPandas().to_csv(csv_output_path, encoding="utf-8", index=False)

print(f"Scored dataset saved to: {csv_output_path}")


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, IntegerType
import re
import math
from decimal import Decimal

# Initialize Spark Session
# getOrCreate() hands back the already-active session when one exists, so the
# application name is only guaranteed to apply to a freshly created session.
spark = SparkSession.builder.appName("AutomaticNumberDiscrepancyAnalysis").getOrCreate()

def _count_decimal_places(numeric_text):
    """Return how many fractional digits ``numeric_text`` has once trailing zeros are dropped."""
    exponent = Decimal(numeric_text).normalize().as_tuple().exponent
    return max(0, -exponent)


def detect_number_discrepancies(source, destination):
    """
    Score a (source, destination) pair against every discrepancy rule at once.

    Unlike a label-driven check, each of the seven rules is evaluated
    independently, so several slots can be non-zero for the same pair.

    Args:
        source: Source value as text (other types are converted with ``str``).
        destination: Destination value as text (converted the same way).

    Returns:
        A list of seven ints in the fixed order
        [rounded off, decimal precision, leading zero, thousands separator,
        negative vs positive, scientific notation, currency symbol].
        The decimal-precision slot ranges over 0..100 (10 points are deducted per
        differing decimal place); every other slot is 0 or 100. All seven are 0
        when a value is missing, not parseable as a number, or not finite.
    """
    if source is None or destination is None:
        return [0] * 7

    try:
        # Keep digits, sign, decimal point and the exponent marker in either case
        source_clean = re.sub(r"[^\d.\-eE]", "", str(source))
        destination_clean = re.sub(r"[^\d.\-eE]", "", str(destination))
        source_num = float(source_clean)
        destination_num = float(destination_clean)
        if not (math.isfinite(source_num) and math.isfinite(destination_num)):
            return [0] * 7
        # Count decimals from the text itself. Going through the float would
        # expose its binary expansion (dozens of digits) instead of what was written.
        source_places = _count_decimal_places(source_clean)
        destination_places = _count_decimal_places(destination_clean)
    except (ValueError, ArithmeticError):
        return [0] * 7  # Not a number after cleaning

    scores = [0] * 7
    has_exponent = "E" in source_clean.upper() or "E" in destination_clean.upper()

    # 1. Rounded Off Numbers (exponent notation is excluded from this rule)
    if not has_exponent and abs(source_num - destination_num) < 1:
        scores[0] = 100

    # 2. Decimal Precision Difference: same integer part, minus 10 points per
    #    differing decimal place when both sides have a fractional part
    if int(source_num) == int(destination_num):
        if source_places and destination_places:
            precision_difference = abs(source_places - destination_places)
        else:
            precision_difference = 0
        scores[1] = max(0, 100 - precision_difference * 10)

    # 3. Leading Zero Issue
    if source_clean.lstrip("0") == destination_clean.lstrip("0"):
        scores[2] = 100

    # 4. Thousands Separator Difference (separators were removed by the cleaning step)
    if source_clean == destination_clean:
        scores[3] = 100

    # 5. Negative vs Positive (zero has no sign to flip)
    if source_num != 0 and destination_num != 0:
        if math.isclose(abs(source_num), abs(destination_num), rel_tol=1e-9) and source_num != destination_num:
            scores[4] = 100

    # 6. Scientific Notation Difference: a four-decimal mantissa carries a
    #    relative rounding error of at most 5e-5, so the tolerance must be relative
    if math.isclose(source_num, destination_num, rel_tol=1e-4):
        scores[5] = 100

    # 7. Currency Symbol Difference (numeric text identical once symbols are removed)
    if source_clean == destination_clean:
        scores[6] = 100

    return scores

# Wrap the label-free detector as a Spark UDF that yields an array of integers
discrepancy_udf = udf(detect_number_discrepancies, ArrayType(IntegerType()))

# Read the generated pairs; the first row supplies the column names
file_path = "/content/Number_Based_Discrepancies_Fixed.csv"
df_spark = spark.read.csv(file_path, header=True, inferSchema=True)

# Score every row from its Source / Destination values alone
df_spark = df_spark.withColumn(
    "Discrepancy Scores",
    discrepancy_udf(col("Source"), col("Destination")),
)

# Fan the score array out into one named column per category, then drop the array
score_column_names = (
    "Rounded Off Score",
    "Decimal Precision Score",
    "Leading Zero Score",
    "Thousands Separator Score",
    "Negative vs Positive Score",
    "Scientific Notation Score",
    "Currency Symbol Score",
)
for slot, score_column in enumerate(score_column_names):
    df_spark = df_spark.withColumn(score_column, col("Discrepancy Scores")[slot])
df_spark = df_spark.drop("Discrepancy Scores")

# Collect to pandas on the driver and write the scored rows as UTF-8 CSV
csv_output_path = "Scored_Number_Based_Discrepancies.csv"
df_spark.toPandas().to_csv(csv_output_path, encoding="utf-8", index=False)

print(f"Scored dataset saved to: {csv_output_path}")


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
import re
import math
from decimal import Decimal
# Initialize Spark Session
# getOrCreate() hands back the already-active session when one exists, so the
# application name is only guaranteed to apply to a freshly created session.
spark = SparkSession.builder.appName("AutomaticNumberDiscrepancyAnalysis").getOrCreate()
def _strip_separators(text):
    """Return ``text`` without comma and space grouping characters."""
    return text.replace(",", "").replace(" ", "")


def detect_primary_discrepancy(source, destination):
    """
    Detects the most relevant discrepancy and assigns a score to only one category.

    The rules are tried from the most specific to the most general. Any pair of
    numerically equal values satisfies the broad "difference below 1" rule, so
    that rule must come after the rules that explain *why* two equal numbers are
    written differently; otherwise those rules can never fire.

    Priority order:
        1. Negative vs Positive   - same magnitude, different value
        2. Leading Zero           - numeric text differs only by leading zeros
        3. Thousands Separator    - raw text differs only by commas/spaces
        4. Currency Symbol        - same numeric text, other symbols differ
        5. Scientific Notation    - an exponent is present and values agree
                                    within the rounding of a 4-decimal mantissa
        6. Rounded Off            - no exponent, one side has no decimal point,
                                    difference below 1
        7. Decimal Precision      - same integer part

    Args:
        source: Source value as text (other types are converted with ``str``).
        destination: Destination value as text (converted the same way).

    Returns:
        A list of seven ints in the fixed order
        [rounded off, decimal precision, leading zero, thousands separator,
        negative vs positive, scientific notation, currency symbol]
        with at most one slot set to 100. All seven are 0 when a value is
        missing, not parseable as a number, not finite, or matches no rule.
    """
    if source is None or destination is None:
        return [0] * 7
    source = str(source)
    destination = str(destination)

    # Keep digits, sign, decimal point and the exponent marker in either case
    source_clean = re.sub(r"[^\d.\-eE]", "", source)
    destination_clean = re.sub(r"[^\d.\-eE]", "", destination)
    try:
        source_num = float(source_clean)
        destination_num = float(destination_clean)
    except ValueError:
        return [0] * 7  # Not a number after cleaning
    if not (math.isfinite(source_num) and math.isfinite(destination_num)):
        return [0] * 7

    scores = [0] * 7
    has_exponent = "E" in source_clean.upper() or "E" in destination_clean.upper()
    same_numeric_text = source_clean == destination_clean

    if (
        source_num != 0
        and destination_num != 0
        and source_num != destination_num
        and math.isclose(abs(source_num), abs(destination_num), rel_tol=1e-9)
    ):
        scores[4] = 100  # Negative vs Positive
    elif not same_numeric_text and source_clean.lstrip("0") == destination_clean.lstrip("0"):
        scores[2] = 100  # Leading Zero Issue
    elif same_numeric_text and source != destination:
        # Same number, different raw text: either grouping characters or some
        # other symbol (e.g. a currency sign) accounts for the difference.
        if _strip_separators(source) == _strip_separators(destination):
            scores[3] = 100  # Thousands Separator Difference
        else:
            scores[6] = 100  # Currency Symbol Difference
    elif has_exponent and math.isclose(source_num, destination_num, rel_tol=1e-4):
        scores[5] = 100  # Scientific Notation Difference
    elif (
        not has_exponent
        and abs(source_num - destination_num) < 1
        and ("." not in source_clean or "." not in destination_clean)
    ):
        scores[0] = 100  # Rounded Off Numbers
    elif int(source_num) == int(destination_num):
        scores[1] = 100  # Decimal Precision Difference
    return scores
# Wrap the single-category detector as a Spark UDF that yields an array of integers
discrepancy_udf = udf(detect_primary_discrepancy, ArrayType(IntegerType()))
# Read the generated pairs; the first row supplies the column names
file_path = "/content/Number_Based_Discrepancies_Fixed.csv"
df_spark = spark.read.csv(file_path, header=True, inferSchema=True)
# Score every row from its Source / Destination values alone
df_spark = df_spark.withColumn(
    "Discrepancy Scores",
    discrepancy_udf(col("Source"), col("Destination")),
)
# Fan the score array out into one named column per category, then drop the array
score_column_names = (
    "Rounded Off Score",
    "Decimal Precision Score",
    "Leading Zero Score",
    "Thousands Separator Score",
    "Negative vs Positive Score",
    "Scientific Notation Score",
    "Currency Symbol Score",
)
for slot, score_column in enumerate(score_column_names):
    df_spark = df_spark.withColumn(score_column, col("Discrepancy Scores")[slot])
df_spark = df_spark.drop("Discrepancy Scores")
# Collect to pandas on the driver and write the scored rows as UTF-8 CSV
csv_output_path = "Scored_Number_Based_Discrepancies.csv"
df_spark.toPandas().to_csv(csv_output_path, encoding="utf-8", index=False)
print(f"Scored dataset saved to: {csv_output_path}")


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Start Spark Session (getOrCreate returns the active session if one already exists)
spark = SparkSession.builder.appName("MixedDataClassification").getOrCreate()

# Load Data: the header row supplies the column names and Spark infers the column types
df = spark.read.csv("/content/Number_Based_Discrepancies_Fixed.csv", header=True, inferSchema=True)


In [None]:

# Identify Categorical & Numeric Features
# "Label" is the prediction target: it is indexed separately into the `label`
# column further down. It must not be treated as a feature here, otherwise its
# indexed copy ends up inside the feature vector and the model is trained on
# the very value it is supposed to predict.
target_column = "Label"
categorical_columns = [
    name for name, dtype in df.dtypes
    if dtype == 'string' and name != target_column
]
numeric_columns = [
    name for name, dtype in df.dtypes
    if dtype in ['int', 'double'] and name != target_column
]

# Encode Categorical Features: each string column gets a "<name>_indexed" companion
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_indexed").fit(df)
    for name in categorical_columns
]
for indexer in indexers:
    df = indexer.transform(df)


In [None]:
# Collect the indexed categorical columns followed by the numeric columns
# into a single vector column named "features"
feature_columns = [f"{name}_indexed" for name in categorical_columns]
feature_columns.extend(numeric_columns)
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
df = assembler.transform(df)

In [None]:
# Print a preview of the DataFrame to inspect the indexed and assembled columns
df.show()

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Encode Target Variable: map the string "Label" column to a numeric "label" column
label_indexer = StringIndexer(inputCol="Label", outputCol="label").fit(df)
df = label_indexer.transform(df)

# Train-Test Split: 80% / 20% with a fixed seed
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Train Model (Decision Tree; the estimator used here is DecisionTreeClassifier, not a random forest)
# NOTE(review): maxBins=7000 presumably has to cover the number of distinct values of the
# indexed string features (7 categories x 1000 generated rows) - confirm against the data.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxBins=7000)
model = dt.fit(train_df)

# Predict on the held-out rows
predictions = model.transform(test_df)

# Evaluate: fraction of held-out rows whose predicted class matches "label"
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)


In [None]:
pip install pytorch_tabnet

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
import numpy as np
from pyspark.sql import SparkSession

# Start PySpark Session (getOrCreate returns the active session if one already exists)
spark = SparkSession.builder.appName("TabNetRawData").getOrCreate()

# Load Dataset (without conversion)
df = spark.read.csv("/content/Number_Based_Discrepancies_Fixed.csv", header=True, inferSchema=True)

# Convert Spark DataFrame to Pandas (TabNet doesn't support Spark directly)
df_pd = df.toPandas()

# Separate features (every column except 'Label') from the target column
X = df_pd.drop(columns=['Label'])
y = df_pd['Label']

# Train TabNet Model (directly on raw data)
# The evaluation set passed here is the training data itself, so the metric
# reported during fitting is a training metric.
# NOTE(review): X is passed through unchanged; if its columns hold text rather than
# numbers, confirm that TabNetClassifier.fit accepts such input.
tabnet = TabNetClassifier()
tabnet.fit(X.to_numpy(), y.to_numpy(), eval_set=[(X.to_numpy(), y.to_numpy())], max_epochs=10)

# Predict and Evaluate (again on the rows the model was fitted on)
y_pred = tabnet.predict(X.to_numpy())
accuracy = (y_pred == y).mean()

print("Accuracy:", accuracy)


In [None]:
from transformers import FTTransformer
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load dataset (without preprocessing)
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# Separate features and target
X = df.drop(columns=['Label'])  # Keeping raw 'Source' & 'Destination'
y = df['Label']

# Encode target (required for deep learning models)
# NOTE(review): y_encoded only feeds the split below; fit/predict further down use the raw `y`.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-Test Split
# NOTE(review): the four split outputs are not used by the fit/predict calls below.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Build the model: d_numerical is the total number of feature columns and
# categories lists the number of distinct values of each text (object) column
# NOTE(review): confirm that the installed `transformers` package exports FTTransformer
# and that it offers the constructor arguments and fit/predict methods used here.
model = FTTransformer(d_numerical=X.shape[1], categories=[X[col].nunique() for col in X.select_dtypes('object').columns])

# Train directly on raw data (all rows, unencoded labels)
model.fit(X.to_numpy(), y.to_numpy())

# Predict & Evaluate on the same rows the model was fitted on
y_pred = model.predict(X.to_numpy())
accuracy = accuracy_score(y, y_pred)

print("FT-Transformer Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load dataset (no preprocessing)
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# Separate features and target
X = df.drop(columns=['Label'])  # Keeping raw 'Source' & 'Destination'
y = df['Label']

# Encode categorical target: label strings become integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-Test Split: 20% of the rows are held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Convert to Tensor
# NOTE(review): X_train / X_test still hold the raw Source / Destination columns. If pandas
# loaded them as text (object dtype), these float32 conversions will raise - confirm the
# column dtypes and encode the text numerically first if needed.
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
# Class ids must be int64 ("long") for CrossEntropyLoss targets
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

class MLP(nn.Module):
    """Fully connected classifier: input_dim -> 128 -> 64 -> output_dim.

    ReLU follows the first two linear layers; the last layer returns raw
    (unnormalized) class scores.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return class scores for a batch of input_dim-wide float rows."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Size the network from the data: one input per feature column, one output per class
input_dim = len(X_train.columns)
output_dim = len(label_encoder.classes_)
model = MLP(input_dim, output_dim)

# Cross-entropy on the raw class scores, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: every epoch is one gradient step over the whole training set
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict the held-out rows without tracking gradients; the highest score wins
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = y_pred_tensor.argmax(dim=1).numpy()

# Share of held-out rows classified correctly
accuracy = accuracy_score(y_test, y_pred)
print("Neural Network Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Read the generated pairs exactly as they were written (no cleaning step)
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# 'Label' is the target; everything else (raw 'Source' & 'Destination') is a feature
y = df['Label']
X = df.drop(columns=['Label'])

# Map every label string to an integer class id
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Convert categorical data inside PyTorch model (instead of preprocessing manually)
class MixedDataMLP(nn.Module):
    """Classifier over rows of integer bucket indices.

    Every row of indices is looked up in a 10000 x 64 embedding table and the
    row's embeddings are averaged (EmbeddingBag, mode="mean"), giving one
    64-wide vector per row. That vector passes through 64 -> 128 -> 64 ->
    output_dim linear layers with ReLU after the first two.

    Args:
        input_dim: Number of index columns per row. Kept for call compatibility;
            the averaged embedding is always 64 wide, so the layer sizes do not
            depend on it.
        output_dim: Number of classes.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag(10000, 64, mode="mean")  # Handles categorical text
        # The first linear layer consumes the pooled embedding (64 wide), not the
        # raw index columns; sizing it with input_dim makes forward() fail.
        self.fc1 = nn.Linear(64, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return class scores for a batch of index rows; indices must be in [0, 10000)."""
        x = self.embedding(x.long())  # One averaged 64-wide embedding per row
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

# Convert Data to Tensors (Handle Text and Numbers Inside Model)
import zlib

# Must equal the number of rows of the model's embedding table (10000)
embedding_rows = 10000


def _bucket_index(text):
    """Map a string to a fixed index in [0, embedding_rows).

    CRC-32 is used instead of the built-in hash(): hash() of a string can be
    negative or far larger than the embedding table, and it is salted per
    interpreter process, so the same text would get different indices in
    different sessions.
    """
    return zlib.crc32(text.encode("utf-8")) % embedding_rows


# Series.map is applied column by column so this works across pandas versions
X_train_tensor = torch.tensor(
    X_train.astype(str).apply(lambda column: column.map(_bucket_index)).to_numpy(),
    dtype=torch.long,
)
X_test_tensor = torch.tensor(
    X_test.astype(str).apply(lambda column: column.map(_bucket_index)).to_numpy(),
    dtype=torch.long,
)
# Class ids must be int64 ("long") for CrossEntropyLoss targets
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Size the network from the data: feature-column count and number of classes
input_dim = len(X_train.columns)
output_dim = len(label_encoder.classes_)
model = MixedDataMLP(input_dim, output_dim)

# Cross-entropy on the raw class scores, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: every epoch is one gradient step over the whole training set
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict the held-out rows without tracking gradients; the highest score wins
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = y_pred_tensor.argmax(dim=1).numpy()

# Share of held-out rows classified correctly
accuracy = accuracy_score(y_test, y_pred)
print("Neural Network Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Read the generated pairs exactly as they were written (no cleaning step)
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# 'Label' is the target; everything else (raw 'Source' & 'Destination') is a feature
y = df['Label']
X = df.drop(columns=['Label'])

# Map every label string to an integer class id
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Define Hashing Function (Keeps Values Within Embedding Range)
import zlib

num_buckets = 10000  # Ensure hash values fit within embedding layer
def safe_hash(value):
    """Map a value to a stable bucket index in [0, num_buckets).

    The value's text form is hashed with CRC-32. The built-in hash() is salted
    per interpreter process for strings, so it would assign different buckets
    to the same text in different sessions and make results unrepeatable.
    """
    return zlib.crc32(str(value).encode("utf-8")) % num_buckets

# Replace every cell's text by its bucket index and pack the result as int64 tensors
X_train_tensor = torch.tensor(
    X_train.astype(str).map(safe_hash).to_numpy(),
    dtype=torch.long,
)
X_test_tensor = torch.tensor(
    X_test.astype(str).map(safe_hash).to_numpy(),
    dtype=torch.long,
)
# Class ids as int64 targets for CrossEntropyLoss
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

class MixedDataMLP(nn.Module):
    """Classifier over rows of hashed bucket indices.

    Each row's indices are looked up in a ``num_buckets`` x 64 embedding table
    and averaged into one 64-wide vector (EmbeddingBag, mode="mean"), which then
    runs through 64 -> 128 -> 64 -> output_dim linear layers with ReLU after the
    first two. ``input_dim`` is accepted but not used to size any layer.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag(num_buckets, 64, mode="mean")
        self.fc1 = nn.Linear(64, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return class scores for a batch of index rows."""
        pooled = self.embedding(x.long())
        hidden = self.relu(self.fc1(pooled))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Size the network from the data: feature-column count and number of classes
input_dim = len(X_train.columns)
output_dim = len(label_encoder.classes_)
model = MixedDataMLP(input_dim, output_dim)

# Cross-entropy on the raw class scores, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: every epoch is one gradient step over the whole training set
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict the held-out rows without tracking gradients; the highest score wins
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = y_pred_tensor.argmax(dim=1).numpy()

# Share of held-out rows classified correctly
accuracy = accuracy_score(y_test, y_pred)
print("Neural Network Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Read the generated pairs exactly as they were written (no cleaning step)
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# 'Label' is the target; everything else (raw 'Source' & 'Destination') is a feature
y = df['Label']
X = df.drop(columns=['Label'])

# Map every label string to an integer class id
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Define Hashing Function (Keeps Values Within Embedding Range)
import zlib

num_buckets = 50000  # Increased bucket size for more unique hashed values
def safe_hash(value):
    """Map a value to a stable bucket index in [0, num_buckets).

    The value's text form is hashed with CRC-32. The built-in hash() is salted
    per interpreter process for strings, so it would assign different buckets
    to the same text in different sessions and make results unrepeatable.
    """
    return zlib.crc32(str(value).encode("utf-8")) % num_buckets  # Keep values in range [0, num_buckets)

# Replace every cell's text by its bucket index and pack the result as int64 tensors
X_train_tensor = torch.tensor(
    X_train.astype(str).map(safe_hash).to_numpy(),
    dtype=torch.long,
)
X_test_tensor = torch.tensor(
    X_test.astype(str).map(safe_hash).to_numpy(),
    dtype=torch.long,
)
# Class ids as int64 targets for CrossEntropyLoss
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

class ImprovedMLP(nn.Module):
    """Deeper classifier over rows of hashed bucket indices.

    Each row's indices are looked up in a ``num_buckets`` x 128 embedding table
    and averaged into one 128-wide vector (EmbeddingBag, mode="mean"). The
    vector runs through 128 -> 256 -> 128 -> 64 -> output_dim linear layers with
    ReLU after the first three; dropout (p=0.3) follows the first two
    activations. ``input_dim`` is accepted but not used to size any layer.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag(num_buckets, 128, mode="mean")
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return class scores for a batch of index rows."""
        pooled = self.embedding(x.long())
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        hidden = self.relu(self.fc3(hidden))
        return self.fc4(hidden)

# Initialize Model
input_dim = X_train.shape[1]
output_dim = len(label_encoder.classes_)
model = ImprovedMLP(input_dim, output_dim)

# Define Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Lower learning rate for stability

# Train Model (full batch: every epoch is one gradient step over the training set)
epochs = 200  # More epochs for better learning
model.train()  # Dropout is active while fitting
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:  # Print loss every 10 epochs
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict
# Switch to evaluation mode first: without it dropout keeps zeroing random
# activations during inference, so predictions are noisy and not repeatable.
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = torch.argmax(y_pred_tensor, axis=1).numpy()

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("\n🚀 Improved Neural Network Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score

# Read the generated pairs exactly as they were written (no cleaning step)
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# 'Label' is the target; everything else (raw 'Source' & 'Destination') is a feature
y = df['Label']
X = df.drop(columns=['Label'])

# Map every label string to an integer class id
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Per-class loss weights, balanced over all encoded labels
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(y_encoded),
    y=y_encoded,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

# Convert Data to Tensor (Use categorical encoding)
import zlib


def _stable_bucket(text):
    """Map a string to a fixed index in [0, 50000).

    50000 has to match the number of rows of the embedding table the indices
    are fed into. CRC-32 replaces the built-in hash(), which is salted per
    interpreter process and would give the same text different indices in
    different sessions.
    """
    return zlib.crc32(text.encode("utf-8")) % 50000


# Series.map is applied column by column so this works across pandas versions
# (DataFrame.applymap is deprecated in recent releases).
X_train_tensor = torch.tensor(
    X_train.astype(str).apply(lambda column: column.map(_stable_bucket)).to_numpy(),
    dtype=torch.long,
)
X_test_tensor = torch.tensor(
    X_test.astype(str).apply(lambda column: column.map(_stable_bucket)).to_numpy(),
    dtype=torch.long,
)
# Class ids must be int64 ("long") for CrossEntropyLoss targets
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define a More Advanced Neural Network with Learned Embeddings
class EmbeddingMLP(nn.Module):
    """Classifier that embeds every index column separately and concatenates them.

    Each integer index is looked up in a ``num_categories`` x 128 embedding
    table. The per-column embeddings of a row are flattened into one vector of
    ``128 * num_features`` values, which runs through
    (128 * num_features) -> 256 -> 128 -> 64 -> output_dim linear layers.
    Batch normalization, ReLU and dropout (p=0.3) follow the first two linear
    layers; ReLU follows the third.

    Args:
        num_categories: Number of rows in the embedding table; every input
            index must be in [0, num_categories).
        output_dim: Number of classes.
        num_features: Number of index columns per input row. Defaults to 2
            (one for 'Source', one for 'Destination').
    """

    def __init__(self, num_categories, output_dim, num_features=2):
        super().__init__()
        self.embedding = nn.Embedding(num_categories, 128)  # Learn feature relationships
        # The flattened embeddings of all columns feed the first linear layer
        self.fc1 = nn.Linear(128 * num_features, 256)
        self.bn1 = nn.BatchNorm1d(256)  # Batch normalization
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return class scores for a (batch, num_features) tensor of integer indices."""
        x = self.embedding(x)
        # Without flattening, the batch-norm layers receive a 3-D tensor whose
        # second dimension is num_features instead of 256 and raise.
        x = x.flatten(start_dim=1)
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.relu(self.fc3(x))
        return self.fc4(x)

# Initialize Model
num_categories = 50000  # Embedding size
output_dim = len(label_encoder.classes_)
model = EmbeddingMLP(num_categories, output_dim)

# Define Loss (Using Class Weights) and Optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005)  # AdamW for better weight decay

# Train Model (full batch: every epoch is one gradient step over the training set)
epochs = 300  # Train for longer
model.train()  # Dropout active, batch norm uses batch statistics
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:  # Print every 10 epochs
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict
# Switch to evaluation mode first: otherwise dropout keeps zeroing random
# activations and batch norm normalizes with the statistics of the test batch,
# so predictions are noisy and depend on which rows are scored together.
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = torch.argmax(y_pred_tensor, axis=1).numpy()

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("\n🚀 Final Improved Neural Network Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score

# Read the raw discrepancy dataset; no cleaning or feature engineering is applied.
df = pd.read_csv("Number_Based_Discrepancies_Fixed.csv")

# Everything except 'Label' is a feature (per the original note, the raw
# 'Source' & 'Destination' columns); 'Label' is the prediction target.
X = df.drop(columns=['Label'])
y = df['Label']

# Map the labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# "balanced" per-class loss weights, computed over the full label vector.
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(y_encoded),
    y=y_encoded,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

# Convert Data to Tensor (Use categorical encoding)
num_buckets = 50000  # Number of hash buckets (= rows of the embedding table)


def safe_hash(value):
    """Map *value* to a stable bucket index in ``[0, num_buckets)``.

    The built-in ``hash()`` salts ``str`` hashes per interpreter process, so
    the same cell text would land in a different bucket after every kernel
    restart. CRC-32 of the UTF-8 text is deterministic, which keeps the
    feature encoding reproducible from run to run.

    Args:
        value: Cell content; anything that is not already a string is
            converted with ``str()`` first.

    Returns:
        An ``int`` bucket index suitable for ``nn.Embedding(num_buckets, ...)``.
    """
    import zlib  # Local import: this cell's import list does not include zlib

    return zlib.crc32(str(value).encode("utf-8")) % num_buckets

def _as_long_tensor(values):
    """Wrap *values* in an int64 tensor (used for embedding indices and class targets)."""
    return torch.tensor(values, dtype=torch.long)


# Features: stringify every cell, hash it into a bucket index, keep the 2-D layout.
# NOTE(review): DataFrame.map is only available in recent pandas releases
# (older ones call it applymap) - confirm the target environment.
X_train_tensor = _as_long_tensor(X_train.astype(str).map(safe_hash).values)
X_test_tensor = _as_long_tensor(X_test.astype(str).map(safe_hash).values)
# Targets: the encoded labels of each split.
y_train_tensor = _as_long_tensor(y_train)
y_test_tensor = _as_long_tensor(y_test)

# Define an Improved Deep Neural Network with Learned Embeddings
class EmbeddingMLP(nn.Module):
    """MLP classifier on top of a learned embedding of hashed feature indices.

    Every index of a sample is looked up in one shared 128-wide embedding
    table, the per-column embeddings are concatenated, and the result is fed
    through three hidden layers (256 -> 128 -> 64) to produce class logits.

    Args:
        num_categories: Number of rows in the embedding table; input indices
            must lie in ``[0, num_categories)``.
        output_dim: Number of logits (classes) produced per sample.
        num_features: Number of index columns per sample. When omitted, the
            column count of the notebook-level ``X_train`` frame is used, which
            is what the two-argument constructor has always done.
    """

    def __init__(self, num_categories, output_dim, num_features=None):
        super().__init__()
        if num_features is None:
            # Backward-compatible default: size the first layer from the
            # training frame defined earlier in this cell.
            num_features = X_train.shape[1]
        self.embedding = nn.Embedding(num_categories, 128)  # Learn feature relationships
        self.fc1 = nn.Linear(128 * num_features, 256)  # Adjust input size after flattening
        self.bn1 = nn.BatchNorm1d(256)  # Batch normalization
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: Integer index tensor of shape ``(batch, num_features)``, or
                ``(batch,)`` when the model was built for a single feature.
        """
        # (batch, num_features, 128) -> (batch, num_features * 128)
        x = self.embedding(x).view(x.size(0), -1)  # Flatten embedding output
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.relu(self.fc3(x))
        return self.fc4(x)

# Initialize Model
output_dim = len(label_encoder.classes_)  # One logit per encoded class
model = EmbeddingMLP(num_buckets, output_dim)

# Define Loss (Using Class Weights) and Optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005)  # AdamW for better weight decay

# Train Model
# Full-batch training: each epoch is a single optimizer step on the whole
# training tensor.
epochs = 300  # Train for longer
model.train()  # Dropout active, BatchNorm uses (and tracks) batch statistics
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:  # Print every 10 epochs
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict
# Switch to inference mode first: otherwise Dropout keeps zeroing activations
# and BatchNorm normalizes with the test batch's own statistics, which makes
# the predictions noisy and dependent on the composition of the test set.
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = torch.argmax(y_pred_tensor, dim=1).numpy()

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("\n🚀 Final Improved Neural Network Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score

# Read the raw discrepancy dataset; no cleaning or feature engineering is applied.
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# Everything except 'Label' is a feature (per the original note, the raw
# 'Source' & 'Destination' columns); 'Label' is the prediction target.
X = df.drop(columns=['Label'])
y = df['Label']

# Map the labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# "balanced" per-class loss weights, computed over the full label vector.
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(y_encoded),
    y=y_encoded,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

# Convert Data to Tensor (Use categorical encoding)
num_buckets = 50000  # Number of hash buckets (= rows of the embedding table)


def safe_hash(value):
    """Map *value* to a stable bucket index in ``[0, num_buckets)``.

    The built-in ``hash()`` salts ``str`` hashes per interpreter process, so
    the same cell text would land in a different bucket after every kernel
    restart. CRC-32 of the UTF-8 text is deterministic, which keeps the
    feature encoding reproducible from run to run.

    Args:
        value: Cell content; anything that is not already a string is
            converted with ``str()`` first.

    Returns:
        An ``int`` bucket index suitable for ``nn.Embedding(num_buckets, ...)``.
    """
    import zlib  # Local import: this cell's import list does not include zlib

    return zlib.crc32(str(value).encode("utf-8")) % num_buckets

def _as_long_tensor(values):
    """Wrap *values* in an int64 tensor (used for embedding indices and class targets)."""
    return torch.tensor(values, dtype=torch.long)


# Features: stringify every cell, hash it into a bucket index, keep the 2-D layout.
# NOTE(review): DataFrame.map is only available in recent pandas releases
# (older ones call it applymap) - confirm the target environment.
X_train_tensor = _as_long_tensor(X_train.astype(str).map(safe_hash).values)
X_test_tensor = _as_long_tensor(X_test.astype(str).map(safe_hash).values)
# Targets: the encoded labels of each split.
y_train_tensor = _as_long_tensor(y_train)
y_test_tensor = _as_long_tensor(y_test)

# Define an Improved Deep Neural Network with Learned Embeddings
class EmbeddingMLP(nn.Module):
    """MLP classifier on top of a learned embedding of hashed feature indices.

    Every index of a sample is looked up in one shared 128-wide embedding
    table, the per-column embeddings are concatenated, and the result is fed
    through three hidden layers (256 -> 128 -> 64) to produce class logits.

    Args:
        num_categories: Number of rows in the embedding table; input indices
            must lie in ``[0, num_categories)``.
        output_dim: Number of logits (classes) produced per sample.
        num_features: Number of index columns per sample. When omitted, the
            column count of the notebook-level ``X_train`` frame is used, which
            is what the two-argument constructor has always done.
    """

    def __init__(self, num_categories, output_dim, num_features=None):
        super().__init__()
        if num_features is None:
            # Backward-compatible default: size the first layer from the
            # training frame defined earlier in this cell.
            num_features = X_train.shape[1]
        self.embedding = nn.Embedding(num_categories, 128)  # Learn feature relationships
        self.fc1 = nn.Linear(128 * num_features, 256)  # Adjust input size after flattening
        self.bn1 = nn.BatchNorm1d(256)  # Batch normalization
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: Integer index tensor of shape ``(batch, num_features)``, or
                ``(batch,)`` when the model was built for a single feature.
        """
        # (batch, num_features, 128) -> (batch, num_features * 128)
        x = self.embedding(x).view(x.size(0), -1)  # Flatten embedding output
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.relu(self.fc3(x))
        return self.fc4(x)

# Initialize Model
output_dim = len(label_encoder.classes_)  # One logit per encoded class
model = EmbeddingMLP(num_buckets, output_dim)

# Define Loss (Using Class Weights) and Optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005)  # AdamW for better weight decay

# Train Model
# Full-batch training: each epoch is a single optimizer step on the whole
# training tensor.
epochs = 300  # Train for longer
model.train()  # Dropout active, BatchNorm uses (and tracks) batch statistics
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:  # Print every 10 epochs
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict
# Switch to inference mode first: otherwise Dropout keeps zeroing activations
# and BatchNorm normalizes with the test batch's own statistics, which makes
# the predictions noisy and dependent on the composition of the test set.
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = torch.argmax(y_pred_tensor, dim=1).numpy()

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("\n🚀 Final Improved Neural Network Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score

# Read the raw discrepancy dataset; no cleaning or feature engineering is applied.
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# Everything except 'Label' is a feature (per the original note, the raw
# 'Source' & 'Destination' columns); 'Label' is the prediction target.
X = df.drop(columns=['Label'])
y = df['Label']

# Map the labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# "balanced" per-class loss weights, computed over the full label vector.
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(y_encoded),
    y=y_encoded,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

# Convert Data to Tensor (Use categorical encoding)
num_buckets = 50000  # Number of hash buckets (= rows of the embedding table)


def safe_hash(value):
    """Map *value* to a stable bucket index in ``[0, num_buckets)``.

    The built-in ``hash()`` salts ``str`` hashes per interpreter process, so
    the same cell text would land in a different bucket after every kernel
    restart. CRC-32 of the UTF-8 text is deterministic, which keeps the
    feature encoding reproducible from run to run.

    Args:
        value: Cell content; anything that is not already a string is
            converted with ``str()`` first.

    Returns:
        An ``int`` bucket index suitable for ``nn.Embedding(num_buckets, ...)``.
    """
    import zlib  # Local import: this cell's import list does not include zlib

    return zlib.crc32(str(value).encode("utf-8")) % num_buckets

def _as_long_tensor(values):
    """Wrap *values* in an int64 tensor (used for embedding indices and class targets)."""
    return torch.tensor(values, dtype=torch.long)


# Features: stringify every cell, hash it into a bucket index, keep the 2-D layout.
# NOTE(review): DataFrame.map is only available in recent pandas releases
# (older ones call it applymap) - confirm the target environment.
X_train_tensor = _as_long_tensor(X_train.astype(str).map(safe_hash).values)
X_test_tensor = _as_long_tensor(X_test.astype(str).map(safe_hash).values)
# Targets: the encoded labels of each split.
y_train_tensor = _as_long_tensor(y_train)
y_test_tensor = _as_long_tensor(y_test)

# Define a Transformer-Based Model
class TransformerModel(nn.Module):
    """Transformer-encoder classifier over hashed feature indices.

    Each index column of a sample is embedded as one 256-wide token. The
    tokens of a sample go through a 4-layer, 8-head Transformer encoder, are
    mean-pooled into one vector, and a small MLP head (128 -> 64 ->
    output_dim) turns that vector into class logits.

    Args:
        num_categories: Number of rows in the embedding table; input indices
            must lie in ``[0, num_categories)``.
        output_dim: Number of logits (classes) produced per sample.
    """

    def __init__(self, num_categories, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_categories, 256)  # Larger Embeddings
        # batch_first=True makes the encoder read its input as
        # (batch, seq, embed). With the default layout the batch axis would be
        # taken for the sequence axis and samples would attend to each other.
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)  # Multi-Head Attention
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=4)
        self.fc1 = nn.Linear(256, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)
        self.leakyrelu = nn.LeakyReLU()
        # NOTE(review): this dropout module is created but not applied in forward().
        self.dropout = nn.Dropout(0.2)  # Lower dropout for better generalization

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: Integer index tensor of shape ``(batch, num_features)``; a 1-D
                ``(batch,)`` tensor is treated as a single-feature input.
        """
        if x.dim() == 1:
            x = x.unsqueeze(1)  # (batch,) -> (batch, 1): one token per sample
        # Keep one 256-wide token per feature column. Concatenating the
        # columns into a single token (as before) produced a last dimension of
        # 256 * num_features, which does not match d_model=256.
        x = self.embedding(x)  # (batch, num_features, 256)
        x = self.transformer(x)  # Attention runs across the tokens of each sample
        x = x.mean(dim=1)  # Pool tokens -> (batch, 256)
        x = self.leakyrelu(self.bn1(self.fc1(x)))
        x = self.leakyrelu(self.fc2(x))
        return self.fc3(x)

# Initialize Transformer Model
output_dim = len(label_encoder.classes_)  # One logit per encoded class
model = TransformerModel(num_buckets, output_dim)

# Define Optimizer & Loss
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)  # Use SGD + Momentum for better generalization

# Train Model
# Full-batch training: each epoch is a single optimizer step on the whole
# training tensor.
epochs = 500  # Train longer for Transformers
model.train()  # Dropout active, BatchNorm uses (and tracks) batch statistics
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:  # Print every 10 epochs
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict
# Switch to inference mode first: otherwise the encoder's dropout stays active
# and BatchNorm normalizes with the test batch's own statistics, which makes
# the predictions noisy and dependent on the composition of the test set.
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = torch.argmax(y_pred_tensor, dim=1).numpy()

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("\n🚀 Final Transformer Model Accuracy:", accuracy)


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score

# Read the raw discrepancy dataset; no cleaning or feature engineering is applied.
df = pd.read_csv("/content/Number_Based_Discrepancies_Fixed.csv")

# Everything except 'Label' is a feature (per the original note, the raw
# 'Source' & 'Destination' columns); 'Label' is the prediction target.
X = df.drop(columns=['Label'])
y = df['Label']

# Map the labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# "balanced" per-class loss weights, computed over the full label vector.
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(y_encoded),
    y=y_encoded,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

# Convert Data to Tensor (Use categorical encoding)
num_buckets = 50000  # Number of hash buckets (= rows of the embedding table)


def safe_hash(value):
    """Map *value* to a stable bucket index in ``[0, num_buckets)``.

    The built-in ``hash()`` salts ``str`` hashes per interpreter process, so
    the same cell text would land in a different bucket after every kernel
    restart. CRC-32 of the UTF-8 text is deterministic, which keeps the
    feature encoding reproducible from run to run.

    Args:
        value: Cell content; anything that is not already a string is
            converted with ``str()`` first.

    Returns:
        An ``int`` bucket index suitable for ``nn.Embedding(num_buckets, ...)``.
    """
    import zlib  # Local import: this cell's import list does not include zlib

    return zlib.crc32(str(value).encode("utf-8")) % num_buckets

def _as_long_tensor(values):
    """Wrap *values* in an int64 tensor (used for embedding indices and class targets)."""
    return torch.tensor(values, dtype=torch.long)


# Features: stringify every cell, hash it into a bucket index, keep the 2-D layout.
# NOTE(review): DataFrame.map is only available in recent pandas releases
# (older ones call it applymap) - confirm the target environment.
X_train_tensor = _as_long_tensor(X_train.astype(str).map(safe_hash).values)
X_test_tensor = _as_long_tensor(X_test.astype(str).map(safe_hash).values)
# Targets: the encoded labels of each split.
y_train_tensor = _as_long_tensor(y_train)
y_test_tensor = _as_long_tensor(y_test)

# Define a Transformer-Based Model (FIXED)
class TransformerModel(nn.Module):
    """Transformer-encoder classifier over hashed feature indices.

    Input indices are embedded as 256-wide tokens, encoded by a 4-layer,
    8-head Transformer encoder (batch-first layout), mean-pooled over the
    sequence axis and classified by an MLP head (128 -> 64 -> output_dim).

    Args:
        num_categories: Number of rows in the embedding table.
        output_dim: Number of logits produced per sample.
    """

    def __init__(self, num_categories, output_dim):
        super().__init__()
        embed_dim = 256
        head_widths = (128, 64)

        # Token embedding for the categorical (hashed) inputs.
        self.embedding = nn.Embedding(num_categories, embed_dim)
        # Template layer; batch_first=True selects the (batch, seq, embed) layout.
        # NOTE(review): keeping the template as an attribute also registers it
        # as a submodule next to the encoder built from it - confirm intended.
        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=8,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=4)
        # Classification head.
        self.fc1 = nn.Linear(embed_dim, head_widths[0])
        self.bn1 = nn.BatchNorm1d(head_widths[0])
        self.fc2 = nn.Linear(head_widths[0], head_widths[1])
        self.fc3 = nn.Linear(head_widths[1], output_dim)
        self.leakyrelu = nn.LeakyReLU()
        # NOTE(review): created but not applied anywhere in forward().
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return class logits for the integer index tensor *x*."""
        tokens = self.embedding(x)  # (batch_size, seq_length, embed_dim)
        encoded = self.transformer(tokens)  # Same layout, thanks to batch_first=True
        pooled = encoded.mean(dim=1)  # Average over the sequence axis
        hidden = self.leakyrelu(self.bn1(self.fc1(pooled)))
        hidden = self.leakyrelu(self.fc2(hidden))
        return self.fc3(hidden)

# Initialize Transformer Model
output_dim = len(label_encoder.classes_)  # One logit per encoded class
model = TransformerModel(num_buckets, output_dim)

# Define Optimizer & Loss
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)  # Use SGD + Momentum for better generalization

# Train Model
# Full-batch training: each epoch is a single optimizer step on the whole
# training tensor.
epochs = 500  # Train longer for Transformers
model.train()  # Dropout active, BatchNorm uses (and tracks) batch statistics
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:  # Print every 10 epochs
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Predict
# Switch to inference mode first: otherwise the encoder's dropout stays active
# and BatchNorm normalizes with the test batch's own statistics, which makes
# the predictions noisy and dependent on the composition of the test set.
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = torch.argmax(y_pred_tensor, dim=1).numpy()

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("\n Final Transformer Model Accuracy:", accuracy)
