In [4]:
from rapidfuzz import fuzz

# Define string pairs
string_pairs = [
    ("abc", "abc"),
    ("abc", "ABC"),
    ("ABC", "ABC"),
    ("abC", "Abc")
]

# Compute different fuzzy matching scores
for str1, str2 in string_pairs:
    print(f"Comparing: '{str1}' vs. '{str2}'")
    print(f"  Fuzz Ratio: {fuzz.ratio(str1, str2)}")
    print(f"  Fuzz Partial Ratio: {fuzz.partial_ratio(str1, str2)}")
    print(f"  Fuzz Token Sort Ratio: {fuzz.token_sort_ratio(str1, str2)}")
    print(f"  Fuzz Token Set Ratio: {fuzz.token_set_ratio(str1, str2)}")
    print("-" * 50)


Comparing: 'abc' vs. 'abc'
  Fuzz Ratio: 100.0
  Fuzz Partial Ratio: 100.0
  Fuzz Token Sort Ratio: 100.0
  Fuzz Token Set Ratio: 100.0
--------------------------------------------------
Comparing: 'abc' vs. 'ABC'
  Fuzz Ratio: 0.0
  Fuzz Partial Ratio: 0.0
  Fuzz Token Sort Ratio: 0.0
  Fuzz Token Set Ratio: 0.0
--------------------------------------------------
Comparing: 'ABC' vs. 'ABC'
  Fuzz Ratio: 100.0
  Fuzz Partial Ratio: 100.0
  Fuzz Token Sort Ratio: 100.0
  Fuzz Token Set Ratio: 100.0
--------------------------------------------------
Comparing: 'abC' vs. 'Abc'
  Fuzz Ratio: 33.333333333333336
  Fuzz Partial Ratio: 40.0
  Fuzz Token Sort Ratio: 33.333333333333336
  Fuzz Token Set Ratio: 33.33333333333333
--------------------------------------------------


In [5]:
from rapidfuzz import fuzz

def detect_case_sensitivity(str1, str2, threshold=10):
    """
    Determines if a record is case-sensitive based on fuzzy matching scores.

    :param str1: First input string
    :param str2: Second input string
    :param threshold: Difference threshold to classify case sensitivity
    :return: Dictionary with case sensitivity classification
    """
    # Compute fuzzy scores with case sensitivity
    case_sensitive_score = fuzz.ratio(str1, str2)

    # Compute fuzzy scores without case sensitivity (convert to lowercase)
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())

    # Compute the difference
    score_difference = abs(case_sensitive_score - case_insensitive_score)

    # If difference is above a threshold, it's case-sensitive
    is_case_sensitive = score_difference > threshold

    return {
        "String 1": str1,
        "String 2": str2,
        "Case-Sensitive Score": case_sensitive_score,
        "Case-Insensitive Score": case_insensitive_score,
        "Score Difference": score_difference,
        "Case Sensitivity Detected": is_case_sensitive
    }

# Define test cases
string_pairs = [
    ("abc", "abc"),
    ("abc", "ABC"),
    ("ABC", "ABC"),
    ("abC", "Abc"),
    ("DataScience", "Datascience"),
    ("John Doe", "john doe")
]

# Run the detection
for str1, str2 in string_pairs:
    result = detect_case_sensitivity(str1, str2)
    print(f"Comparing: '{str1}' vs. '{str2}'")
    print(f"  Case-Sensitive Score: {result['Case-Sensitive Score']}")
    print(f"  Case-Insensitive Score: {result['Case-Insensitive Score']}")
    print(f"  Score Difference: {result['Score Difference']}")
    print(f"  Case Sensitivity Detected: {result['Case Sensitivity Detected']}")
    print("-" * 50)


Comparing: 'abc' vs. 'abc'
  Case-Sensitive Score: 100.0
  Case-Insensitive Score: 100.0
  Score Difference: 0.0
  Case Sensitivity Detected: False
--------------------------------------------------
Comparing: 'abc' vs. 'ABC'
  Case-Sensitive Score: 0.0
  Case-Insensitive Score: 100.0
  Score Difference: 100.0
  Case Sensitivity Detected: True
--------------------------------------------------
Comparing: 'ABC' vs. 'ABC'
  Case-Sensitive Score: 100.0
  Case-Insensitive Score: 100.0
  Score Difference: 0.0
  Case Sensitivity Detected: False
--------------------------------------------------
Comparing: 'abC' vs. 'Abc'
  Case-Sensitive Score: 33.333333333333336
  Case-Insensitive Score: 100.0
  Score Difference: 66.66666666666666
  Case Sensitivity Detected: True
--------------------------------------------------
Comparing: 'DataScience' vs. 'Datascience'
  Case-Sensitive Score: 90.9090909090909
  Case-Insensitive Score: 100.0
  Score Difference: 9.090909090909093
  Case Sensitivity Detect

In [6]:
import re
from rapidfuzz import fuzz

def preprocess_text(text, mode):
    """
    Apply different types of text normalization based on mode.
    """
    if mode == "lowercase":
        return text.lower()
    elif mode == "remove_punctuation":
        return re.sub(r'[^\w\s]', '', text)
    elif mode == "normalize_spaces":
        return re.sub(r'\s+', ' ', text).strip()
    elif mode == "expand_abbreviations":
        abbreviations = {"corp.": "corporation", "ltd.": "limited", "inc.": "incorporated"}
        words = text.split()
        expanded = [abbreviations.get(word.lower(), word) for word in words]
        return " ".join(expanded)
    return text

def classify_difference(str1, str2):
    """
    Classify the type of difference between two strings.
    """
    categories = {
        "Case Sensitivity": preprocess_text(str1, "lowercase"),
        "Special Character Differences": preprocess_text(str1, "remove_punctuation"),
        "Extra Space Issues": preprocess_text(str1, "normalize_spaces"),
        "Abbreviation vs. Full Form": preprocess_text(str1, "expand_abbreviations"),
    }

    differences = {}
    for category, transformed_str in categories.items():
        score = fuzz.ratio(transformed_str, str2)
        differences[category] = score

    # Identify the category with the highest impact (lowest score means most difference)
    classified_category = min(differences, key=differences.get)

    return {"String 1": str1, "String 2": str2, "Classification": classified_category, "Scores": differences}

# Test cases
string_pairs = [
    ("Corp.", "Corporation"),  # Abbreviation
    ("abc", "ABC"),  # Case Sensitivity
    ("AT&T", "ATT"),  # Special Character Differences
    ("XYZ Ltd.", "XYZ    Ltd.")  # Extra Spaces
]

# Run classification on test cases
for str1, str2 in string_pairs:
    result = classify_difference(str1, str2)
    print(f"Comparing: '{str1}' vs. '{str2}'")
    print(f"  Classified Difference: {result['Classification']}")
    print(f"  Score Breakdown: {result['Scores']}")
    print("-" * 50)


Comparing: 'Corp.' vs. 'Corporation'
  Classified Difference: Case Sensitivity
  Score Breakdown: {'Case Sensitivity': 37.5, 'Special Character Differences': 53.333333333333336, 'Extra Space Issues': 50.0, 'Abbreviation vs. Full Form': 90.9090909090909}
--------------------------------------------------
Comparing: 'abc' vs. 'ABC'
  Classified Difference: Case Sensitivity
  Score Breakdown: {'Case Sensitivity': 0.0, 'Special Character Differences': 0.0, 'Extra Space Issues': 0.0, 'Abbreviation vs. Full Form': 0.0}
--------------------------------------------------
Comparing: 'AT&T' vs. 'ATT'
  Classified Difference: Case Sensitivity
  Score Breakdown: {'Case Sensitivity': 0.0, 'Special Character Differences': 100.0, 'Extra Space Issues': 85.71428571428572, 'Abbreviation vs. Full Form': 85.71428571428572}
--------------------------------------------------
Comparing: 'XYZ Ltd.' vs. 'XYZ    Ltd.'
  Classified Difference: Case Sensitivity
  Score Breakdown: {'Case Sensitivity': 42.105263157

In [7]:
import re
from rapidfuzz import fuzz

def classify_difference(str1, str2, threshold=10):
    """
    Classifies the type of difference between two strings.

    :param str1: First string
    :param str2: Second string
    :param threshold: Minimum score difference to classify a change
    :return: Classification and score breakdown
    """
    # Compute original fuzzy score
    original_score = fuzz.ratio(str1, str2)

    # Compute case-insensitive fuzzy score
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())
    case_sensitivity_diff = original_score - case_insensitive_score

    # Compute score after removing special characters
    str1_clean = re.sub(r'[^\w\s]', '', str1)
    str2_clean = re.sub(r'[^\w\s]', '', str2)
    special_char_score = fuzz.ratio(str1_clean, str2_clean)
    special_char_diff = original_score - special_char_score

    # Compute score after normalizing spaces
    str1_space_norm = ' '.join(str1.split())
    str2_space_norm = ' '.join(str2.split())
    space_score = fuzz.ratio(str1_space_norm, str2_space_norm)
    space_diff = original_score - space_score

    # Compute token set ratio for abbreviation detection
    token_score = fuzz.token_set_ratio(str1, str2)
    abbreviation_diff = original_score - token_score

    # Determine the major difference
    max_diff = max(abs(case_sensitivity_diff), abs(special_char_diff), abs(space_diff), abs(abbreviation_diff))

    if abs(case_sensitivity_diff) > threshold:
        classified_difference = "Case Sensitivity"
    elif abs(special_char_diff) > threshold:
        classified_difference = "Special Character Differences"
    elif abs(space_diff) > threshold:
        classified_difference = "Extra Space Issues"
    elif abs(abbreviation_diff) > threshold:
        classified_difference = "Abbreviation vs. Full Form"
    else:
        classified_difference = "No Significant Difference"

    # Score breakdown
    score_breakdown = {
        "Case Sensitivity": abs(case_sensitivity_diff),
        "Special Character Differences": abs(special_char_diff),
        "Extra Space Issues": abs(space_diff),
        "Abbreviation vs. Full Form": abs(abbreviation_diff)
    }

    return classified_difference, score_breakdown

# Test cases
string_pairs = [
    ("abc", "ABC"),
    ("AT&T", "ATT"),
    ("XYZ Ltd.", "XYZ    Ltd."),
    ("Corp.", "Corporation"),
    ("Data-Science", "Data Science"),
    ("Hello  World", "Hello World")
]

# Run classification
for str1, str2 in string_pairs:
    classified_diff, score_breakdown = classify_difference(str1, str2)
    print(f"Comparing: '{str1}' vs. '{str2}'")
    print(f"  Classified Difference: {classified_diff}")
    print(f"  Score Breakdown: {score_breakdown}")
    print("-" * 50)


Comparing: 'abc' vs. 'ABC'
  Classified Difference: Case Sensitivity
  Score Breakdown: {'Case Sensitivity': 100.0, 'Special Character Differences': 0.0, 'Extra Space Issues': 0.0, 'Abbreviation vs. Full Form': 0.0}
--------------------------------------------------
Comparing: 'AT&T' vs. 'ATT'
  Classified Difference: Special Character Differences
  Score Breakdown: {'Case Sensitivity': 0.0, 'Special Character Differences': 14.285714285714278, 'Extra Space Issues': 0.0, 'Abbreviation vs. Full Form': 1.4210854715202004e-14}
--------------------------------------------------
Comparing: 'XYZ Ltd.' vs. 'XYZ    Ltd.'
  Classified Difference: Extra Space Issues
  Score Breakdown: {'Case Sensitivity': 0.0, 'Special Character Differences': 1.8575851393188856, 'Extra Space Issues': 15.789473684210535, 'Abbreviation vs. Full Form': 15.789473684210535}
--------------------------------------------------
Comparing: 'Corp.' vs. 'Corporation'
  Classified Difference: No Significant Difference
  Score

In [8]:
import re
from rapidfuzz import fuzz
import pandas as pd

def classify_difference(str1, str2, threshold=10):
    """
    Classifies the type of difference between two strings and prints scores.

    :param str1: First string
    :param str2: Second string
    :param threshold: Minimum score difference to classify a change
    :return: Classification and score breakdown
    """
    # Compute original (case-sensitive) fuzzy score
    case_sensitive_score = fuzz.ratio(str1, str2)

    # Compute case-insensitive fuzzy score
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())
    case_sensitivity_diff = abs(case_sensitive_score - case_insensitive_score)

    # Compute score after removing special characters
    str1_clean = re.sub(r'[^\w\s]', '', str1)
    str2_clean = re.sub(r'[^\w\s]', '', str2)
    special_char_score = fuzz.ratio(str1_clean, str2_clean)
    special_char_diff = abs(case_sensitive_score - special_char_score)

    # Compute score after normalizing spaces
    str1_space_norm = ' '.join(str1.split())
    str2_space_norm = ' '.join(str2.split())
    space_score = fuzz.ratio(str1_space_norm, str2_space_norm)
    space_diff = abs(case_sensitive_score - space_score)

    # Compute token set ratio for abbreviation detection
    token_score = fuzz.token_set_ratio(str1, str2)
    abbreviation_diff = abs(case_sensitive_score - token_score)

    # Determine the major difference
    max_diff = max(case_sensitivity_diff, special_char_diff, space_diff, abbreviation_diff)

    if case_sensitivity_diff > threshold:
        classified_difference = "Case Sensitivity"
    elif special_char_diff > threshold:
        classified_difference = "Special Character Differences"
    elif space_diff > threshold:
        classified_difference = "Extra Space Issues"
    elif abbreviation_diff > threshold:
        classified_difference = "Abbreviation vs. Full Form"
    else:
        classified_difference = "No Significant Difference"

    # Score breakdown
    score_breakdown = {
        "Case Sensitivity": case_sensitivity_diff,
        "Special Character Differences": special_char_diff,
        "Extra Space Issues": space_diff,
        "Abbreviation vs. Full Form": abbreviation_diff
    }

    return classified_difference, case_sensitive_score, case_insensitive_score, score_breakdown

# Test cases
string_pairs = [
    ("abc", "ABC"),
    ("AT&T", "ATT"),
    ("XYZ Ltd.", "XYZ    Ltd."),
    ("Corp.", "Corporation"),
    ("Data-Science", "Data Science"),
    ("Hello  World", "Hello World")
]

# Collect results
results = []
for str1, str2 in string_pairs:
    classified_diff, case_sensitive_score, case_insensitive_score, score_breakdown = classify_difference(str1, str2)
    results.append({
        "String 1": str1,
        "String 2": str2,
        "Case-Sensitive Score": case_sensitive_score,
        "Case-Insensitive Score": case_insensitive_score,
        "Classified Difference": classified_diff,
        **score_breakdown
    })

# Convert results to DataFrame and display
df_results = pd.DataFrame(results)
print(df_results)


       String 1      String 2  Case-Sensitive Score  Case-Insensitive Score  \
0           abc           ABC              0.000000              100.000000   
1          AT&T           ATT             85.714286               85.714286   
2      XYZ Ltd.   XYZ    Ltd.             84.210526               84.210526   
3         Corp.   Corporation             50.000000               50.000000   
4  Data-Science  Data Science             91.666667               91.666667   
5  Hello  World   Hello World             95.652174               95.652174   

           Classified Difference  Case Sensitivity  \
0               Case Sensitivity             100.0   
1  Special Character Differences               0.0   
2             Extra Space Issues               0.0   
3      No Significant Difference               0.0   
4      No Significant Difference               0.0   
5      No Significant Difference               0.0   

   Special Character Differences  Extra Space Issues  \
0          

In [9]:
import re
from rapidfuzz import fuzz
import pandas as pd

def classify_difference(str1, str2, threshold=10):
    """
    Classifies the type of difference between two strings and prints scores.

    :param str1: First string
    :param str2: Second string
    :param threshold: Minimum score difference to classify a change
    :return: Classification and score breakdown
    """
    # Compute original (case-sensitive) fuzzy score
    case_sensitive_score = fuzz.ratio(str1, str2)

    # Compute case-insensitive fuzzy score
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())
    case_sensitivity_diff = abs(case_sensitive_score - case_insensitive_score)

    # Compute score after removing special characters
    str1_clean = re.sub(r'[^\w\s]', '', str1)
    str2_clean = re.sub(r'[^\w\s]', '', str2)
    special_char_score = fuzz.ratio(str1_clean, str2_clean)
    special_char_diff = abs(case_sensitive_score - special_char_score)

    # Compute score after normalizing spaces
    str1_space_norm = ' '.join(str1.split())
    str2_space_norm = ' '.join(str2.split())
    space_score = fuzz.ratio(str1_space_norm, str2_space_norm)
    space_diff = abs(case_sensitive_score - space_score)

    # Compute token set ratio for abbreviation detection
    token_score = fuzz.token_set_ratio(str1, str2)
    abbreviation_diff = abs(case_sensitive_score - token_score)

    # Determine the major difference
    max_diff = max(case_sensitivity_diff, special_char_diff, space_diff, abbreviation_diff)

    if case_sensitivity_diff > threshold:
        classified_difference = "Case Sensitivity"
    elif special_char_diff > threshold:
        classified_difference = "Special Character Differences"
    elif space_diff > threshold:
        classified_difference = "Extra Space Issues"
    elif abbreviation_diff > threshold:
        classified_difference = "Abbreviation vs. Full Form"
    else:
        classified_difference = "No Significant Difference"

    # Score breakdown
    score_breakdown = {
        "Case Sensitivity": case_sensitivity_diff,
        "Special Character Differences": special_char_diff,
        "Extra Space Issues": space_diff,
        "Abbreviation vs. Full Form": abbreviation_diff
    }

    return classified_difference, case_sensitive_score, case_insensitive_score, score_breakdown

# Test cases
string_pairs = [
    ("abc", "ABC"),
    ("AT&T", "ATT"),
    ("XYZ Ltd.", "XYZ    Ltd."),
    ("Corp.", "Corporation"),
    ("Data-Science", "Data Science"),
    ("Hello  World", "Hello World")
]

# Collect results
results = []
for str1, str2 in string_pairs:
    classified_diff, case_sensitive_score, case_insensitive_score, score_breakdown = classify_difference(str1, str2)
    results.append({
        "String 1": str1,
        "String 2": str2,
        "Case-Sensitive Score": case_sensitive_score,
        "Case-Insensitive Score": case_insensitive_score,
        "Classified Difference": classified_diff,
        **score_breakdown
    })

# Convert results to DataFrame
df_results = pd.DataFrame(results)

# Save to Excel
file_path = "/content/sample_data/Fuzzy_Matching_Analysis.xlsx"
df_results.to_excel(file_path, index=False)

print(f"Excel file saved: {file_path}")


Excel file saved: /content/sample_data/Fuzzy_Matching_Analysis.xlsx


# New Section

In [10]:
pip install pyspark rapidfuzz openpyxl



In [35]:
import pandas as pd

# Load Excel file
df = pd.read_excel("/content/Dataset.xlsx")

# Save as CSV
df.to_csv("/content/Fuzzy_Matching_Dataset.csv", index=False)
print("Excel converted to CSV successfully!")


Excel converted to CSV successfully!


In [37]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, DoubleType
import re
from rapidfuzz import fuzz

# Initialize Spark session
spark = SparkSession.builder.appName("FuzzyMatching").getOrCreate()

# Load dataset
dataset_path = "/content/Fuzzy_Matching_Dataset.csv"
df_pyspark = spark.read.csv(dataset_path,header=True)



In [38]:
df_pyspark.show()

+--------------------+--------------------+----------------+
|              Source|         Destination|           Label|
+--------------------+--------------------+----------------+
|  Big Data Analytics|  BIG DATA ANALYTICS|Case Sensitivity|
|Business Intellig...|business intellig...|Case Sensitivity|
|  Big Data Analytics|  big data analytics|Case Sensitivity|
|  Geospatial Studies|  GEOSPATIAL STUDIES|Case Sensitivity|
|  Big Data Analytics|  BIG DATA ANALYTICS|Case Sensitivity|
|      Cyber Security|      cyber security|Case Sensitivity|
| University Research| UNIVERSITY RESEARCH|Case Sensitivity|
|    Machine Learning|    MACHINE LEARNING|Case Sensitivity|
|Artificial Intell...|artificial intell...|Case Sensitivity|
|        Data Science|        DATA SCIENCE|Case Sensitivity|
|  Geospatial Studies|  GEOSPATIAL STUDIES|Case Sensitivity|
|        Data Science|        DATA SCIENCE|Case Sensitivity|
|     Cloud Computing|     CLOUD COMPUTING|Case Sensitivity|
|Business Intellig...|BU

In [40]:

# Define fuzzy matching functions
def get_fuzzy_scores(str1, str2):
    """ Compute various fuzzy matching scores between two strings. """
    case_sensitive_score = fuzz.ratio(str1, str2)
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())
    case_sensitivity_diff = abs(case_sensitive_score - case_insensitive_score)

    str1_clean, str2_clean = re.sub(r'[^\w\s]', '', str1), re.sub(r'[^\w\s]', '', str2)
    special_char_score = fuzz.ratio(str1_clean, str2_clean)
    special_char_diff = abs(case_sensitive_score - special_char_score)

    str1_space_norm, str2_space_norm = ' '.join(str1.split()), ' '.join(str2.split())
    space_score = fuzz.ratio(str1_space_norm, str2_space_norm)
    space_diff = abs(case_sensitive_score - space_score)

    token_score = fuzz.token_set_ratio(str1, str2)
    abbreviation_diff = abs(case_sensitive_score - token_score)

    return (case_sensitive_score, case_insensitive_score, case_sensitivity_diff,
            special_char_diff, space_diff, abbreviation_diff)

# Register UDFs
fuzzy_udf = udf(lambda str1, str2: get_fuzzy_scores(str1, str2),
                "array<double>")

# Apply fuzzy matching functions
df_pyspark = df_pyspark.withColumn("Fuzzy Scores", fuzzy_udf(col("Source"), col("Destination")))

# Extract individual scores
df_pyspark = df_pyspark.withColumn("Case-Sensitive Score", col("Fuzzy Scores")[0]) \
                       .withColumn("Case-Insensitive Score", col("Fuzzy Scores")[1]) \
                       .withColumn("Case Sensitivity Diff", col("Fuzzy Scores")[2]) \
                       .withColumn("Special Character Diff", col("Fuzzy Scores")[3]) \
                       .withColumn("Extra Space Diff", col("Fuzzy Scores")[4]) \
                       .drop("Fuzzy Scores")
                      #  .withColumn("Abbreviation Diff", col("Fuzzy Scores")[5]) \


# Save results to an Excel file
output_path = "Scored_Fuzzy_Matching_Analysis.xlsx"
df_pyspark.toPandas().to_excel(output_path, index=False)

print(f"Scored dataset saved to: {output_path}")

Scored dataset saved to: Scored_Fuzzy_Matching_Analysis.xlsx


In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, StringType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [42]:

# Load Excel file
df = pd.read_excel("/content/Scored_Fuzzy_Matching_Analysis.xlsx")

# Save as CSV
df.to_csv("/content/train.csv", index=False)
print("Excel converted to CSV successfully!")

Excel converted to CSV successfully!


In [43]:
# Load dataset
dataset_path = "/content/train.csv"
df_pyspark = spark.read.csv(dataset_path,header=True)

In [56]:
#list the distinct values in Label
df_pyspark.select("Label").distinct().show(truncate=False)

+-----------------------------+
|Label                        |
+-----------------------------+
|Extra Space Issues           |
|Special Character Differences|
|Case Sensitivity             |
+-----------------------------+



In [70]:
df_pyspark.select("Label_Index").distinct().show(truncate=False)

+-----------+
|Label_Index|
+-----------+
|0.0        |
|1.0        |
|2.0        |
+-----------+



In [51]:
from pyspark.sql.functions import col

df_pyspark = df_pyspark.filter(col("Label") != "Abbreviation vs. Full Form")



In [47]:
distinct_values

DataFrame[Label: string]

In [53]:

# Convert categorical label (text) into numerical index
indexer = StringIndexer(inputCol="Label", outputCol="Label_Index")
df_pyspark = indexer.fit(df_pyspark).transform(df_pyspark)

In [60]:
# Select relevant features
feature_columns = ["Case Sensitivity Diff", "Special Character Diff", "Extra Space Diff"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [63]:
# List of columns to cast
columns_to_cast = ["Case Sensitivity Diff", "Special Character Diff", "Extra Space Diff"]

# Cast each column to Integer
for col_name in columns_to_cast:
    df_pyspark = df_pyspark.withColumn(col_name, col(col_name).cast("int"))

In [64]:
df_pyspark.printSchema()

root
 |-- Source: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- Label: string (nullable = true)
 |-- Case-Sensitive Score: string (nullable = true)
 |-- Case-Insensitive Score: string (nullable = true)
 |-- Case Sensitivity Diff: integer (nullable = true)
 |-- Special Character Diff: integer (nullable = true)
 |-- Extra Space Diff: integer (nullable = true)
 |-- Label_Index: double (nullable = false)



In [65]:
df_pyspark = assembler.transform(df_pyspark)

In [66]:
# Split dataset into training (80%) and testing (20%) sets
train_data, test_data = df_pyspark.randomSplit([0.8, 0.2], seed=42)

In [67]:
# Initialize and train the classifier (Random Forest)
rf_classifier = RandomForestClassifier(labelCol="Label_Index", featuresCol="features", numTrees=50)
model = rf_classifier.fit(train_data)

# Make predictions on the test set
predictions = model.transform(test_data)

# Evaluate the model using accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="Label_Index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"✅ Model Accuracy: {accuracy * 100:.2f}%")

# Save trained model
model.save("FuzzyMatching_RF_Model")
print("✅ Model saved successfully!")


✅ Model Accuracy: 100.00%
✅ Model saved successfully!


In [69]:
predictions.show()

+--------------------+--------------------+--------------------+--------------------+----------------------+---------------------+----------------------+----------------+-----------+--------------+--------------+-------------+----------+
|              Source|         Destination|               Label|Case-Sensitive Score|Case-Insensitive Score|Case Sensitivity Diff|Special Character Diff|Extra Space Diff|Label_Index|      features| rawPrediction|  probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------------------+---------------------+----------------------+----------------+-----------+--------------+--------------+-------------+----------+
|Artificial Intell...|A!r_ti$f-i@c.i!al...|Special Character...|   73.01587301587303|     73.01587301587303|                    0|                    24|               0|        2.0|[0.0,24.0,0.0]|[0.0,0.0,50.0]|[0.0,0.0,1.0]|       2.0|
|Artificial Intell...|A!rtif@icia-l# _I...|Speci

In [None]:
case_sensitive_score = fuzz.ratio(str1, str2)
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())
    case_sensitivity_diff = abs(case_sensitive_score - case_insensitive_score)

    str1_clean, str2_clean = re.sub(r'[^\w\s]', '', str1), re.sub(r'[^\w\s]', '', str2)
    special_char_score = fuzz.ratio(str1_clean, str2_clean)
    special_char_diff = abs(case_sensitive_score - special_char_score)

    str1_space_norm, str2_space_norm = ' '.join(str1.split()), ' '.join(str2.split())
    space_score = fuzz.ratio(str1_space_norm, str2_space_norm)
    space_diff = abs(case_sensitive_score - space_score)

In [89]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import re
from rapidfuzz import fuzz
from pyspark.ml.linalg import Vectors, VectorUDT

# Initialize Spark session
spark = SparkSession.builder.appName("FuzzyMatchingPrediction").getOrCreate()

# Load trained model
model_path = "/content/FuzzyMatching_RF_Model"
model = RandomForestClassificationModel.load(model_path)

# Function to compute fuzzy similarity scores
def get_fuzzy_scores(str1, str2):
    """ Compute fuzzy matching scores and return as a DenseVector for Spark MLlib. """
    case_sensitive_score = fuzz.ratio(str1, str2)
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())
    case_sensitivity_diff = abs(case_sensitive_score - case_insensitive_score)

    str1_clean, str2_clean = re.sub(r'[^\w\s]', '', str1), re.sub(r'[^\w\s]', '', str2)
    special_char_score = fuzz.ratio(str1_clean, str2_clean)
    special_char_diff = abs(case_sensitive_score - special_char_score)

    str1_space_norm, str2_space_norm = ' '.join(str1.split()), ' '.join(str2.split())
    space_score = fuzz.ratio(str1_space_norm, str2_space_norm)
    space_diff = abs(case_sensitive_score - space_score)

    token_score = fuzz.token_set_ratio(str1, str2)
    abbreviation_diff = abs(case_sensitive_score - token_score)

    # Ensure output is a DenseVector (needed for Spark ML)
    return Vectors.dense([ case_sensitivity_diff,
                          special_char_diff, space_diff])

# UDF to convert string pairs into DenseVector for Spark ML
fuzzy_udf = udf(get_fuzzy_scores, VectorUDT())

# Example input words
word1 = "srin#^iVas"
word2 = "SrinIvAS"

# Convert input into DataFrame for prediction
df_input = spark.createDataFrame([(word1, word2)], ["Source", "Destination"])

# Apply fuzzy matching function
df_input = df_input.withColumn("features", fuzzy_udf(col("Source"), col("Destination")))




In [113]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import re
from rapidfuzz import fuzz
from pyspark.ml.linalg import Vectors, VectorUDT

# Initialize Spark session
spark = SparkSession.builder.appName("FuzzyMatchingPrediction").getOrCreate()

# Load trained model
model_path = "/content/FuzzyMatching_RF_Model"
model = RandomForestClassificationModel.load(model_path)

# Function to compute fuzzy similarity scores
def get_fuzzy_scores(str1, str2):
    """ Compute fuzzy matching scores and return as a DenseVector for Spark MLlib. """
    case_sensitive_score = fuzz.ratio(str1, str2)
    case_insensitive_score = fuzz.ratio(str1.lower(), str2.lower())
    case_sensitivity_diff = abs(case_sensitive_score - case_insensitive_score)

    str1_clean, str2_clean = re.sub(r'[^\w\s]', '', str1), re.sub(r'[^\w\s]', '', str2)
    special_char_score = fuzz.ratio(str1_clean, str2_clean)
    special_char_diff = abs(case_sensitive_score - special_char_score)

    str1_space_norm, str2_space_norm = ' '.join(str1.split()), ' '.join(str2.split())
    space_score = fuzz.ratio(str1_space_norm, str2_space_norm)
    space_diff = abs(case_sensitive_score - space_score)

    token_score = fuzz.token_set_ratio(str1, str2)
    abbreviation_diff = abs(case_sensitive_score - token_score)

    # Ensure output is a DenseVector (needed for Spark ML)
    return Vectors.dense([ case_sensitivity_diff,
                          special_char_diff, space_diff])

# UDF to convert string pairs into DenseVector for Spark ML
fuzzy_udf = udf(get_fuzzy_scores, VectorUDT())

# Example input words
word1 = "srin  vas"
word2 = "srinivas"

# Convert input into DataFrame for prediction
df_input = spark.createDataFrame([(word1, word2)], ["Source", "Destination"])

# Apply fuzzy matching function
df_input = df_input.withColumn("features", fuzzy_udf(col("Source"), col("Destination")))




In [114]:
Case Sensitivity Diff|Special Character Diff|Extra Space Diff

SyntaxError: invalid syntax (<ipython-input-114-3e5756aa53d2>, line 1)

In [115]:
df_input.show()

+---------+-----------+--------------------+
|   Source|Destination|            features|
+---------+-----------+--------------------+
|srin  vas|   srinivas|[0.0,0.0,5.147058...|
+---------+-----------+--------------------+



In [116]:
# Make prediction
prediction = model.transform(df_input).select("prediction").collect()[0][0]
prediction

1.0

In [117]:
# Map prediction index to label
label_mapping = {
    0.0: "Extra Space Issues",
    1.0: "Special Character Differences",
    2.0: "Case Sensitivity"
}
predicted_label = label_mapping.get(prediction, "Unknown")

print(f"Predicted Category: {predicted_label}")

Predicted Category: Special Character Differences
