In [1]:
import pandas as pd
import difflib
import re
from collections import deque
from logger import logger

In [2]:
# Load the input workbook into the DataFrame used by the cells below.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot
# run on another machine as-is; consider a configurable relative path.
df = pd.read_excel("D:\\Documents\\input file.xlsx")

In [3]:
# List the column labels of the loaded sheet.
# NOTE(review): the recorded output shows ' Floor_Number' with a leading space
# and a 'Street_Road_Name_MatchScore.1' duplicate — verify the sheet header.
print(df.columns)

Index(['SrNo', 'House_Flat_Number', 'House_Flat_Number_MatchScore', 'Town',
       'Street_Road_Name', 'Street_Road_Name_MatchScore',
       'Street_Road_Name_MatchScore.1', 'City', 'City_MatchScore',
       ' Floor_Number', 'Floor_Number_MatchScore', 'Country', 'PINCODE',
       'PINCODE_MatchScore', 'Premise_Building_Name',
       'Premise_Building_Name_MatchScore', 'Landmark', 'Landmark_MatchScore',
       'State', 'State_MatchScore', 'Name', 'Name_Extracted_From_OVD',
       'Name_match_percentage', 'Name_MatchScore', 'UID',
       'UID_Extracted_From_OVD', 'UID_MatchScore',
       'Address_Extracted_From_OVD', 'Final_Address_Match',
       'Final_Address_MatchScore', 'Overall_Match', 'Final_Remarks',
       'Document_Type'],
      dtype='object')


In [4]:
# Display the loaded frame as this cell's output.
df

Unnamed: 0,SrNo,House_Flat_Number,House_Flat_Number_MatchScore,Town,Street_Road_Name,Street_Road_Name_MatchScore,Street_Road_Name_MatchScore.1,City,City_MatchScore,Floor_Number,...,Name_MatchScore,UID,UID_Extracted_From_OVD,UID_MatchScore,Address_Extracted_From_OVD,Final_Address_Match,Final_Address_MatchScore,Overall_Match,Final_Remarks,Document_Type
0,SR1,Flat 404,,Noida,Buddha Nagar,,,Noida,,4th,...,,424831815689,424831800000.0,,,,,,,
1,SR2,E-707,,Undari,Saswad Road,,,Pune,,2ND,...,,856364454912,856364500000.0,,"W/O Bhavesh Shah, E-707 Godrej Prana, Kondhwa ...",,,,,
2,SR3,C-403,,Mundhava,Hanuman Nagar,,,Pune,,,...,,673496894312,673496900000.0,,"MAHESH DATTU HIRE, C-403-SAVLI SAFFRON, HANUMA...",,,,,
3,SR4,Flat 404,,Noida,Buddha Nagar,,,Noida,,4th,...,,673496894312,,,,,,,,


In [5]:
def normalize_address(address):
    """Return a canonical, comparison-friendly form of a free-text address.

    Steps, in order: lowercase, spell out ``#`` as ``number``, expand common
    street abbreviations that appear as whole words, drop punctuation, and
    finally collapse whitespace.

    Parameters
    ----------
    address : str
        Raw address text. Any non-string value (``None``, a NaN read from an
        empty spreadsheet cell, ...) is treated as an empty address.

    Returns
    -------
    str
        The normalized address, or ``""`` for non-string input.

    NOTE(review): a later cell in this notebook defines ``normalize_address``
    again; on a top-to-bottom run that later definition is the one in effect.
    """
    if not isinstance(address, str):
        return ""

    text = address.lower()

    # '#' is not a word character, so a r'\b#\b' pattern cannot match a leading
    # '#' (as in "#12"). Substitute it directly, padded so it stays its own word.
    text = text.replace("#", " number ")

    # Replace common abbreviations with full forms (whole-word matches only).
    abbreviations = {
        "st": "street",
        "rd": "road",
        "ave": "avenue",
        "blvd": "boulevard",
        "dr": "drive",
        "ln": "lane",
        "ct": "court",
        "pl": "place",
        "apt": "apartment",
        "fl": "floor",
    }
    for abbr, full in abbreviations.items():
        text = re.sub(rf'\b{re.escape(abbr)}\b', full, text)

    # Remove any special characters.
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse whitespace last: removing punctuation such as " - " would
    # otherwise leave doubled spaces behind.
    return re.sub(r'\s+', ' ', text).strip()


In [6]:
# Name Matching Rules

# Rule 1: Exact Name Match
def exact_letter_match(Name, Name_Extracted_From_OVD):
    """Return True when both names are equal after trimming and lowercasing."""
    provided = Name.strip().lower()
    extracted = Name_Extracted_From_OVD.strip().lower()
    return provided == extracted

# Rule 2: Abbreviated Name Match
def abbreviated_name_match(Name, Name_Extracted_From_OVD):
    """Check a one-token extracted name against the first initial of ``Name``.

    Returns True only when the extracted name consists of exactly one
    whitespace-separated token and that token starts with the same letter
    (case-insensitively) as the first token of ``Name``. Returns False when
    ``Name`` is empty or blank, instead of raising ``IndexError``.

    NOTE(review): any single word sharing the first letter qualifies
    (e.g. "Jack" against "John Smith") — confirm this is the intended rule.
    """
    input_parts = Name.split()
    extracted_parts = Name_Extracted_From_OVD.split()

    # Nothing to compare against, or the extracted name is not a lone token.
    if not input_parts or len(extracted_parts) != 1:
        return False

    return input_parts[0][0].lower() == extracted_parts[0][0].lower()

# Rule 3: Ignore Middle Names (Match First and Last Names)
def ignore_middle_names(Name, Name_Extracted_From_OVD):
    """Compare only the first and last tokens of each name, ignoring case.

    Returns False when either name has fewer than two tokens.
    """
    given = [token.lower() for token in Name.strip().split()]
    found = [token.lower() for token in Name_Extracted_From_OVD.strip().split()]

    if min(len(given), len(found)) < 2:
        return False

    return (given[0], given[-1]) == (found[0], found[-1])

# Rule 4: Circular Match (One Name Is a Rotation of the Other)
def circular_match(Name, Name_Extracted_From_OVD):
    """Return True when the extracted name is a character rotation of ``Name``.

    Both values are trimmed and lowercased first; strings of different length
    can never be rotations of each other.
    """
    source = Name.strip().lower()
    candidate = Name_Extracted_From_OVD.strip().lower()

    same_length = len(source) == len(candidate)
    # Every rotation of a string occurs inside that string written twice.
    return same_length and candidate in source * 2

# Rule 5: Single Letter Abbreviation Match (First or Last Name)
def single_letter_abbreviation(Name, Name_Extracted_From_OVD):
    """Check a one-letter extracted name against the initials of ``Name``.

    Returns True when the extracted name (trimmed, lowercased) is exactly one
    character long and equals the first letter of either the first or the last
    token of ``Name``. Returns False when ``Name`` is empty or blank, instead
    of raising ``IndexError``.
    """
    input_parts = Name.split()
    initial = Name_Extracted_From_OVD.strip().lower()

    if len(initial) != 1 or not input_parts:
        return False

    return initial in (input_parts[0][0].lower(), input_parts[-1][0].lower())

# Rule 6: Part Match (Any Part of Name Matches)
def match_any_part(Name, Name_Extracted_From_OVD):
    """Return True when any token of the extracted name equals a token of ``Name``.

    Tokens are whitespace-separated and compared case-insensitively, in line
    with the other name rules in this cell (which all lowercase before
    comparing).
    """
    input_parts = {part.lower() for part in Name.split()}
    return any(part.lower() in input_parts for part in Name_Extracted_From_OVD.split())

# Main Function to Match Names Based on Various Rules
def name_match(Name, Name_Extracted_From_OVD):
    """Score two names by how many of the six name rules accept them.

    Every rule is evaluated; each one that returns a truthy value contributes
    20 points, and the total is capped at 100.
    """
    rules = (
        exact_letter_match,
        abbreviated_name_match,
        ignore_middle_names,
        match_any_part,
        circular_match,
        single_letter_abbreviation,
    )

    points = 0
    for rule in rules:
        if rule(Name, Name_Extracted_From_OVD):
            points += 20

    return min(points, 100)


In [7]:
# Function to calculate similarity ratio

from difflib import SequenceMatcher

def similarity_ratio(Name, Name_Extracted_From_OVD):
    """Return difflib's ``SequenceMatcher`` ratio (0.0 to 1.0) for the two sequences."""
    matcher = SequenceMatcher(a=Name, b=Name_Extracted_From_OVD)
    return matcher.ratio()

In [8]:
# Address Matching Rules

# Normalize address by converting it to lowercase and removing extra spaces
def normalize_address(address):
    """Lowercase ``address`` and collapse every whitespace run to a single space.

    NOTE(review): this re-definition replaces the earlier ``normalize_address``
    in this notebook, so that version's abbreviation expansion and punctuation
    removal are not applied by the functions below.
    """
    words = address.lower().split()
    return ' '.join(words)

# Functions to match specific address components
def house_flat_match(House_Flat_Number, Address_Extracted_From_OVD):
    """Score (0-100) the house/flat number against the extracted address.

    The first token made of digits plus at most one trailing letter is taken
    from each argument and the two tokens are compared, case-insensitively,
    with ``similarity_ratio``.

    Returns 0 when the extracted address is not a string (e.g. NaN from an
    empty cell) or when either side contains no such token.
    """
    if not isinstance(Address_Extracted_From_OVD, str):
        return 0

    number_pattern = r'\b\d+[a-zA-Z]?\b'
    # str() lets a purely numeric cell (int/float) through; a missing value
    # becomes "nan"/"None", which holds no digits and so falls through to 0.
    house_flat_input = re.search(number_pattern, str(House_Flat_Number))
    house_flat_extracted = re.search(number_pattern, Address_Extracted_From_OVD)

    if house_flat_input and house_flat_extracted:
        return similarity_ratio(
            house_flat_input.group().lower(),
            house_flat_extracted.group().lower(),
        ) * 100
    return 0

def street_road_match(Street_Road_Name, Address_Extracted_From_OVD):
    """Score (0-100) the street/road name against the extracted address.

    Both values are passed through ``normalize_address``, purely numeric words
    are dropped, and the remaining text is compared with ``similarity_ratio``.

    Returns 0 when either argument is not a string (e.g. NaN from an empty
    cell) instead of raising.

    NOTE(review): the street name is compared against the *whole* remaining
    address text, so a correct street inside a long address still scores low —
    confirm whether a partial/windowed comparison is wanted.
    """
    if not isinstance(Street_Road_Name, str) or not isinstance(Address_Extracted_From_OVD, str):
        return 0

    input_address = normalize_address(Street_Road_Name)
    extracted_address = normalize_address(Address_Extracted_From_OVD)
    input_street = ' '.join(word for word in input_address.split() if not word.isdigit())
    extracted_street = ' '.join(word for word in extracted_address.split() if not word.isdigit())
    return similarity_ratio(input_street, extracted_street) * 100

def city_match(City, Address_Extracted_From_OVD):
    """Score (0-100) how well the city appears in the extracted address.

    The first alphabetic word of ``City`` is compared, case-insensitively,
    against every alphabetic word of the address, and the best
    ``similarity_ratio`` is returned. Comparing only against the first word
    of the address would score the city against whatever the address starts
    with, typically a person's name.

    Returns 0 when either argument is not a string (e.g. NaN from an empty
    cell) or contains no alphabetic word.
    """
    if not isinstance(City, str) or not isinstance(Address_Extracted_From_OVD, str):
        return 0

    word_pattern = r'\b[a-zA-Z]+\b'
    city_input = re.search(word_pattern, City)
    address_words = re.findall(word_pattern, Address_Extracted_From_OVD)
    if not city_input or not address_words:
        return 0

    target = city_input.group().lower()
    return max(similarity_ratio(target, word.lower()) for word in address_words) * 100

def floor_number_match(Floor_Number, Address_Extracted_From_OVD):
    """Score (0-100) the floor number against the extracted address.

    The first token made of digits followed by optional letters (e.g. "4th",
    "2ND") is taken from each argument and the two tokens are compared,
    case-insensitively, with ``similarity_ratio``.

    Returns 0 when the extracted address is not a string (e.g. NaN from an
    empty cell) or when either side contains no such token.

    NOTE(review): the first numeric token of an address is often the house
    number rather than the floor — confirm the extraction rule.
    """
    if not isinstance(Address_Extracted_From_OVD, str):
        return 0

    floor_pattern = r'\b\d+[a-zA-Z]*\b'
    # str() lets a numeric cell through; a missing value becomes "nan"/"None",
    # which holds no digits and so falls through to 0.
    floor_input = re.search(floor_pattern, str(Floor_Number))
    floor_extracted = re.search(floor_pattern, Address_Extracted_From_OVD)

    if floor_input and floor_extracted:
        return similarity_ratio(
            floor_input.group().lower(),
            floor_extracted.group().lower(),
        ) * 100
    return 0

def pincode_match(PINCODE, Address_Extracted_From_OVD):
    """Score (0-100) the PIN code against the extracted address.

    The first standalone run of exactly six digits is taken from each argument
    and the two runs are compared with ``similarity_ratio``.

    ``PINCODE`` may be a string or a number: a numeric spreadsheet column is
    read as int/float, which ``re.search`` cannot scan directly. Returns 0
    when the extracted address is not a string (e.g. NaN from an empty cell)
    or when either side has no six-digit run.
    """
    if not isinstance(Address_Extracted_From_OVD, str):
        return 0

    pin_pattern = r'\b\d{6}\b'
    # str(411028) -> "411028"; str(411028.0) -> "411028.0", whose leading six
    # digits still match; a missing value becomes "nan"/"None" and does not.
    pincode_input = re.search(pin_pattern, str(PINCODE))
    pincode_extracted = re.search(pin_pattern, Address_Extracted_From_OVD)

    if pincode_input and pincode_extracted:
        return similarity_ratio(pincode_input.group(), pincode_extracted.group()) * 100
    return 0

def premise_building_match(Premise_Building_Name, Address_Extracted_From_OVD):
    """Score (0-100) how well the premise/building name appears in the address.

    The first alphabetic word of ``Premise_Building_Name`` is compared,
    case-insensitively, against every alphabetic word of the address, and the
    best ``similarity_ratio`` is returned. Comparing only against the first
    word of the address would score the building against whatever the address
    starts with.

    Returns 0 when either argument is not a string (e.g. NaN from an empty
    cell) or contains no alphabetic word.
    """
    if not isinstance(Premise_Building_Name, str) or not isinstance(Address_Extracted_From_OVD, str):
        return 0

    word_pattern = r'\b[a-zA-Z]+\b'
    premise_input = re.search(word_pattern, Premise_Building_Name)
    address_words = re.findall(word_pattern, Address_Extracted_From_OVD)
    if not premise_input or not address_words:
        return 0

    target = premise_input.group().lower()
    return max(similarity_ratio(target, word.lower()) for word in address_words) * 100

def landmark_match(Landmark, Address_Extracted_From_OVD):
    """Score (0-100) the "near ..." landmark phrase of both arguments.

    In each argument, the text from the first standalone word "near" (any
    case) to the end of that line is extracted; the two phrases are compared,
    case-insensitively, with ``similarity_ratio``.

    Returns 0 when either argument is not a string (e.g. NaN from an empty
    cell) or does not contain the word "near".

    NOTE(review): a ``Landmark`` value that holds only the landmark name
    (without "near") always scores 0 — confirm the expected input format.
    """
    if not isinstance(Landmark, str) or not isinstance(Address_Extracted_From_OVD, str):
        return 0

    near_pattern = r'\bnear\b.*'
    landmark_input = re.search(near_pattern, Landmark, re.IGNORECASE)
    landmark_extracted = re.search(near_pattern, Address_Extracted_From_OVD, re.IGNORECASE)

    if landmark_input and landmark_extracted:
        return similarity_ratio(
            landmark_input.group().lower(),
            landmark_extracted.group().lower(),
        ) * 100
    return 0

def state_match(State, Address_Extracted_From_OVD):
    """Score (0-100) how well the state appears in the extracted address.

    The first alphabetic word of ``State`` is compared, case-insensitively,
    against every alphabetic word of the address, and the best
    ``similarity_ratio`` is returned. Comparing only against the first word
    of the address would score the state against whatever the address starts
    with.

    Returns 0 when either argument is not a string (e.g. NaN from an empty
    cell) or contains no alphabetic word.
    """
    if not isinstance(State, str) or not isinstance(Address_Extracted_From_OVD, str):
        return 0

    word_pattern = r'\b[a-zA-Z]+\b'
    state_input = re.search(word_pattern, State)
    address_words = re.findall(word_pattern, Address_Extracted_From_OVD)
    if not state_input or not address_words:
        return 0

    target = state_input.group().lower()
    return max(similarity_ratio(target, word.lower()) for word in address_words) * 100

# Function to match addresses based on normalization and field-specific matching
def address_match(House_Flat_Number, Street_Road_Name, City, Floor_Number, PINCODE, Premise_Building_Name, Landmark, State, Address_Extracted_From_OVD, cutoff=70):
    """Average the eight per-component address scores and compare to ``cutoff``.

    Each component is scored against ``Address_Extracted_From_OVD`` by its
    matcher function from this cell. The matchers are called by their function
    names: the ``*_MatchScore`` identifiers are DataFrame column labels, not
    callables, and raise ``NameError`` when called.

    Returns
    -------
    tuple
        ``(is_match, total_score)`` where ``total_score`` is the unweighted
        mean of the component scores and ``is_match`` is
        ``total_score >= cutoff``.

    NOTE(review): a later cell defines ``address_match`` again with a weighted
    score and a different return shape; on a top-to-bottom run that later
    definition is the one in effect.
    """
    component_scores = (
        house_flat_match(House_Flat_Number, Address_Extracted_From_OVD),
        street_road_match(Street_Road_Name, Address_Extracted_From_OVD),
        city_match(City, Address_Extracted_From_OVD),
        floor_number_match(Floor_Number, Address_Extracted_From_OVD),
        pincode_match(PINCODE, Address_Extracted_From_OVD),
        premise_building_match(Premise_Building_Name, Address_Extracted_From_OVD),
        landmark_match(Landmark, Address_Extracted_From_OVD),
        state_match(State, Address_Extracted_From_OVD),
    )

    # Average score for all components.
    total_score = sum(component_scores) / len(component_scores)

    return total_score >= cutoff, total_score



In [9]:
# Weighted sum of the individual scores
def address_match(House_Flat_Number, Street_Road_Name, City, Floor_Number, PINCODE, Premise_Building_Name, Landmark, State, Address_Extracted_From_OVD, cutoff=70):
    """Return the weighted address score, or 0 when it is below ``cutoff``.

    Each component is scored against ``Address_Extracted_From_OVD`` by its
    matcher function. The matchers are called by their function names: the
    ``*_MatchScore`` identifiers are DataFrame column labels, not callables,
    and raise ``NameError`` when called.

    Weights (summing to 1.0): house/flat 0.15, street/road 0.15, city 0.10,
    floor 0.10, PIN code 0.15, premise/building 0.10, landmark 0.10,
    state 0.15.
    """
    house_flat_score = house_flat_match(House_Flat_Number, Address_Extracted_From_OVD)
    street_road_score = street_road_match(Street_Road_Name, Address_Extracted_From_OVD)
    city_score = city_match(City, Address_Extracted_From_OVD)
    floor_number_score = floor_number_match(Floor_Number, Address_Extracted_From_OVD)
    pincode_score = pincode_match(PINCODE, Address_Extracted_From_OVD)
    premise_building_score = premise_building_match(Premise_Building_Name, Address_Extracted_From_OVD)
    landmark_score = landmark_match(Landmark, Address_Extracted_From_OVD)
    state_score = state_match(State, Address_Extracted_From_OVD)

    # Weighted calculation
    final_score = (
         house_flat_score * 0.15 +
         street_road_score * 0.15 +
         city_score * 0.10 +
         floor_number_score * 0.10 +
         pincode_score * 0.15 +
         premise_building_score * 0.10 +
         landmark_score * 0.10 +
         state_score * 0.15
    )

    # Return final score if it meets or exceeds the cutoff
    if final_score >= cutoff:
        return final_score
    return 0


In [10]:
# Exact UID comparison, stored per row as a boolean in 'UID_MatchScore'.
#
# The extracted UIDs are parsed as floats (the displayed frame shows values
# such as 424831800000.0), so their text form ends in '.0' and would never
# equal the integer UID's text; that suffix is stripped before comparing.
# Rows where either UID is missing count as "no match" — otherwise two missing
# values would both stringify to 'nan' and compare equal.
uid_input = df['UID'].astype(str).str.strip().str.replace(r'\.0$', '', regex=True)
uid_extracted = df['UID_Extracted_From_OVD'].astype(str).str.strip().str.replace(r'\.0$', '', regex=True)
both_present = df['UID'].notna() & df['UID_Extracted_From_OVD'].notna()

df['UID_MatchScore'] = both_present & (uid_input == uid_extracted)

print(df)


  SrNo House_Flat_Number  House_Flat_Number_MatchScore      Town  \
0  SR1          Flat 404                           NaN     Noida   
1  SR2             E-707                           NaN    Undari   
2  SR3             C-403                           NaN  Mundhava   
3  SR4          Flat 404                           NaN     Noida   

  Street_Road_Name  Street_Road_Name_MatchScore  \
0     Buddha Nagar                          NaN   
1      Saswad Road                          NaN   
2    Hanuman Nagar                          NaN   
3     Buddha Nagar                          NaN   

   Street_Road_Name_MatchScore.1   City  City_MatchScore  Floor_Number  ...  \
0                            NaN  Noida              NaN           4th  ...   
1                            NaN   Pune              NaN           2ND  ...   
2                            NaN   Pune              NaN           NaN  ...   
3                            NaN  Noida              NaN           4th  ...   

   Name

In [11]:
def overall_match(Name, Name_Extracted_From_OVD, House_Flat_Number, Street_Road_Name, City, Floor_Number, PINCODE, Premise_Building_Name, Landmark, State, Address_Extracted_From_OVD, UID, UID_Extracted_From_OVD, cutoff=70):
    """Combine the name, address and UID checks into one result dictionary.

    The scorers are called by their function names (``name_match``,
    ``address_match``): the ``*_MatchScore`` identifiers are DataFrame column
    labels, not callables. ``overall_score`` is computed here before it is
    used.

    Returns
    -------
    dict
        ``Name_MatchScore``          - result of ``name_match``.
        ``Final_Address_MatchScore`` - result of ``address_match`` (called
                                       with its default cutoff).
        ``UID_MatchScore``           - True when both UIDs are present and
                                       equal as text.
        ``Overall_Match``            - combined score (see note below).
        ``Match Status``             - ``Overall_Match >= cutoff``.

    NOTE(review): the notebook never states how the three results should be
    combined; an unweighted mean is used here, with the UID counting as 100
    when it matches and 0 otherwise — confirm the intended weighting.
    """
    # Calculate individual scores
    name_score = name_match(Name, Name_Extracted_From_OVD)
    address_score = address_match(House_Flat_Number, Street_Road_Name, City, Floor_Number, PINCODE, Premise_Building_Name, Landmark, State, Address_Extracted_From_OVD)

    # Compare UIDs as text. A float-parsed UID stringifies with a trailing
    # '.0', which is dropped; a missing value on either side is never a match.
    if pd.isna(UID) or pd.isna(UID_Extracted_From_OVD):
        uid_score = False
    else:
        uid_text = re.sub(r'\.0$', '', str(UID).strip())
        extracted_uid_text = re.sub(r'\.0$', '', str(UID_Extracted_From_OVD).strip())
        uid_score = uid_text == extracted_uid_text

    overall_score = (name_score + address_score + (100 if uid_score else 0)) / 3

    # Determine if the overall score meets the cutoff
    match_status = overall_score >= cutoff

    # Return results as a dictionary
    return {
        "Name_MatchScore": name_score,
        "Final_Address_MatchScore": address_score,
        "UID_MatchScore": uid_score,
        "Overall_Match": overall_score,
        "Match Status": match_status
    }
