In [108]:
!pip install transformers==4.31.0


[0mCollecting transformers==4.31.0
  Using cached transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.31.0-py3-none-any.whl (7.4 MB)
Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[0mInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
[1;31merror[0m: [1muninstall-no-record-file[0m

[31m×[0m Cannot uninstall tokenizers 0.21.0
[31m╰─>[0m The package's contents are unknown: no RECORD file was found for tokenizers.

[1;36mhint[0m: You might be able to recover from this via: [32mpip install --force-reinstall --no-deps tokenizers==0.21.0[0m


In [28]:
pip install huggingface-hub==0.24.5


Note: you may need to restart the kernel to use updated packages.


In [115]:
from sentence_transformers import SentenceTransformer, models
from datasets import load_dataset
import transformers
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from fuzzywuzzy import fuzz
import re
import numpy as np
import pickle
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
import jellyfish

In [116]:
print(transformers.__version__)

4.31.0


In [117]:
class BertForSTS(torch.nn.Module):

    def __init__(self):
        super(BertForSTS, self).__init__()
        self.bert = models.Transformer('bert-base-uncased', max_seq_length=128)
        self.pooling_layer = models.Pooling(self.bert.get_word_embedding_dimension())
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        output = self.sts_bert(input_data)['sentence_embedding']
        return output

In [118]:
class STSBDataset(torch.utils.data.Dataset):

    def __init__(self, dataset):

        similarity_scores = [i['labels'] for i in dataset]
        self.normalized_similarity_scores = [i/5.0 for i in similarity_scores]
        self.first_sentences = [i['name1'] for i in dataset]
        self.second_sentences = [i['name2'] for i in dataset]
        self.concatenated_sentences = [[str(x), str(y)] for x,y in zip(self.first_sentences, self.second_sentences)]

    def __len__(self):
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        return torch.tensor(self.normalized_similarity_scores[idx])

    def get_batch_texts(self, idx):
        return tokenizer(self.concatenated_sentences[idx], padding='max_length', max_length=128, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y


def collate_fn(texts):
    input_ids = texts['input_ids']
    attention_masks = texts['attention_mask']
    features = [{'input_ids': input_id, 'attention_mask': attention_mask}
                for input_id, attention_mask in zip(input_ids, attention_masks)]
    return features

In [119]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA L4


In [120]:
def predict_similarity_embedding_model(sentence_pair):
    """
    Predict similarity between a pair of sentences
    """
    test_input = tokenizer(sentence_pair, padding='max_length', max_length=128, truncation=True, return_tensors="pt").to(device)
    test_input['input_ids'] = test_input['input_ids']
    test_input['attention_mask'] = test_input['attention_mask']
    del test_input['token_type_ids']
    output = model(test_input)
    sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()
    return sim

In [121]:
import pickle
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)
    
with open('tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

In [122]:
sim = predict_similarity_embedding_model(["manmeet singh", "singh manmeet"])

In [123]:
sim

0.7494423389434814

In [124]:
KEYWORDS = [
    "traders","trading", "enterprise", "garments", "collection", "food", "clothes", 
    "glass", "fittings", "digital", "kirana", "medical", "agency","tex",'logistics',
    "security", "systems", "badges", "hospitality", "jewellers",'lic','agent',
    "ready-made", "store", "hospital", "restaurant", "auto", "center", 
    "dairy", "home", "products", "services", "furniture", "hardware", 
    "pharmacy", "stationery", "treatments", "nutrition", "wellness", 
    "sweets", "resort", "kitchen", "clothing", "market",'workshop','agency','consumer','amale' 
    "poultry", "seeds", "pesticides", "sales", "cafe", "clinic", 'project'
    "supermart", "distributors", "automobiles", "electricity", 
    "electronics", "general", "provision", "fertilizers", "agriculture", 
    "beverages", "textiles", "plumbing", "supplies", "handicrafts", 
    "construction", "medical", "bakery", "tissue", "cleaning", 
    "appliances", "homecare", "kitchenware", "decor", "glass and fittings",
    "interiors", "shopping", "crafts", "tools", "wholesale", 
    "retail", "outlet", "merchants", "trade", "distribution", 
    "solutions", "innovation", "consultancy", "services", "equipment", 
    "manufacturing", "exports", "imports", "packaging", "network", 
    "consultants", "transport", "moving", "storage", "logistics", 
    "construction", "real estate",'distributor','wines','hardware',
    'plywood','company','craft','soda','station','mobile','brothers','gas','trad','plywood','hp'
    "brokerage", "management", 'handloom','co.','tvs','marketing',
    "finance", "investment", "funding", "support", "technology", 
    "software", "applications", "digital marketing", "advertising", 
    "communication", "entertainment", "events", "tourism", "travel", 
    "transportation", "automotive", "services", "supply chain", 
    "fashion", "cosmetics", "beauty", "spa", "wellness", "glass and fitting",
    "personal care", "gifts", "custom", "specialty", "craftsmanship", 
    "fashions", "motors", "enterprises", "garment", "cloth centre", "mart", 
    "foods", "silk and readymade", "wool centre", "jewellery", "mill", 
    "farms", "farm", "electrical", "egg centre", "centre", 
    "vegetable and fruits", "vegetables", "fruits", "pvt", "pvt ltd", 
    "limited", "solutions", "energies", "photo", "studio", "works", 
    "associates", "medico", "agencies", "diagnosis", "cool drinks", 
    "drinks", "care", "liquor", "automobiles", "materials", "diagnostics", 
    "provision", "trader", "farms", "farm", "stations", "restaurant", 
    "creations", "travels", "hardware", "printers", "graphics", 
    "fertilisers", "house", "studio", "private", "appliances", "steels", 
    "shop", "metals", "international", "jwellers", "corporation", 
    "dresses", "industries", "electricals", "company", "lim", "colddrinks", 
    "electron", "medicines", "llc", "computers", "hotel", "spa", 
    "cosmetics", "telecom", "sarees", "petroleums", "bhandar",'store','stores', 
    "surgical", "wines", "constructions", "shoppy", "lab", "builders", 
    "footwear", "wear", "shoe", "repair", "ventures", "paint", "depot",'cake','chinies',
    "tent", "decorators", "communications", "pharmacy", "products",'textile','CERAMIC','Pharmaceuticals','stores','sons',
]

In [125]:
len(KEYWORDS)

255

In [126]:
SPECIAL_CHAR_DOT_REGEX = r"[.]"
SPECIAL_CHARS_REGEX = r"[-+.^:,_/\s]+" 
SALUTATION_REGEX = r"^(shree|shri|miss|smt|mrs|mr|ms|dr|master|hon|sir|madam|prof|capt|major|rev|fr|br)\s*"
PARENT_SPOUSE_NAME_REGEX = r"(?:\s*(?:s/o|d/o|w/o|so|do|wo|daughter of|son of|wife of|husband of)\s*)"
COMMON_MUSLIM_SALUTATIONS_MOHAMMAD_REGEX = r"\b(mohammad|mohammed|muhamed|mohd|mohamed|mohamad|muhamad|muhammad|muhammed|muhammet|mohamud|mohummad|mohummed|mouhamed|muhamaad|mohammod|mouhamad|mo|md|mahmood|mahmud|ahmad|ahmed|hameed|hamid|hammed|mahd|mahmod|mohd|mouhammed|mohamad|muhmood|mohhammed|muhmamed|mohmed|mohmat|muhmat|mu|m|shaikh|mo)\b"
LAST_NAMES_AGARWAL_VARIANTS_REGEX = r"\b(aggarwal|agrawal|agarwal|aggrawal|agarwalla|agarwal)\b"



def convert_to_lower(name):
    if not isinstance(name, str):
        name = str(name) if name is not None else ""
    return name.lower()

def replace_adjacent_duplicates(value):
    if isinstance(value, str):
        return re.sub(r'(.)\1+', r'\1', value)
    return value

def replace_characters(name):
    replacements = {'e': 'i', 'j': 'z', 'v': 'w', 'q': 'k'}
    for old, new in replacements.items():
        name = name.replace(old, new)
    return name

def replace_bigrams(name):
    replacements = {'ph': 'f', 'gh': 'g', 'th': 't', 'kh': 'k', 'dh': 'd', 'ch': 'c', 'sh': 's', 'au': 'o',
                    'bh': 'b', 'ks': 'x', 'ck': 'k', 'ah': 'h', 'wh': 'w', 'wr': 'r'}
    for old, new in replacements.items():
        name = name.replace(old, new)
    return name

def remove_extra_spaces(name):
    return re.sub(r'\s+', ' ', name).strip()

def remove_consonant_a(name):
    consonants = 'bcdfghjklmnpqrstvwxyz'
    new_name = ''.join([name[i] for i in range(len(name)) if not (i > 0 and name[i] == 'a' and name[i - 1].lower() in consonants)])
    return new_name

def remove_special_characters(text):
    text = re.sub(SPECIAL_CHAR_DOT_REGEX, '', text)
    text = re.sub(SPECIAL_CHARS_REGEX, '', text)
    return text.strip()

def remove_salutations(text):
    return re.sub(SALUTATION_REGEX, '', text, flags=re.IGNORECASE).strip()

def remove_parent_spouse_name(text):
    return re.sub(r'\s*(?:s[\s./]*o|d[\s./]*o|w[\s./]*o|son[\s]*of|daughter[\s]*of|wife[\s]*of|husband[\s]*of|child[\s]*of)\s+[\w\s,.]*$', '', text, flags=re.IGNORECASE).strip()

def remove_common_muslim_variations(text):
    return re.sub(COMMON_MUSLIM_SALUTATIONS_MOHAMMAD_REGEX, '', text, flags=re.IGNORECASE).strip()

def remove_agarwal_variants(text):
    return re.sub(LAST_NAMES_AGARWAL_VARIANTS_REGEX, '', text, flags=re.IGNORECASE).strip()

def remove_stop_words(text):
    stop_words = ['devi', 'dei', 'debi', 'kmr', 'kumr','bhai', 'bhau', 'bai', 'ben', 'kaur', 'Md', 'Mohd', 'Mohammad', 'Mohamad','alam','shekh','sek']
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)



In [127]:
def preprocess_layer1(name):
    name = str(name) if name is not None else ""
    name = convert_to_lower(name)
    name = remove_salutations(name)
    name = remove_parent_spouse_name(name)
    name = remove_stop_words(name)
    name = remove_extra_spaces(name)
    return name

def check_keywords_layer1(name1, name2):
    name1 = str(name1) if name1 is not None else ""
    name2 = str(name2) if name2 is not None else ""

    found_in_name1 = any(keyword in name1.lower() for keyword in KEYWORDS)
    found_in_name2 = any(keyword in name2.lower() for keyword in KEYWORDS)

    if found_in_name1 and found_in_name2:
        return 1  
    elif found_in_name1 or found_in_name2:
        return 0
    return 1
    
def calculate_fuzzy_similarity_layer1(name1, name2):
    name1 = preprocess_layer1(name1)
    name2 = preprocess_layer1(name2)
    
    if not isinstance(name1, str):
        name1 = str(name1) if name1 is not None else ""
    if not isinstance(name2, str):
        name2 = str(name2) if name2 is not None else ""


    fuzzy_ratio = fuzz.ratio(name1, name2) / 100.0
    fuzzy_partial_ratio = fuzz.partial_ratio(name1, name2) / 100.0
    fuzzy_token_sort_ratio = fuzz.token_sort_ratio(name1, name2) / 100.0

    fuzzy_similarity = (fuzzy_ratio + fuzzy_partial_ratio + fuzzy_token_sort_ratio ) / 3.0
    return fuzzy_similarity

def fuzzy_layer1(name1, name2):
    fuzzy_SS = calculate_fuzzy_similarity_layer1(name1, name2)
    Prediction = 0  
    if fuzzy_SS >= 0.80:  
        keyword_flag = check_keywords_layer1(name1, name2)
        Prediction = 1 if keyword_flag == 1 else 0  
    return fuzzy_SS, Prediction
#-----------------------------------------Data Preprocessing anf Framework--------------------------------------------------------------------------------------------------



def preprocess_layer2(name):
    name = remove_salutations(name)
    name = remove_parent_spouse_name(name)
    name = remove_common_muslim_variations(name)
    name = remove_agarwal_variants(name)
    name = convert_to_lower(name)
    name = replace_adjacent_duplicates(name)
    name = replace_characters(name)
    name = replace_bigrams(name)
    name = remove_consonant_a(name)
    name = remove_special_characters(name)
    name = remove_extra_spaces(name)
    name = remove_stop_words(name)
    return name


import pickle
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)
    
with open('tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)
    

def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings


def calculate_cosine_similarity(embedding1, embedding2):
    return cosine_similarity(embedding1, embedding2).item()

def calculate_levenshtein_similarity(name1, name2):
    lev_distance = levenshtein_distance(name1, name2)
    max_len = max(len(name1), len(name2))
    return (max_len - lev_distance) / max_len if max_len > 0 else 1.0

def calculate_phonetic_similarity(name1, name2):
    soundex1 = jellyfish.soundex(name1)
    soundex2 = jellyfish.soundex(name2)
    return jellyfish.jaro_winkler_similarity(soundex1, soundex2)

def calculate_jaccard_similarity(name1, name2):
    set1, set2 = set(name1), set(name2)
    intersection, union = set1.intersection(set2), set1.union(set2)
    return len(intersection) / len(union) if union else 1.0

##------------------------------------Calling Name_Match------------------------------------------------------------------------------------------------------

    
# def name_match(name1, name2):
#     name1_processed = preprocess_layer2(name1)
#     name2_processed = preprocess_layer2(name2)

#     embedding_similarity = predict_similarity_embedding_model([name1_processed, name2_processed])
#     levenshtein_similarity = calculate_levenshtein_similarity(name1_processed, name2_processed)
#     phonetic_similarity = calculate_phonetic_similarity(name1_processed, name2_processed)
#     jaccard_similarity = calculate_jaccard_similarity(name1_processed, name2_processed)

#     fuzzy_ratio = fuzz.ratio(name1_processed, name2_processed) / 100.0
#     fuzzy_partial_ratio = fuzz.partial_ratio(name1_processed, name2_processed) / 100.0
#     fuzzy_token_sort_ratio = fuzz.token_sort_ratio(name1_processed, name2_processed) / 100.0

#     fuzzy_similarity = (fuzzy_ratio + fuzzy_partial_ratio + fuzzy_token_sort_ratio ) / 3.0

#     final_score = (
#         embedding_similarity * 0.40 +
#         phonetic_similarity * 0.30 +
#         jaccard_similarity * 0.30 
#     )
  

#     return {
#         "name1": name1,
#         "name2": name2,
#         "embedding_similarity": embedding_similarity,
#         "phonetic_similarity": phonetic_similarity,
#         "jaccard_similarity": jaccard_similarity,
#         "final_score": final_score
#     }
# Define the list of keywords

def check_keywords_and_set_prediction(name1, name2, final_score, threshold=0.65):
    # Check for keywords in each name
    found_in_name1 = any(keyword in name1.lower() for keyword in KEYWORDS)
    found_in_name2 = any(keyword in name2.lower() for keyword in KEYWORDS)

    if found_in_name1 and found_in_name2:
        return 1
    elif found_in_name1 or found_in_name2:
        return 0
    return 1 if final_score > threshold else 0

def name_match(name1, name2):
    # Preprocess the names
    name1_processed = preprocess_layer2(name1)
    name2_processed = preprocess_layer2(name2)

    # Calculate similarity scores
    embedding_similarity = predict_similarity_embedding_model([name1_processed, name2_processed])
    levenshtein_similarity = calculate_levenshtein_similarity(name1_processed, name2_processed)
    phonetic_similarity = calculate_phonetic_similarity(name1_processed, name2_processed)
    jaccard_similarity = calculate_jaccard_similarity(name1_processed, name2_processed)

    # Calculate fuzzy similarity metrics
    fuzzy_ratio = fuzz.ratio(name1_processed, name2_processed) / 100.0
    fuzzy_partial_ratio = fuzz.partial_ratio(name1_processed, name2_processed) / 100.0
    fuzzy_token_sort_ratio = fuzz.token_sort_ratio(name1_processed, name2_processed) / 100.0
    fuzzy_similarity = (fuzzy_ratio + fuzzy_partial_ratio + fuzzy_token_sort_ratio) / 3.0

    # Compute the final score with weights
    final_score = (
        embedding_similarity * 0.286461 +
        phonetic_similarity * 0.123011 +
        jaccard_similarity * 0.590528
    )
    # Determine prediction based on keywords and final score
    prediction = check_keywords_and_set_prediction(name1, name2, final_score)

    # Return results as a dictionary
    return {
        "name1": name1,
        "name2": name2,
        "embedding_similarity": embedding_similarity,
        "phonetic_similarity": phonetic_similarity,
        "jaccard_similarity": jaccard_similarity,
        "final_score": final_score,
        "Prediction": prediction
    }


# ##------------------------------------Fuzzy With Data Preprocessing and keyword matching-------------------------------------------------------------------------------------------------------


def preprocess_FuzzyWuzzy(name):
    name = convert_to_lower(name)
    name = remove_salutations(name)
    name = remove_parent_spouse_name(name)
    name = remove_stop_words(name)
    name = remove_extra_spaces(name)
    return name

def calculate_fuzzy_similarity_processed(name1, name2):
    name1 = preprocess_FuzzyWuzzy(name1)
    name2 = preprocess_FuzzyWuzzy(name2)
    
    fuzzy_ratio = fuzz.ratio(name1, name2) / 100.0
    fuzzy_partial_ratio = fuzz.partial_ratio(name1, name2) / 100.0
    fuzzy_token_sort_ratio = fuzz.token_sort_ratio(name1, name2) / 100.0

    fuzzy_similarity = (fuzzy_ratio + fuzzy_partial_ratio + fuzzy_token_sort_ratio) / 3.0
    return fuzzy_similarity
 

# def check_keywords(name1, name2, KEYWORDS):
#     name1 = name1.lower()
#     name2 = name2.lower()
#     for keyword in KEYWORDS:
#         if keyword in name1 or keyword in name2:
#             return 0  
#     return 1

def check_keywords(name1, name2, keywords):
    name1 = str(name1) if name1 is not None else ""
    name2 = str(name2) if name2 is not None else ""

    found_in_name1 = any(keyword in name1.lower() for keyword in KEYWORDS)
    found_in_name2 = any(keyword in name2.lower() for keyword in KEYWORDS)

    if found_in_name1 and found_in_name2:
        return 1  
    elif found_in_name1 or found_in_name2:
        return 0
    return 1
    
def process_false_cases(row):
    name1 = row['name1']
    name2 = row['name2']

    fuzzy_SS = calculate_fuzzy_similarity_processed(name1, name2)

    fuzzy_flag = 1 if fuzzy_SS >= 0.70 else 0

    if fuzzy_flag:
        keyword_adjustment = check_keywords(name1, name2, KEYWORDS)
        fuzzy_flag = 1 if keyword_adjustment else 0

    return {
        "Third_Layer_Score": fuzzy_SS,
        "Prediction": int(fuzzy_flag),
        "Third_Layer_Flag": fuzzy_flag  
    }
    

In [128]:
model.eval()

BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)

In [129]:
import pandas as pd


def process_name_matching(file_path):
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    df_layer1["Prediction"] = 0  

    df_layer1[["First_Layer_Score", "first_layer_pass"]] = df_layer1.apply(
        lambda row: pd.Series([
            fuzzy_layer1(row['name1'], row['name2'])[0],  
            fuzzy_layer1(row['name1'], row['name2'])[0] >= threshold  
        ]),
        axis=1
    )

    df_layer1["Prediction"] = df_layer1["first_layer_pass"].astype(int)  

    df_layer1.to_csv("first_audit.csv", index=False)

    df_layer2 = df_layer1[df_layer1["Prediction"] == 0].copy()
    
    print("tttttttttttttt")
    print(df_layer2[['name1','name2']])

    df_layer2 = df_layer1[df_layer1["Prediction"] == 0].copy()

    # Ensure names are strings
    df_layer2['name2'] = df_layer2['name2'].astype(str)
    df_layer2['name1'] = df_layer2['name1'].astype(str)

    # Apply name matching logic directly
    name_match_results = df_layer2.apply(lambda row: name_match(row['name1'], row['name2']), axis=1)

    # Extract similarity scores and assign directly
    df_layer2 = df_layer2.assign(
        embedding_similarity=[result["embedding_similarity"] for result in name_match_results],
        phonetic_similarity=[result["phonetic_similarity"] for result in name_match_results],
        jaccard_similarity=[result["jaccard_similarity"] for result in name_match_results],
        final_score=[result["final_score"] for result in name_match_results],
        Prediction=[result["Prediction"] for result in name_match_results]  # Prediction is calculated inside name_match
    )

    # Save Layer 2 audit results
    df_layer2.to_csv("second_audit.csv", index=False)


    df_layer3 = df_layer2[df_layer2["Prediction"] == 0].copy()
    results = df_layer3.apply(process_false_cases, axis=1)
    
    df_layer3["Third_Layer_Score"] = results.apply(lambda x: x["Third_Layer_Score"])
    df_layer3["Prediction"] = results.apply(lambda x: x["Prediction"])
    df_layer3["Third_Layer_Flag"] = results.apply(lambda x: x["Third_Layer_Flag"])
    
    df_layer3.to_csv("third_audit.csv", index=False)

    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"] == 1],
        df_layer2[df_layer2["Prediction"] == 1],
        df_layer3[df_layer3["Prediction"] == 1],
        df_layer3[df_layer3["Prediction"] == 0]  
    ], ignore_index=True)

    df_combined.to_csv("RF_1WEEK(80-65-70).csv", index=False)

    return df_combined


if __name__ == "__main__":
    combined_data = process_name_matching("one_week_m.csv")

    print("\nConfusion Matrix for Each Layer:")


tttttttttttttt
                               name1                            name2
13                 N NIYAMATH NISHA        NIYAMATH NISHA NAVAS BASHA
14        Niyamath Nisha Navas Basha                N NIYAMATH NISHA 
16                            Adarsh               Aadarsh Dhanotiya 
28      DESHMUKH SANTOSH NARAYANRAO   Mr. SANTOSH NARAYANRAO DESHMUKH
33                          ROHITASH              Mr. ROHITASH  MEENA
...                              ...                              ...
113142               MAYASINGH DAWAR                     AAKASH DAWAR
113143               MAYASINGH DAWAR                     AAKASH DAWAR
113145                  Kavita meena              KAVITA SAREE CENTRE
113146                  Kavita meena              KAVITA SAREE CENTRE
113147                       SHABINA               COLORS COLLECTIONS

[29775 rows x 2 columns]

Confusion Matrix for Each Layer:


In [130]:
df_first=pd.read_csv('first_audit.csv')

In [98]:
df_first.shape

(499519, 16)

In [99]:
df_first['Prediction'].value_counts()

Prediction
1    416022
0     83497
Name: count, dtype: int64

In [131]:
df_second_audit=pd.read_csv('second_audit.csv')

In [132]:
df_second_audit.shape

(29775, 20)

In [133]:
df_second_audit['Prediction'].value_counts()

Prediction
0    19148
1    10627
Name: count, dtype: int64

In [134]:
df_third_audit=pd.read_csv("third_audit.csv")

In [135]:
df_third_audit.shape

(19148, 22)

In [136]:
df_third_audit['Prediction'].value_counts()

Prediction
0    15750
1     3398
Name: count, dtype: int64

In [1]:
import pandas as pd

In [95]:
df_2=pd.read_csv("one_month_data.csv")

  df_2=pd.read_csv("one_month_data.csv")


In [96]:
df_2.shape  

(499519, 13)

In [25]:
df_21=pd.read_csv("check_experiment6.csv")

  df_21=pd.read_csv("check_experiment6.csv")


In [26]:
df_21.shape

(499519, 23)

In [27]:
df_21['Prediction'].value_counts()

Prediction
1    436169
0     63350
Name: count, dtype: int64

In [100]:
df_211=pd.read_csv("check_experiment4.csv")

  df_211=pd.read_csv("check_experiment4.csv")


In [101]:
df_211.columns

Index(['_id', 'merchantId', 'consumerId', 'szScore', 'szFlag', 'hvScore',
       'hvFlag', 'name2', 'name1', 'dsScore', 'dsFlag', 'flScore', 'flFlag',
       'Prediction', 'First_Layer_Score', 'first_layer_pass',
       'embedding_similarity', 'phonetic_similarity', 'jaccard_similarity',
       'final_score', 'Second_Layer_Pass', 'Third_Layer_Score',
       'Third_Layer_Flag'],
      dtype='object')

In [None]:
flFlag,dsFlag,hvFlag,Prediction

In [102]:
df_211['Prediction'].value_counts()

Prediction
1    98528
0    14620
Name: count, dtype: int64

In [16]:
# import pandas as pd
# import numpy as np
# # from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# def process_name_matching(file_path):
#     df_layer1 = pd.read_csv(file_path)
#     threshold = 0.75
#     threshold1 = 0.65
    
    
#     df_layer1["Prediction"] = 0

# #     df_layer1["First_Layer_Pass"] = df_layer1.apply(fuzzy_layer1, axis=1)
# #     df_layer1["Prediction"] = df_layer1.apply(lambda x: 1 if x["First_Layer_Pass"] else x["Prediction"], axis=1)
#     df_layer1[["First_Layer_Score", "Prediction"]] = df_layer1.apply(
#     lambda row: pd.Series(fuzzy_layer1(row['name1'], row['name2'])),
#     axis=1 )
    
#     df_layer1.to_csv("first_audit.csv", index=False)
    
#     df_layer2 = df_layer1[df_layer1["Prediction"] == 0]  # Filter rows where Prediction == False
#     df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
#     df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
#     df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    
#     df_layer2.to_csv("second_audit.csv", index=False)

#     # Third Layer
#     df_layer3 = df_layer2[df_layer2["Prediction"] == 0]  # Filter rows where Prediction == False
#     df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
#     df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    
#     df_layer3.to_csv("third_audit.csv", index=False)
   
#     df_combined = pd.concat([
#         df_layer1[df_layer1["Prediction"]],
#         df_layer2[df_layer2["Prediction"]],
#         df_layer3[df_layer3["Prediction"]]
#     ], ignore_index=True)

#     df_combined.to_csv("one_week_result.csv", index=False)

#     return df_combined

#     # Final output
#     df_layer1["Prediction"] = 0
#     df_layer1.loc[df_layer1[df_layer1["Prediction"]].index, "Prediction"] = 1
#     df_layer1.loc[df_layer2[df_layer2["Prediction"]].index, "Prediction"] = 1
#     df_layer1.loc[df_layer3[df_layer3["Prediction"]].index, "Prediction"] = 1

#     return df_combined


# if __name__ == "__main__":
#     combined_data = process_name_matching("test.csv")

#     print("\nConfusion Matrix for Each Layer:")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = pd.to_numeric(df_layer2["Second_Layer_Score"], errors='coerce')


KeyError: "None of [Index([1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0,\n       1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,\n       1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0,\n       0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],\n      dtype='float64')] are in the [columns]"

In [None]:
# import pandas as pd
# from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report
# from fuzzywuzzy import fuzz
# import re
# import numpy as np
# import pickle
# # from BB import BertForSTS

# # threshold = 0.80
# # threshold1 = 0.65



# ##------------------------------------Fuzzy Wuzzy Layer-------------------------------------------------------------------------------------------------------

# def convert_to_lower(name):
#     return name.lower()

# def remove_extra_spaces(name):
#     return re.sub(r'\s+', ' ', name).strip()

# def remove_stop_words(text):
#     # stop_words = ['devi', 'dei', 'debi', 'kumar', 'kumaar', 'kumari', 'kumaari', 'kmr', 'kumr', 'bhai', 'bhau', 'bai', 'ben', 'singh', 'kaur', 'Md', 'Mohd', 'Mohammad', 'Mohamad','alam','shekh','sek']
#     stop_words = ['devi', 'dei', 'debi', 'kmr', 'kumr', 'bhai', 'bhau', 'bai', 'ben', 'kaur', 'Md', 'Mohd', 'Mohammad', 'Mohamad','alam','shekh','sek']
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)


# def preprocess_layer1(name):
#     name = convert_to_lower(name)
#     name = remove_extra_spaces(name)
#     name = remove_stop_words(name)
#     return name

# def check_keywords_layer1(name1, name2):
#     found_in_name1 = any(keyword in name1.lower() for keyword in KEYWORDS)
#     found_in_name2 = any(keyword in name2.lower() for keyword in KEYWORDS)
    
#     if found_in_name1 and found_in_name2:
#         return 1  
#     elif found_in_name1 or found_in_name2:
#         return 0 
#     return 1  
# def check_permutation_match(name1, name2):
#     name1 = preprocess_layer1(name1).replace(" ", "")
#     name2 = preprocess_layer1(name2).replace(" ", "")
    
#     return sorted(name1) == sorted(name2)

# def calculate_fuzzy_similarity_layer1(name1, name2):
#     name1 = preprocess_layer1(name1)
#     name2 = preprocess_layer1(name2)
    
#     fuzzy_ratio = fuzz.ratio(name1, name2) / 100.0
#     fuzzy_partial_ratio = fuzz.partial_ratio(name1, name2) / 100.0
#     fuzzy_token_sort_ratio = fuzz.token_sort_ratio(name1, name2) / 100.0
#     fuzzy_token_set_ratio = fuzz.token_set_ratio(name1, name2) / 100.0

#     fuzzy_similarity = (fuzzy_ratio + fuzzy_partial_ratio + fuzzy_token_sort_ratio + fuzzy_token_set_ratio) / 4.0
#     return fuzzy_similarity

# def check_substring_match(name1, name2):
#     name1 = preprocess_layer1(name1)
#     name2 = preprocess_layer1(name2)
#     words1 = set(name1.lower().split())
#     words2 = set(name2.lower().split())
    
#     if words1.issubset(words2) or words2.issubset(words1):
#         return 1
#     return 0

# def fuzzy_layer1(row):
#     name1 = row['name1']
#     name2 = row['name2']
    
#     if check_substring_match(name1, name2):
#         keyword_flag = check_keywords_layer1(name1, name2)
#         return 1 if keyword_flag == 1 else 0  # If keywords conflict, mark as not matching
    
#     if check_permutation_match(name1, name2):
#         keyword_flag = check_keywords_layer1(name1, name2)
#         return 1 if keyword_flag == 1 else 0  # If keywords conflict, mark as not matching
    
#     fuzzy_SS = calculate_fuzzy_similarity_layer1(name1, name2)
#     fuzzy_flag = fuzzy_SS >= 0.80  # Consider fuzzy similarity >= 80% as a match
    
#     if fuzzy_flag:
#         keyword_flag = check_keywords_layer1(name1, name2)
#         return 1 if keyword_flag == 1 else 0  # If keywords conflict, mark as not matching

#     return 0



# #-----------------------------------------Data Preprocessing anf Framework--------------------------------------------------------------------------------------------------


# from transformers import AutoModel, AutoTokenizer
# import torch
# from sklearn.metrics.pairwise import cosine_similarity
# from Levenshtein import distance as levenshtein_distance
# import jellyfish

# SPECIAL_CHAR_DOT_REGEX = r"[.]"
# SPECIAL_CHARS_REGEX = r"[-+.^:,_/\s]+" 
# SALUTATION_REGEX = r"^(shree|shri|miss|smt|mrs|mr|ms|dr|master|hon|sir|madam|prof|capt|major|rev|fr|br)\s*"
# PARENT_SPOUSE_NAME_REGEX = r"(?:\s*(?:s/o|d/o|w/o|so|do|wo|daughter of|son of|wife of|husband of)\s*)"
# COMMON_MUSLIM_SALUTATIONS_MOHAMMAD_REGEX = r"\b(mohammad|mohammed|muhamed|mohd|mohamed|mohamad|muhamad|muhammad|muhammed|muhammet|mohamud|mohummad|mohummed|mouhamed|muhamaad|mohammod|mouhamad|mo|md|mahmood|mahmud|ahmad|ahmed|hameed|hamid|hammed|mahd|mahmod|mohd|mouhammed|mohamad|muhmood|mohhammed|muhmamed|mohmed|mohmat|muhmat|mu|m|shaikh|mo)\b"
# LAST_NAMES_AGARWAL_VARIANTS_REGEX = r"\b(aggarwal|agrawal|agarwal|aggrawal|agarwalla|agarwal)\b"


# def convert_to_lower(name):
#     return name.lower()

# def replace_adjacent_duplicates(value):
#     if isinstance(value, str):
#         return re.sub(r'(.)\1+', r'\1', value)
#     return value

# def replace_characters(name):
#     replacements = {'e': 'i', 'j': 'z', 'v': 'w', 'q': 'k'}
#     for old, new in replacements.items():
#         name = name.replace(old, new)
#     return name

# def replace_bigrams(name):
#     replacements = {'ph': 'f', 'gh': 'g', 'th': 't', 'kh': 'k', 'dh': 'd', 'ch': 'c', 'sh': 's', 'au': 'o',
#                     'bh': 'b', 'ks': 'x', 'ck': 'k', 'ah': 'h', 'wh': 'w', 'wr': 'r'}
#     for old, new in replacements.items():
#         name = name.replace(old, new)
#     return name

# def remove_extra_spaces(name):
#     return re.sub(r'\s+', ' ', name).strip()

# def remove_consonant_a(name):
#     consonants = 'bcdfghjklmnpqrstvwxyz'
#     new_name = ''.join([name[i] for i in range(len(name)) if not (i > 0 and name[i] == 'a' and name[i - 1].lower() in consonants)])
#     return new_name

# def remove_special_characters(text):
#     text = re.sub(SPECIAL_CHAR_DOT_REGEX, '', text)
#     text = re.sub(SPECIAL_CHARS_REGEX, '', text)
#     return text.strip()

# def remove_salutations(text):
#     return re.sub(SALUTATION_REGEX, '', text, flags=re.IGNORECASE).strip()

# def remove_parent_spouse_name(text):
#     return re.sub(r'\s*(?:s/o|d/o|w/o|so|do|wo|daughter of|son of|wife of|husband of|daughter|son|child of)\s*[\w\s,.]*$', '', text, flags=re.IGNORECASE).strip()

# def remove_common_muslim_variations(text):
#     return re.sub(COMMON_MUSLIM_SALUTATIONS_MOHAMMAD_REGEX, '', text, flags=re.IGNORECASE).strip()

# def remove_agarwal_variants(text):
#     return re.sub(LAST_NAMES_AGARWAL_VARIANTS_REGEX, '', text, flags=re.IGNORECASE).strip()

# def remove_stop_words(text):
#     stop_words = ['devi', 'dei', 'debi', 'kmr', 'kumr', 'bhai', 'bhau', 'bai', 'ben', 'kaur', 'Md', 'Mohd', 'Mohammad', 'Mohamad','alam','shekh','sek']
#     # stop_words = ['devi', 'dei', 'debi', 'kumar', 'kumaar', 'kumari', 'kumaari', 'kmr', 'kumr', 'bhai', 'bhau', 'bai', 'ben', 'singh', 'kaur', 'Md', 'Mohd', 'Mohammad', 'Mohamad']
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)

# def preprocess(name):
#     name = remove_salutations(name)
#     name = remove_parent_spouse_name(name)
#     name = remove_common_muslim_variations(name)
#     name = remove_agarwal_variants(name)
#     name = convert_to_lower(name)
#     name = replace_adjacent_duplicates(name)
#     name = replace_characters(name)
#     name = replace_bigrams(name)
#     name = remove_consonant_a(name)
#     name = remove_special_characters(name)
#     name = remove_extra_spaces(name)
#     name = remove_stop_words(name)
#     return name


# import pickle
# with open('model.pkl', 'rb') as file:
#     model = pickle.load(file)
    
# with open('tokenizer.pkl', 'rb') as file:
#     tokenizer = pickle.load(file)
    

# def get_embedding(text):
#     inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
#     with torch.no_grad():
#         embeddings = model(**inputs).last_hidden_state.mean(dim=1)
#     return embeddings


# # Similarity functions
# def calculate_cosine_similarity(embedding1, embedding2):
#     return cosine_similarity(embedding1, embedding2).item()

# def calculate_levenshtein_similarity(name1, name2):
#     lev_distance = levenshtein_distance(name1, name2)
#     max_len = max(len(name1), len(name2))
#     return (max_len - lev_distance) / max_len if max_len > 0 else 1.0

# def calculate_phonetic_similarity(name1, name2):
#     soundex1 = jellyfish.soundex(name1)
#     soundex2 = jellyfish.soundex(name2)
#     return jellyfish.jaro_winkler_similarity(soundex1, soundex2)

# def calculate_jaccard_similarity(name1, name2):
#     set1, set2 = set(name1), set(name2)
#     intersection, union = set1.intersection(set2), set1.union(set2)
#     return len(intersection) / len(union) if union else 1.0

# ##------------------------------------Calling Name_Match------------------------------------------------------------------------------------------------------
 

# def name_match(name1, name2):
#     name1_processed = preprocess(name1)
#     name2_processed = preprocess(name2)

#     embedding_similarity = predict_similarity_embedding_model([name1_processed, name2_processed])

#     levenshtein_similarity = calculate_levenshtein_similarity(name1_processed, name2_processed)
#     phonetic_similarity = calculate_phonetic_similarity(name1_processed, name2_processed)
#     jaccard_similarity = calculate_jaccard_similarity(name1_processed, name2_processed)

#     fuzzy_ratio = fuzz.ratio(name1_processed, name2_processed) / 100.0
#     fuzzy_partial_ratio = fuzz.partial_ratio(name1_processed, name2_processed) / 100.0
#     fuzzy_token_sort_ratio = fuzz.token_sort_ratio(name1_processed, name2_processed) / 100.0
#     fuzzy_token_set_ratio = fuzz.token_set_ratio(name1_processed, name2_processed) / 100.0

#     fuzzy_similarity = (fuzzy_ratio + fuzzy_partial_ratio + fuzzy_token_sort_ratio + fuzzy_token_set_ratio) / 4.0
    
    
# #     return {
# #         "embedding_similarity": embedding_similarity,
# #         "levenshtein_similarity": levenshtein_similarity,
# #         "phonetic_similarity": phonetic_similarity,
# #         "jaccard_similarity": jaccard_similarity,
# #         "fuzzy_similarity": fuzzy_similarity
# #     }
  

#     final_score = (
#         embedding_similarity * 0.121630 +
#         levenshtein_similarity * 0.205104 +
#         phonetic_similarity * 0.022287 +
#         jaccard_similarity * 0.262775 +
#         fuzzy_similarity * 0.388205
#     )
#     return final_score


# ##------------------------------------Fuzzy With Data Preprocessing and keyword matching-------------------------------------------------------------------------------------------------------

# SPECIAL_CHAR_DOT_REGEX = r"[.]"
# SPECIAL_CHARS_REGEX = r"[-+.^:,_/\s]+" 
# SALUTATION_REGEX = r"^(shree|shri|miss|smt|mrs|mr|ms|dr|master|hon|sir|madam|prof|capt|major|rev|fr|br)\s*"
# PARENT_SPOUSE_NAME_REGEX = r"(?:\s*(?:s/o|d/o|w/o|so|do|wo|daughter of|son of|wife of|husband of)\s*)"
# COMMON_MUSLIM_SALUTATIONS_MOHAMMAD_REGEX = r"\b(mohammad|mohammed|muhamed|mohd|mohamed|mohamad|muhamad|muhammad|muhammed|muhammet|mohamud|mohummad|mohummed|mouhamed|muhamaad|mohammod|mouhamad|mo|md|mahmood|mahmud|ahmad|ahmed|hameed|hamid|hammed|mahd|mahmod|mohd|mouhammed|mohamad|muhmood|mohhammed|muhmamed|mohmed|mohmat|muhmat|mu|m|shaikh|mo)\b"
# LAST_NAMES_AGARWAL_VARIANTS_REGEX = r"\b(aggarwal|agrawal|agarwal|aggrawal|agarwalla|agarwal)\b"


# def convert_to_lower(name):
#     return name.lower()

# def remove_extra_spaces(name):
#     return re.sub(r'\s+', ' ', name).strip()

# def remove_common_muslim_variations(text):
#     return re.sub(COMMON_MUSLIM_SALUTATIONS_MOHAMMAD_REGEX, '', text, flags=re.IGNORECASE).strip()

# def remove_agarwal_variants(text):
#     return re.sub(LAST_NAMES_AGARWAL_VARIANTS_REGEX, '', text, flags=re.IGNORECASE).strip()

# def remove_stop_words(text):
#     # stop_words = ['devi', 'dei', 'debi', 'kumar', 'kumaar', 'kumari', 'kumaari', 'kmr', 'kumr', 'bhai', 'bhau', 'bai', 'ben', 'singh', 'kaur', 'Md', 'Mohd', 'Mohammad', 'Mohamad','alam','shekh','sek']
#     stop_words = ['devi', 'dei', 'debi', 'kmr', 'kumr', 'bhai', 'bhau', 'bai', 'ben', 'kaur', 'Md', 'Mohd', 'Mohammad', 'Mohamad','alam','shekh','sek']
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)


# def preprocess_FuzzyWuzzy(name):
#     """Process a name through all the defined normalization steps."""
#     name = convert_to_lower(name)
#     name = remove_stop_words(name)
#     name = remove_common_muslim_variations(name)
#     name = remove_agarwal_variants(name)
#     return name

# def calculate_fuzzy_similarity_processed(name1, name2):
#     name1 = preprocess_FuzzyWuzzy(name1)
#     name2 = preprocess_FuzzyWuzzy(name2)
    
#     fuzzy_ratio = fuzz.ratio(name1, name2) / 100.0
#     fuzzy_partial_ratio = fuzz.partial_ratio(name1, name2) / 100.0
#     fuzzy_token_sort_ratio = fuzz.token_sort_ratio(name1, name2) / 100.0
#     fuzzy_token_set_ratio = fuzz.token_set_ratio(name1, name2) / 100.0

#     fuzzy_similarity = (fuzzy_ratio + fuzzy_partial_ratio + fuzzy_token_sort_ratio + fuzzy_token_set_ratio) / 4.0
#     return fuzzy_similarity


# def check_keywords_in_names(name1, name2):
#     """Check for presence of keywords in either or both names."""
#     found_in_name1 = any(keyword in name1.lower() for keyword in KEYWORDS)
#     found_in_name2 = any(keyword in name2.lower() for keyword in KEYWORDS)
    
#     if found_in_name1 and found_in_name2:
#         return 1  
#     elif found_in_name1 or found_in_name2:
#         return 0  
#     return 1


# def process_false_cases(row):
#     name1 = row['name1']
#     name2 = row['name2']

#     fuzzy_SS = calculate_fuzzy_similarity_processed(name1, name2)

#     fuzzy_flag = 1 if fuzzy_SS >= 0.70 else 0

#     if fuzzy_flag:
#         fuzzy_flag = check_keywords_in_names(name1, name2)

#     return 1 if fuzzy_flag else 0


In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert integers to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    return metrics

def process_name_matching(file_path):
    # Load data for first layer
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    df_layer1["Prediction"] = 0  

    # First Layer
    df_layer1["First_Layer_Pass"] = df_layer1.apply(fuzzy_layer1, axis=1)
    df_layer1["Prediction"] = df_layer1.apply(lambda x: 1 if x["First_Layer_Pass"] else x["Prediction"], axis=1)
    
    # Save first layer result
    df_layer1.to_csv("first_audit.csv", index=False)
    first_layer_metrics = calculate_metrics(df_layer1)

    # Second Layer
    df_layer2 = df_layer1[df_layer1["Prediction"] == 0]  # Filter rows where Prediction == 0
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"].astype(int)
    
    # Save second layer result
    df_layer2.to_csv("second_audit.csv", index=False)
    second_layer_metrics = calculate_metrics(df_layer2)

    # Third Layer
    df_layer3 = df_layer2[df_layer2["Prediction"] == 0]  # Filter rows where Prediction == 0
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"].astype(int)
    
    # Save third layer result
    df_layer3.to_csv("third_audit.csv", index=False)
    third_layer_metrics = calculate_metrics(df_layer3)
    
    # Combine results
    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"] == 1],
        df_layer2[df_layer2["Prediction"] == 1],
        df_layer3[df_layer3["Prediction"] == 1]
    ], ignore_index=True)

    # Save combined result to a CSV file
    df_combined.to_csv("combined_results.csv", index=False)

    # Return results
    return first_layer_metrics, second_layer_metrics, third_layer_metrics, df_combined

if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, combined_data = process_name_matching("one_week.csv")

    print("\nConfusion Matrix for Each Layer:")

    # First Layer Confusion Matrix
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {first_metrics['accuracy']}")
    print(f"Precision: {first_metrics['precision']}")
    print(f"Recall: {first_metrics['recall']}")
    print(f"F1 Score: {first_metrics['f1_score']}")
    print(f"AUC: {first_metrics['roc_auc_score']}")

    # Second Layer Confusion Matrix
    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {second_metrics['accuracy']}")
    print(f"Precision: {second_metrics['precision']}")
    print(f"Recall: {second_metrics['recall']}")
    print(f"F1 Score: {second_metrics['f1_score']}")
    print(f"AUC: {second_metrics['roc_auc_score']}")

    # Third Layer Confusion Matrix
    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {third_metrics['accuracy']}")
    print(f"Precision: {third_metrics['precision']}")
    print(f"Recall: {third_metrics['recall']}")
    print(f"F1 Score: {third_metrics['f1_score']}")
    print(f"AUC: {third_metrics['roc_auc_score']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Prediction"


Confusion Matrix for Each Layer:

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5175            4
Actual 1          559         4262
Accuracy: 0.9437
Precision: 0.9990623534927332
Recall: 0.8840489524994815
F1 Score: 0.9380433586442171
AUC: 0.9416383013124942

Second Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5169            6
Actual 1          177          382
Accuracy: 0.9680851063829787
Precision: 0.9845360824742269
Recall: 0.6833631484794276
F1 Score: 0.8067581837381204
AUC: 0.8411018640947863

Third Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5131           38
Actual 1          126           51
Accuracy: 0.9693228582117471
Precision: 0.5730337078651685
Recall: 0.288135593220339
F1 Score: 0.38345864661654133
AUC: 0.6403920372756754


## close 

In [16]:
df_combined.shape

NameError: name 'df_combined' is not defined

In [None]:
df_combined['Prediction'].value_counts()

In [52]:
df_combined.shape

(210055, 17)

In [53]:
df_combined['Prediction'].value_counts()

Prediction
True     114504
False     95551
Name: count, dtype: int64

In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert boolean to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    return metrics

def process_name_matching(file_path):
    # Load data for first layer
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    # First Layer
    df["First_Layer_Pass"] = df.apply(fuzzy_layer1, axis=1)
    df["Prediction"] = df.apply(lambda x: 1 if x["First_Layer_Pass"] else x["Prediction"], axis=1)
    
    # Save first layer result
    df_layer1.to_csv("first_audit.csv", index=False)
    first_layer_metrics = calculate_metrics(df_layer1)

    # Second Layer
    df_layer2 = df_layer1[df_layer1["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    
    # Save second layer result
    df_layer2.to_csv("second_audit.csv", index=False)
    second_layer_metrics = calculate_metrics(df_layer2)

    # Third Layer
    df_layer3 = df_layer2[df_layer2["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    
    # Save third layer result
    df_layer3.to_csv("third_audit.csv", index=False)
    third_layer_metrics = calculate_metrics(df_layer3)
    
    # Combine results
    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"]],
        df_layer2[df_layer2["Prediction"]],
        df_layer3[df_layer3["Prediction"]]
    ], ignore_index=True)

    # Save combined result to a CSV file
    df_combined.to_csv("combined_results.csv", index=False)

    # Return results
    return first_layer_metrics, second_layer_metrics, third_layer_metrics, df_combined

if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, combined_data = process_name_matching("combined_data_10k.csv")

    print("\nConfusion Matrix for Each Layer:")

    # First Layer Confusion Matrix
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {first_metrics['accuracy']}")
    print(f"Precision: {first_metrics['precision']}")
    print(f"Recall: {first_metrics['recall']}")
    print(f"F1 Score: {first_metrics['f1_score']}")
    print(f"AUC: {first_metrics['roc_auc_score']}")

    # Second Layer Confusion Matrix
    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {second_metrics['accuracy']}")
    print(f"Precision: {second_metrics['precision']}")
    print(f"Recall: {second_metrics['recall']}")
    print(f"F1 Score: {second_metrics['f1_score']}")
    print(f"AUC: {second_metrics['roc_auc_score']}")

    # Third Layer Confusion Matrix
    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {third_metrics['accuracy']}")
    print(f"Precision: {third_metrics['precision']}")
    print(f"Recall: {third_metrics['recall']}")
    print(f"F1 Score: {third_metrics['f1_score']}")
    print(f"AUC: {third_metrics['roc_auc_score']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Prediction"


Confusion Matrix for Each Layer:

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5178            1
Actual 1         1089         3732
Accuracy: 0.891
Precision: 0.999732118939191
Recall: 0.7741132545115121
F1 Score: 0.8725742342763619
AUC: 0.8869600835214445

Second Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5177            1
Actual 1          339          750
Accuracy: 0.9457475666187969
Precision: 0.9986684420772304
Recall: 0.6887052341597796
F1 Score: 0.8152173913043478
AUC: 0.844256054700593

Third Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5177            0
Actual 1          332            7
Accuracy: 0.939811457577955
Precision: 1.0
Recall: 0.02064896755162242
F1 Score: 0.04046242774566474
AUC: 0.5103244837758112


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]


In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert boolean to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    return metrics

def process_name_matching(file_path):
    # Load data for first layer
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    # First Layer
    df_layer1["First_Layer_Score"] = df_layer1.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df_layer1["First_Layer_Pass"] = df_layer1["First_Layer_Score"] >= threshold
    df_layer1["Prediction"] = df_layer1["First_Layer_Pass"]
    
    # Save first layer result
    df_layer1.to_csv("first_audit.csv", index=False)
    first_layer_metrics = calculate_metrics(df_layer1)

    # Second Layer
    df_layer2 = df_layer1[df_layer1["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    
    # Save second layer result
    df_layer2.to_csv("second_audit.csv", index=False)
    second_layer_metrics = calculate_metrics(df_layer2)

    # Third Layer
    df_layer3 = df_layer2[df_layer2["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    
    # Save third layer result
    df_layer3.to_csv("third_audit.csv", index=False)
    third_layer_metrics = calculate_metrics(df_layer3)
    
    # Combine results
    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"]],
        df_layer2[df_layer2["Prediction"]],
        df_layer3[df_layer3["Prediction"]]
    ], ignore_index=True)

    # Save combined result to a CSV file
    df_combined.to_csv("combined_results.csv", index=False)

    # Return results
    return first_layer_metrics, second_layer_metrics, third_layer_metrics, df_combined

if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, combined_data = process_name_matching("combined_data_10k.csv")

    print("\nConfusion Matrix for Each Layer:")

    # First Layer Confusion Matrix
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {first_metrics['accuracy']}")
    print(f"Precision: {first_metrics['precision']}")
    print(f"Recall: {first_metrics['recall']}")
    print(f"F1 Score: {first_metrics['f1_score']}")
    print(f"AUC: {first_metrics['roc_auc_score']}")

    # Second Layer Confusion Matrix
    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {second_metrics['accuracy']}")
    print(f"Precision: {second_metrics['precision']}")
    print(f"Recall: {second_metrics['recall']}")
    print(f"F1 Score: {second_metrics['f1_score']}")
    print(f"AUC: {second_metrics['roc_auc_score']}")

    # Third Layer Confusion Matrix
    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {third_metrics['accuracy']}")
    print(f"Precision: {third_metrics['precision']}")
    print(f"Recall: {third_metrics['recall']}")
    print(f"F1 Score: {third_metrics['f1_score']}")
    print(f"AUC: {third_metrics['roc_auc_score']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Prediction"


Confusion Matrix for Each Layer:

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5178            1
Actual 1         1089         3732
Accuracy: 0.891
Precision: 0.999732118939191
Recall: 0.7741132545115121
F1 Score: 0.8725742342763619
AUC: 0.8869600835214445

Second Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5177            1
Actual 1          339          750
Accuracy: 0.9457475666187969
Precision: 0.9986684420772304
Recall: 0.6887052341597796
F1 Score: 0.8152173913043478
AUC: 0.844256054700593

Third Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5164           13
Actual 1          221          118
Accuracy: 0.957577955039884
Precision: 0.9007633587786259
Recall: 0.3480825958702065
F1 Score: 0.5021276595744681
AUC: 0.6727857445257929


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]


In [17]:
df=pd.read_csv("combined_results_5.csv")

  df=pd.read_csv("combined_results_5.csv")


In [33]:
df1=pd.read_csv("fined_data_audit.csv")

In [34]:
df1.shape

(134826, 11)

In [None]:
df.shape

In [18]:
df['Prediction'].value_counts()

Prediction
True    116944
Name: count, dtype: int64

## ignore

In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert boolean to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    
    return metrics

def process_name_matching(file_path):
    # Load data for first layer
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.75
    threshold1 = 0.65

    # First Layer
    df_layer1["First_Layer_Score"] = df_layer1.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df_layer1["First_Layer_Pass"] = df_layer1["First_Layer_Score"] >= threshold
    df_layer1["Prediction"] = df_layer1["First_Layer_Pass"]
    
    # Save first layer result
    df_layer1.to_csv("first_audit.csv", index=False)
    first_layer_metrics = calculate_metrics(df_layer1)

    # Second Layer
    df_layer2 = df_layer1[df_layer1["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    
    # Save second layer result
    df_layer2.to_csv("second_audit.csv", index=False)
    second_layer_metrics = calculate_metrics(df_layer2)

    # Third Layer
    df_layer3 = df_layer2[df_layer2["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    
    # Save third layer result
    df_layer3.to_csv("third_audit.csv", index=False)
    third_layer_metrics = calculate_metrics(df_layer3)
    
    # Combine results
    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"]],
        df_layer2[df_layer2["Prediction"]],
        df_layer3[df_layer3["Prediction"]]
    ])

    # Final output
    df_layer1["Prediction"] = False
    df_layer1.loc[df_layer1[df_layer1["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer2[df_layer2["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer3[df_layer3["Prediction"]].index, "Prediction"] = True

    return first_layer_metrics, second_layer_metrics, third_layer_metrics, df_combined

if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, combined_data = process_name_matching("balanced_audit_data.csv")

    print("\nConfusion Matrix for Each Layer:")

    # First Layer Confusion Matrix
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {first_metrics['accuracy']}")
    print(f"Precision: {first_metrics['precision']}")
    print(f"Recall: {first_metrics['recall']}")
    print(f"F1 Score: {first_metrics['f1_score']}")
    print(f"AUC: {first_metrics['roc_auc_score']}")

    # Second Layer Confusion Matrix
    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {second_metrics['accuracy']}")
    print(f"Precision: {second_metrics['precision']}")
    print(f"Recall: {second_metrics['recall']}")
    print(f"F1 Score: {second_metrics['f1_score']}")
    print(f"AUC: {second_metrics['roc_auc_score']}")

    # Third Layer Confusion Matrix
    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {third_metrics['accuracy']}")
    print(f"Precision: {third_metrics['precision']}")
    print(f"Recall: {third_metrics['recall']}")
    print(f"F1 Score: {third_metrics['f1_score']}")
    print(f"AUC: {third_metrics['roc_auc_score']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Prediction"


Confusion Matrix for Each Layer:

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         6998          100
Actual 1         2661         5339
Accuracy: 0.8171280964366141
Precision: 0.9816142673285531
Recall: 0.667375
F1 Score: 0.7945531661581963
AUC: 0.8266432621865315

Second Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         6892          106
Actual 1          834         1827
Accuracy: 0.9026814370017601
Precision: 0.9451629591308847
Recall: 0.6865839909808342
F1 Score: 0.7953852851545494
AUC: 0.83571840303543

Third Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         6880           12
Actual 1          629          205
Accuracy: 0.9170333937354388
Precision: 0.9447004608294931
Recall: 0.24580335731414868
F1 Score: 0.39010466222645096
AUC: 0.622031104077852


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]


In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert boolean to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    
    return metrics

def process_name_matching(file_path):
    # Load data for first layer
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    # First Layer
    df_layer1["First_Layer_Score"] = df_layer1.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df_layer1["First_Layer_Pass"] = df_layer1["First_Layer_Score"] >= threshold
    df_layer1["Prediction"] = df_layer1["First_Layer_Pass"]
    
    # Save first layer result
    df_layer1.to_csv("first_audit.csv", index=False)
    first_layer_metrics = calculate_metrics(df_layer1)

    # Second Layer
    df_layer2 = df_layer1[df_layer1["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    
    # Save second layer result
    df_layer2.to_csv("second_audit.csv", index=False)
    second_layer_metrics = calculate_metrics(df_layer2)

    # Third Layer
    df_layer3 = df_layer2[df_layer2["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    
    # Save third layer result
    df_layer3.to_csv("third_audit.csv", index=False)
    third_layer_metrics = calculate_metrics(df_layer3)
    
    # Combine results
    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"]],
        df_layer2[df_layer2["Prediction"]],
        df_layer3[df_layer3["Prediction"]]
    ])

    # Final output
    df_layer1["Prediction"] = False
    df_layer1.loc[df_layer1[df_layer1["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer2[df_layer2["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer3[df_layer3["Prediction"]].index, "Prediction"] = True

    return first_layer_metrics, second_layer_metrics, third_layer_metrics, df_combined

if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, combined_data = process_name_matching("balanced_audit_data.csv")

    print("\nConfusion Matrix for Each Layer:")

    # First Layer Confusion Matrix
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {first_metrics['accuracy']}")
    print(f"Precision: {first_metrics['precision']}")
    print(f"Recall: {first_metrics['recall']}")
    print(f"F1 Score: {first_metrics['f1_score']}")
    print(f"AUC: {first_metrics['roc_auc_score']}")

    # Second Layer Confusion Matrix
    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {second_metrics['accuracy']}")
    print(f"Precision: {second_metrics['precision']}")
    print(f"Recall: {second_metrics['recall']}")
    print(f"F1 Score: {second_metrics['f1_score']}")
    print(f"AUC: {second_metrics['roc_auc_score']}")

    # Third Layer Confusion Matrix
    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {third_metrics['accuracy']}")
    print(f"Precision: {third_metrics['precision']}")
    print(f"Recall: {third_metrics['recall']}")
    print(f"F1 Score: {third_metrics['f1_score']}")
    print(f"AUC: {third_metrics['roc_auc_score']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Prediction"


Confusion Matrix for Each Layer:

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         7054           44
Actual 1         3062         4938
Accuracy: 0.7942773877334747
Precision: 0.9911682055399438
Recall: 0.61725
F1 Score: 0.7607456478200586
AUC: 0.8055255353620738

Second Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         6908          146
Actual 1         1028         2034
Accuracy: 0.883946223803875
Precision: 0.9330275229357798
Recall: 0.6642717178314826
F1 Score: 0.7760396795116368
AUC: 0.8217871206112333

Third Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         6884           24
Actual 1          653          375
Accuracy: 0.9146925403225806
Precision: 0.9398496240601504
Recall: 0.3647859922178988
F1 Score: 0.5255781359495444
AUC: 0.6806558797221515


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]


In [38]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']
    y_pred = df['Prediction']
    y_scores = df['Prediction'].astype(float)
    
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }

def process_name_matching(file_path, thresholds, output_dir="output"):
    # Ensure output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Load data
    df = pd.read_csv(file_path)
    results = []
    combined_predictions = pd.DataFrame()

    for threshold in thresholds:
        # Apply threshold
        df[f"Score_{threshold}"] = df.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
        df["Prediction"] = df[f"Score_{threshold}"] >= threshold
        
        # Calculate metrics
        metrics = calculate_metrics(df)
        metrics["threshold"] = threshold
        results.append(metrics)

        # Save layer-wise data
        df.to_csv(f"{output_dir}/results_threshold_{int(threshold * 100)}.csv", index=False)

        # Combine passing rows for final prediction
        combined_predictions = pd.concat([combined_predictions, df[df["Prediction"]]])

    # Save combined data
    combined_predictions.to_csv(f"{output_dir}/combined_predictions.csv", index=False)

    return results, combined_predictions

if __name__ == "__main__":
    thresholds = [0.60, 0.65, 0.70,0.75, 0.80, 0.85, 0.90, 0.95, 1]  # Define thresholds for evaluation
    metrics, combined_data = process_name_matching("combined_data_20k.csv", thresholds)

    print("\nMetrics Summary:")
    for metric in metrics:
        print(f"\nThreshold: {metric['threshold']}")
        print(f"Accuracy: {metric['accuracy']}")
        print(f"Precision: {metric['precision']}")
        print(f"Recall: {metric['recall']}")
        print(f"F1 Score: {metric['f1_score']}")
        print(f"AUC: {metric['roc_auc_score']}")
        print("Confusion Matrix:")
        print(pd.DataFrame(
            metric["confusion_matrix"],
            index=["Actual 0", "Actual 1"],
            columns=["Predicted 0", "Predicted 1"]
        ))



Metrics Summary:

Threshold: 0.6
Accuracy: 0.9173739147400254
Precision: 0.956370534302877
Recall: 0.8763882182520522
F1 Score: 0.9146341463414634
AUC: 0.917793990864471
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         9733          414
Actual 1         1280         9075

Threshold: 0.65
Accuracy: 0.8788898644034728
Precision: 0.9758220502901354
Recall: 0.7795267986479961
F1 Score: 0.8666988779728351
AUC: 0.8799082697290439
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         9947          200
Actual 1         2283         8072

Threshold: 0.7
Accuracy: 0.8651351087698761
Precision: 0.9896774193548387
Recall: 0.7407049734427813
F1 Score: 0.8472797569732119
AUC: 0.866410434883409
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0        10067           80
Actual 1         2685         7670

Threshold: 0.75
Accuracy: 0.848990342405619
Precision: 0.997805513646962
Recall: 0.7025591501690005
F1 Score: 0.824549472968378
AUC: 0.85049116471690

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert boolean to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    
    return metrics

def process_name_matching(file_path):
    # Load data for first layer
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    # First Layer
    df_layer1["First_Layer_Score"] = df_layer1.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df_layer1["First_Layer_Pass"] = df_layer1["First_Layer_Score"] >= threshold
    df_layer1["Prediction"] = df_layer1["First_Layer_Pass"]
    
    # Save first layer result
    df_layer1.to_csv("first_layer_results_20k.csv", index=False)
    first_layer_metrics = calculate_metrics(df_layer1)

    # Second Layer
    df_layer2 = df_layer1[df_layer1["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    
    # Save second layer result
    df_layer2.to_csv("second_layer_results_20k.csv", index=False)
    second_layer_metrics = calculate_metrics(df_layer2)

    # Third Layer
    df_layer3 = df_layer2[df_layer2["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    
    # Save third layer result
    df_layer3.to_csv("third_layer_results_20k.csv", index=False)
    third_layer_metrics = calculate_metrics(df_layer3)
    
    # Combine results
    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"]],
        df_layer2[df_layer2["Prediction"]],
        df_layer3[df_layer3["Prediction"]]
    ])

    # Final output
    df_layer1["Prediction"] = False
    df_layer1.loc[df_layer1[df_layer1["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer2[df_layer2["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer3[df_layer3["Prediction"]].index, "Prediction"] = True

    return first_layer_metrics, second_layer_metrics, third_layer_metrics, df_combined

if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, combined_data = process_name_matching("combined_data_20k.csv")

    print("\nConfusion Matrix for Each Layer:")

    # First Layer Confusion Matrix
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {first_metrics['accuracy']}")
    print(f"Precision: {first_metrics['precision']}")
    print(f"Recall: {first_metrics['recall']}")
    print(f"F1 Score: {first_metrics['f1_score']}")
    print(f"AUC: {first_metrics['roc_auc_score']}")

    # Second Layer Confusion Matrix
    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {second_metrics['accuracy']}")
    print(f"Precision: {second_metrics['precision']}")
    print(f"Recall: {second_metrics['recall']}")
    print(f"F1 Score: {second_metrics['f1_score']}")
    print(f"AUC: {second_metrics['roc_auc_score']}")

    # Third Layer Confusion Matrix
    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {third_metrics['accuracy']}")
    print(f"Precision: {third_metrics['precision']}")
    print(f"Recall: {third_metrics['recall']}")
    print(f"F1 Score: {third_metrics['f1_score']}")
    print(f"AUC: {third_metrics['roc_auc_score']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Prediction"


Confusion Matrix for Each Layer:

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0        10147            0
Actual 1         3693         6662
Accuracy: 0.8198712320749195
Precision: 1.0
Recall: 0.6433606953162724
F1 Score: 0.7829817241581948
AUC: 0.8216803476581362

Second Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0        10146            1
Actual 1          807         2886
Accuracy: 0.9416184971098266
Precision: 0.9996536196744025
Recall: 0.7814784727863525
F1 Score: 0.8772036474164133
AUC: 0.8906899607452016

Third Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0        10146            0
Actual 1          402          405
Accuracy: 0.9632977266502328
Precision: 1.0
Recall: 0.5018587360594795
F1 Score: 0.6683168316831682
AUC: 0.7509293680297398


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]


In [15]:
df_20k=pd.read_csv("combined_data_20k.csv")

In [17]:
df_20k['labels'].value_counts()

labels
1    10355
0    10147
Name: count, dtype: int64

## each layer result along with the csv file 

In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def calculate_metrics(df):
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert boolean to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist()
    }
    
    return metrics

def process_name_matching(file_path):
    # Load data for first layer
    df_layer1 = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    # First Layer
    df_layer1["First_Layer_Score"] = df_layer1.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df_layer1["First_Layer_Pass"] = df_layer1["First_Layer_Score"] >= threshold
    df_layer1["Prediction"] = df_layer1["First_Layer_Pass"]
    
    # Save first layer result
    df_layer1.to_csv("first_layer_results_10k.csv", index=False)
    first_layer_metrics = calculate_metrics(df_layer1)

    # Second Layer
    df_layer2 = df_layer1[df_layer1["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    
    # Save second layer result
    df_layer2.to_csv("second_layer_results_10k.csv", index=False)
    second_layer_metrics = calculate_metrics(df_layer2)

    # Third Layer
    df_layer3 = df_layer2[df_layer2["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    
    # Save third layer result
    df_layer3.to_csv("third_layer_results_10k.csv", index=False)
    third_layer_metrics = calculate_metrics(df_layer3)
    
    # Combine results
    df_combined = pd.concat([
        df_layer1[df_layer1["Prediction"]],
        df_layer2[df_layer2["Prediction"]],
        df_layer3[df_layer3["Prediction"]]
    ])

    # Final output
    df_layer1["Prediction"] = False
    df_layer1.loc[df_layer1[df_layer1["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer2[df_layer2["Prediction"]].index, "Prediction"] = True
    df_layer1.loc[df_layer3[df_layer3["Prediction"]].index, "Prediction"] = True

    return first_layer_metrics, second_layer_metrics, third_layer_metrics, df_combined

if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, combined_data = process_name_matching("combined_data_10k.csv")

    print("\nConfusion Matrix for Each Layer:")

    # First Layer Confusion Matrix
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {first_metrics['accuracy']}")
    print(f"Precision: {first_metrics['precision']}")
    print(f"Recall: {first_metrics['recall']}")
    print(f"F1 Score: {first_metrics['f1_score']}")
    print(f"AUC: {first_metrics['roc_auc_score']}")

    # Second Layer Confusion Matrix
    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {second_metrics['accuracy']}")
    print(f"Precision: {second_metrics['precision']}")
    print(f"Recall: {second_metrics['recall']}")
    print(f"F1 Score: {second_metrics['f1_score']}")
    print(f"AUC: {second_metrics['roc_auc_score']}")

    # Third Layer Confusion Matrix
    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy: {third_metrics['accuracy']}")
    print(f"Precision: {third_metrics['precision']}")
    print(f"Recall: {third_metrics['recall']}")
    print(f"F1 Score: {third_metrics['f1_score']}")
    print(f"AUC: {third_metrics['roc_auc_score']}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer2["Prediction"


Confusion Matrix for Each Layer:

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5178            1
Actual 1         1998         2823
Accuracy: 0.8001
Precision: 0.9996458923512748
Recall: 0.5855631611698817
F1 Score: 0.7385219097449313
AUC: 0.7926850368506292

Second Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5177            1
Actual 1          372         1626
Accuracy: 0.9480211817168339
Precision: 0.9993853718500307
Recall: 0.8138138138138138
F1 Score: 0.8971034482758621
AUC: 0.9068103445276099

Third Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5163           14
Actual 1          209          163
Accuracy: 0.9598125788430348
Precision: 0.9209039548022598
Recall: 0.4381720430107527
F1 Score: 0.5938069216757741
AUC: 0.7177338870645805


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]


In [18]:
df_10k = pd.read_csv("combined_data_10k.csv")

In [19]:
df_10k['labels'].value_counts()

labels
0    5179
1    4821
Name: count, dtype: int64

## without csv of each layer

In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    classification_report
)

def process_name_matching(file_path):
# def process_name_matching(file_path, num_rows=10):

    # Load data
    df = pd.read_csv(file_path)
    # df=df.head(10)
    # df = pd.read_csv(file_path, nrows=num_rows)
    threshold = 0.80
    threshold1 = 0.65


    df["First_Layer_Score"] = df.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df["First_Layer_Pass"] = df["First_Layer_Score"] >= threshold
    df_layer2 = df[~df["First_Layer_Pass"]].copy()

    
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer3 = df_layer2[~df_layer2["Second_Layer_Pass"]].copy()

    
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)



    df_combined = pd.concat([
        df[df["First_Layer_Pass"]],
        df_layer2[df_layer2["Second_Layer_Pass"]],
        df_layer3[df_layer3["Third_Layer_Pass"]],

    ])
    
    df_combined.to_csv("layer_combine.csv",index=False)
    df["Prediction"] = False
    df.loc[df["First_Layer_Pass"], "Prediction"] = True
    df.loc[df_layer2.index[df_layer2["Second_Layer_Pass"]], "Prediction"] = True
    df.loc[df_layer3.index[df_layer3["Third_Layer_Pass"]], "Prediction"] = True



    df["First_Layer_Score"] = df.get("First_Layer_Score", None)
    df["First_Layer_Pass"] = df.get("First_Layer_Pass", None)
    df["Second_Layer_Score"] = df_layer2.get("Second_Layer_Score", None)
    df["Second_Layer_Pass"] = df_layer2.get("Second_Layer_Pass", None)
    # df["Third_Layer_Pass"] = df_layer3.get("Third_Layer_Pass", None)
    
   


    df.to_csv('last_result.csv')

    
    y_true = df['labels']
    y_pred = df['Prediction']
    y_scores = df['Prediction'].astype(float) 


    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
        "classification_report": classification_report(y_true, y_pred, output_dict=True)
    }

    correlation_matrix = df.select_dtypes(include=[np.number]).corr()

    print("Updated DataFrame with new predictions and indicator column:")
    print(df.head())


    return metrics, df, df_combined, correlation_matrix

if __name__ == "__main__":
    # nrows = 10 
    # metrics, full_data, combined_data, correlation_matrix = process_name_matching("20k_balaned_data.csv", num_rows=nrows)
    metrics, full_data, combined_data, correlation_matrix = process_name_matching("combined_data_20k.csv")
    cm = metrics["confusion_matrix"]
    cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
    print("Confusion Matrix:")
    print(cm_df)

    print("\nCorrelation Matrix:")
    print(correlation_matrix)

    print("\nMetrics:")
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1_score']}")
    print(f"AUC: {metrics['roc_auc_score']}")


Updated DataFrame with new predictions and indicator column:
         id merchant_type                 created_at     status docType  \
0  47566417   UNORGANIZED  2024-01-01 12:53:41+00:00  ACTIVATED  PAN_NO   
1  48465215   UNORGANIZED  2024-02-14 14:22:48+00:00  ACTIVATED  PAN_NO   
2  48036361   UNORGANIZED  2024-01-29 16:40:52+00:00  ACTIVATED  PAN_NO   
3  46063565   UNORGANIZED  2023-10-06 19:39:01+00:00  ACTIVATED  PAN_NO   
4  45539504           DIY  2023-09-12 19:00:36+00:00  ACTIVATED  PAN_NO   

               name                     pan_createdAt pan_status  \
0             DAILI         2024-01-01 07:36:53+00:00   APPROVED   
1  UMESH KUMAR NAIK  2024-02-14 09:00:02.613000+00:00   APPROVED   
2      KSHAMA GUPTA  2024-01-29 11:18:32.509000+00:00   APPROVED   
3      AFJAL ANSARI         2023-10-06 14:20:17+00:00   APPROVED   
4         AMIN ALAM         2023-09-12 13:38:57+00:00   APPROVED   

          beneficiary_name bank_status  ... dsFlag  szScore flScore   SCORE  \


In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

def calculate_metrics(file_path):
    df = pd.read_csv(file_path)
    y_true = df['labels']  # True labels
    y_pred = df['Prediction']  # Predicted labels
    y_scores = df['Prediction'].astype(float)  # Convert boolean to float for AUC calculation
    
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
        "classification_report": classification_report(y_true, y_pred, output_dict=True)
    }
    
    return metrics

def process_name_matching(file_path):
    # Load data
    df = pd.read_csv(file_path)
    threshold = 0.80
    threshold1 = 0.65

    # First Layer
    df["First_Layer_Score"] = df.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df["First_Layer_Pass"] = df["First_Layer_Score"] >= threshold
    df["Prediction"] = df["First_Layer_Pass"]
    df.to_csv("first_layer_results.csv", index=False)
    first_layer_metrics = calculate_metrics("first_layer_results.csv")
    
    print("\nFirst Layer Metrics:")
    print(first_layer_metrics)
    first_cm = first_layer_metrics["confusion_matrix"]
    print("\nFirst Layer Confusion Matrix:")
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print("\nFirst Layer Metrics Report:")
    print(f"Accuracy: {first_layer_metrics['accuracy']}")
    print(f"Precision: {first_layer_metrics['precision']}")
    print(f"Recall: {first_layer_metrics['recall']}")
    print(f"F1 Score: {first_layer_metrics['f1_score']}")
    print(f"AUC: {first_layer_metrics['roc_auc_score']}")

    # Second Layer
    df_layer2 = pd.read_csv("first_layer_results.csv")  # Read first layer output
    df_layer2 = df_layer2[df_layer2["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
    df_layer2["Prediction"] = df_layer2["Second_Layer_Pass"]
    df_layer2.to_csv("second_layer_results.csv", index=False)
    second_layer_metrics = calculate_metrics("second_layer_results.csv")
    
    print("\nSecond Layer Metrics:")
    print(second_layer_metrics)
    second_cm = second_layer_metrics["confusion_matrix"]
    print("\nSecond Layer Confusion Matrix:")
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print("\nSecond Layer Metrics Report:")
    print(f"Accuracy: {second_layer_metrics['accuracy']}")
    print(f"Precision: {second_layer_metrics['precision']}")
    print(f"Recall: {second_layer_metrics['recall']}")
    print(f"F1 Score: {second_layer_metrics['f1_score']}")
    print(f"AUC: {second_layer_metrics['roc_auc_score']}")

    # Third Layer
    df_layer3 = pd.read_csv("second_layer_results.csv")  # Read second layer output
    df_layer3 = df_layer3[df_layer3["Prediction"] == False]  # Filter rows where Prediction == False
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)
    df_layer3["Prediction"] = df_layer3["Third_Layer_Pass"]
    df_layer3.to_csv("third_layer_results.csv", index=False)
    third_layer_metrics = calculate_metrics("third_layer_results.csv")
    
    print("\nThird Layer Metrics:")
    print(third_layer_metrics)
    third_cm = third_layer_metrics["confusion_matrix"]
    print("\nThird Layer Confusion Matrix:")
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print("\nThird Layer Metrics Report:")
    print(f"Accuracy: {third_layer_metrics['accuracy']}")
    print(f"Precision: {third_layer_metrics['precision']}")
    print(f"Recall: {third_layer_metrics['recall']}")
    print(f"F1 Score: {third_layer_metrics['f1_score']}")
    print(f"AUC: {third_layer_metrics['roc_auc_score']}")

    # Combine results
    df_first_layer = pd.read_csv("first_layer_results.csv")
    df_second_layer = pd.read_csv("second_layer_results.csv")
    df_third_layer = pd.read_csv("third_layer_results.csv")
    df_combined = pd.concat([
        df_first_layer[df_first_layer["Prediction"]],
        df_second_layer[df_second_layer["Prediction"]],
        df_third_layer[df_third_layer["Prediction"]]
    ])
    df_combined.to_csv("combined_layer_results.csv", index=False)

    # Final output
    df = pd.read_csv(file_path)  # Reload the original file
    df["Prediction"] = False
    df.loc[df_first_layer[df_first_layer["Prediction"]].index, "Prediction"] = True
    df.loc[df_second_layer[df_second_layer["Prediction"]].index, "Prediction"] = True
    df.loc[df_third_layer[df_third_layer["Prediction"]].index, "Prediction"] = True
    df.to_csv("final_results.csv", index=False)

    # Final Metrics
    final_metrics = calculate_metrics("final_results.csv")
    print("\nFinal Metrics:")
    print(final_metrics)
    final_cm = final_metrics["confusion_matrix"]
    print("\nFinal Confusion Matrix:")
    print(pd.DataFrame(final_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print("\nFinal Metrics Report:")
    print(f"Accuracy: {final_metrics['accuracy']}")
    print(f"Precision: {final_metrics['precision']}")
    print(f"Recall: {final_metrics['recall']}")
    print(f"F1 Score: {final_metrics['f1_score']}")
    print(f"AUC: {final_metrics['roc_auc_score']}")

    # Correlation Matrix for final results
    correlation_matrix = df.select_dtypes(include=[np.number]).corr()
    print("\nCorrelation Matrix:")
    print(correlation_matrix)

    return first_layer_metrics, second_layer_metrics, third_layer_metrics, final_metrics, df_combined


if __name__ == "__main__":
    first_metrics, second_metrics, third_metrics, final_metrics, combined_data = process_name_matching("combined_data_10k.csv")

    print("\nConfusion Matrix for Each Layer:")
    print("\nFirst Layer Confusion Matrix:")
    first_cm = first_metrics["confusion_matrix"]
    print(pd.DataFrame(first_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))

    print("\nSecond Layer Confusion Matrix:")
    second_cm = second_metrics["confusion_matrix"]
    print(pd.DataFrame(second_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))

    print("\nThird Layer Confusion Matrix:")
    third_cm = third_metrics["confusion_matrix"]
    print(pd.DataFrame(third_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))

    print("\nFinal Confusion Matrix:")
    final_cm = final_metrics["confusion_matrix"]
    print(pd.DataFrame(final_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))



First Layer Metrics:
{'accuracy': 0.8001, 'precision': 0.9996458923512748, 'recall': 0.5855631611698817, 'f1_score': 0.7385219097449313, 'roc_auc_score': 0.7926850368506292, 'confusion_matrix': [[5178, 1], [1998, 2823]], 'classification_report': {'0': {'precision': 0.7215719063545151, 'recall': 0.9998069125313768, 'f1-score': 0.8382031566167544, 'support': 5179.0}, '1': {'precision': 0.9996458923512748, 'recall': 0.5855631611698817, 'f1-score': 0.7385219097449313, 'support': 4821.0}, 'accuracy': 0.8001, 'macro avg': {'precision': 0.8606088993528949, 'recall': 0.7926850368506293, 'f1-score': 0.7883625331808428, 'support': 10000.0}, 'weighted avg': {'precision': 0.855631375003553, 'recall': 0.8001, 'f1-score': 0.7901468274998484, 'support': 10000.0}}}

First Layer Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         5178            1
Actual 1         1998         2823

First Layer Metrics Report:
Accuracy: 0.8001
Precision: 0.9996458923512748
Recall: 0.5855631611698817


## on 60k

In [28]:
## on unseen 60k data 

In [84]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    classification_report
)

def process_name_matching(file_path):
# def process_name_matching(file_path, num_rows=10):

    # Load data
    df = pd.read_csv(file_path)
    # df=df.head(10)
    # df = pd.read_csv(file_path, nrows=num_rows)
    threshold = 0.80
    threshold1 = 0.65


    df["First_Layer_Score"] = df.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df["First_Layer_Pass"] = df["First_Layer_Score"] >= threshold
    df_layer2 = df[~df["First_Layer_Pass"]].copy()

    
#     df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
#     df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1
#     df_layer3 = df_layer2[~df_layer2["Second_Layer_Pass"]].copy()

    
#     df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)

    df_layer2["Second_Layer_Score"] = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)

    # Determine if Second Layer passes
    df_layer2["Second_Layer_Pass"] = df_layer2["Second_Layer_Score"] >= threshold1

    # Filter out fail cases for the third layer
    df_layer3 = df_layer2[~df_layer2["Second_Layer_Pass"]].copy()

    # Validation: Ensure all fail cases are passed to the third layer
    fail_cases_in_layer2 = df_layer2[~df_layer2["Second_Layer_Pass"]]
    assert fail_cases_in_layer2.shape[0] == df_layer3.shape[0], "Not all fail cases are passed to the third layer!"
    assert set(fail_cases_in_layer2.index) == set(df_layer3.index), "Mismatch in row indices between failing cases and third layer!"

    # Process Third Layer on fail cases
    df_layer3["Third_Layer_Pass"] = df_layer3.apply(process_false_cases, axis=1)



    df_combined = pd.concat([
        df[df["First_Layer_Pass"]],
        df_layer2[df_layer2["Second_Layer_Pass"]],
        df_layer3[df_layer3["Third_Layer_Pass"]],

    ])
    
    df_combined.to_csv("weight_combine.csv",index=False)
    df["Prediction"] = False
    df.loc[df["First_Layer_Pass"], "Prediction"] = True
    df.loc[df_layer2.index[df_layer2["Second_Layer_Pass"]], "Prediction"] = True
    df.loc[df_layer3.index[df_layer3["Third_Layer_Pass"]], "Prediction"] = True



    df["First_Layer_Score"] = df.get("First_Layer_Score", None)
    df["First_Layer_Pass"] = df.get("First_Layer_Pass", None)
    df["Second_Layer_Score"] = df_layer2.get("Second_Layer_Score", None)
    df["Second_Layer_Pass"] = df_layer2.get("Second_Layer_Pass", None)
    df["Third_Layer_Pass"] = df_layer3.get("Third_Layer_Pass", None)
    
   


    df.to_csv('layer_weight.csv')

    
    y_true = df['labels']
    y_pred = df['Prediction']
    y_scores = df['Prediction'].astype(float) 


    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc_score": roc_auc_score(y_true, y_scores),
        "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(),
        "classification_report": classification_report(y_true, y_pred, output_dict=True)
    }

    correlation_matrix = df.select_dtypes(include=[np.number]).corr()

    print("Updated DataFrame with new predictions and indicator column:")
    print(df.head())


    return metrics, df, df_combined, correlation_matrix

if __name__ == "__main__":
    # nrows = 10 
    # metrics, full_data, combined_data, correlation_matrix = process_name_matching("20k_balaned_data.csv", num_rows=nrows)
    metrics, full_data, combined_data, correlation_matrix = process_name_matching("combined_data_60k.csv")
    cm = metrics["confusion_matrix"]
    cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
    print("Confusion Matrix:")
    print(cm_df)

    print("\nCorrelation Matrix:")
    print(correlation_matrix)

    print("\nMetrics:")
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1_score']}")
    print(f"AUC: {metrics['roc_auc_score']}")


  df = pd.read_csv(file_path)


Updated DataFrame with new predictions and indicator column:
         id merchant_type                 created_at     status docType  \
0  50255782   UNORGANIZED  2024-06-15 18:49:10+00:00  ACTIVATED  PAN_NO   
1  47708404           DIY  2024-01-08 21:42:28+00:00  ACTIVATED  PAN_NO   
2  48873693   UNORGANIZED  2024-03-06 18:16:04+00:00  ACTIVATED  PAN_NO   
3  48347356   UNORGANIZED  2024-02-10 14:59:20+00:00  ACTIVATED  PAN_NO   
4  49433305   UNORGANIZED  2024-04-11 14:17:55+00:00  ACTIVATED  PAN_NO   

                       name                     pan_createdAt pan_status  \
0  DHAARANI VELU RENUGADEVI  2024-06-15 13:44:18.424000+00:00   APPROVED   
1                 KAMRUDDIN         2024-01-09 07:55:06+00:00   APPROVED   
2                 MOHD AZAM  2024-03-06 12:53:41.360000+00:00   APPROVED   
3                 SANA BANO  2024-02-10 09:34:53.174000+00:00   APPROVED   
4          DHIRAJ KUMAR RAY  2024-04-11 08:54:19.945000+00:00   APPROVED   

       beneficiary_name bank_st

## name match function 

In [66]:
import pandas as pd

def process_name_matching_from_names(name1, name2):
    """
    Process name matching through multiple layers with given name1 and name2.
    Returns a DataFrame with layer scores and final prediction.
    """
    # Define thresholds
    threshold = 0.80
    threshold1 = 0.65

    # Create a DataFrame with the given names
    data = {
        'name1': [name1],
        'name2': [name2],
    }
    df = pd.DataFrame(data)

    # First Layer: Fuzzy Similarity
    df["First_Layer_Score"] = df.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df["First_Layer_Pass"] = df["First_Layer_Score"] >= threshold

    # If the first layer passes, set prediction and return the DataFrame
    if df["First_Layer_Pass"].iloc[0]:
        df["Prediction"] = True
        return df

    # Second Layer: Name Match
    df["Second_Layer_Score"] = df.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    df["Second_Layer_Pass"] = df["Second_Layer_Score"] >= threshold1

    # If the second layer passes, set prediction and return the DataFrame
    if df["Second_Layer_Pass"].iloc[0]:
        df["Prediction"] = True
        return df

    # Third Layer: Handle False Cases
    df["Third_Layer_Pass"] = df.apply(process_false_cases, axis=1)

    # Final Prediction
    df["Prediction"] = df["Third_Layer_Pass"]

    return df


if __name__ == "__main__":
    name1 = "Sara Trader"
    name2 = "Sara Tendulkar"
    df_result = process_name_matching_from_names(name1, name2)
    print("Result DataFrame with layer results and prediction:")
    print(df_result)


Result DataFrame with layer results and prediction:
         name1           name2  First_Layer_Score  First_Layer_Pass  \
0  Sara Trader  Sara Tendulkar               0.64             False   

   Second_Layer_Score  Second_Layer_Pass  Third_Layer_Pass  Prediction  
0            0.571201              False             False       False  


In [12]:
import pandas as pd

def process_name_matching_from_names(name1, name2):
    threshold = 0.80

    # Initial data setup
    data = {
        'name1': [name1],
        'name2': [name2],
        # 'label': [0],  # Assuming 'label' is set to 1 as specified
    }
    df = pd.DataFrame(data)

    # First Layer - Fuzzy Similarity
    df["First_Layer_Score"] = df.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
    df["First_Layer_Pass"] = df["First_Layer_Score"] >= threshold

    # Second Layer - Name Match
    name_match_result = df.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)
    
    # Check if name_match_result has multiple columns
    if not name_match_result.empty and isinstance(name_match_result.iloc[0], pd.Series):
        for i, col_name in enumerate(name_match_result.iloc[0].index):
            df[f"Second_Layer_Score_{col_name}"] = name_match_result.apply(lambda x: x[i])
        df["Second_Layer_Score"] = name_match_result.apply(lambda x: x[0])
    else:
        df["Second_Layer_Score"] = name_match_result

    df["Second_Layer_Pass"] = df["Second_Layer_Score"] >= threshold

    # Third Layer - Initial Matching
    df["Third_Layer_Score"] = df.apply(lambda x: check_initial_similarity(x['name1'], x['name2']), axis=1)

    # Initialize Prediction column with False
    df["Prediction"] = False

    # Initialize fuzzy_SS column with None for all rows
    df["fuzzy_SS"] = None

    # Update Prediction for rows that pass each layer
    df.loc[df["First_Layer_Pass"], "Prediction"] = True
    df.loc[df.get("Second_Layer_Pass", pd.Series([False] * len(df))), "Prediction"] = True
    df.loc[df.get("Third_Layer_Pass", pd.Series([False] * len(df))), "Prediction"] = True

    # Additional processing for rows failing all layers
    failed_cases = df[(df['label'] == 0) & (df['Prediction'] == True)]

    # Run additional fuzzy similarity checks on failed cases
    for index, row in failed_cases.iterrows():
        name1 = row['name1']
        name2 = row['name2']
        fuzzy_SS = calculate_fuzzy_similarity_processed(name1, name2)

        # Set fuzzy_SS for failed cases
        df.at[index, 'fuzzy_SS'] = fuzzy_SS

        # Determine fuzzy_flag based on threshold and keywords
        fuzzy_flag = 1 if fuzzy_SS >= threshold else 0
        if fuzzy_flag == 1:
            fuzzy_flag = check_keywords_in_names(name1, name2)

        prediction_value = True if fuzzy_flag == 1 else False

        # Update failed cases directly in Prediction
        df.at[index, 'Prediction'] = prediction_value

    return df

if __name__ == "__main__":
    # name1 = "DHANUSH MUKESH PATEL"
    # name2 = "C M INFOSYS"
    name1 = "Sara Trader"
    name2 = "Sara"
    df_combined = process_name_matching_from_names(name1, name2)
    print("Final DataFrame with layer results, fuzzy_SS, and updated predictions:\n", df_combined)


Final DataFrame with layer results, fuzzy_SS, and updated predictions:
          name1 name2  label  First_Layer_Score  First_Layer_Pass  \
0  Sara Trader  Sara      0              0.765             False   

   Second_Layer_Score  Second_Layer_Pass  Third_Layer_Score  Prediction  \
0            0.812393               True                1.0       False   

  fuzzy_SS  
0    0.765  


In [13]:
df_combined

Unnamed: 0,name1,name2,label,First_Layer_Score,First_Layer_Pass,Second_Layer_Score,Second_Layer_Pass,Third_Layer_Score,Prediction,fuzzy_SS
0,Sara Trader,Sara,0,0.765,False,0.812393,True,1.0,False,0.765
