## on 20K data

In [84]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split


In [85]:
df=pd.read_csv("layer2score.csv")

In [86]:
df.shape

(12557, 30)

In [90]:
df[['embedding_similarity', 'phonetic_similarity', 
                    'jaccard_similarity','labels']].head()

Unnamed: 0,embedding_similarity,phonetic_similarity,jaccard_similarity,labels
0,0.358638,0.5,0.555556,0
1,0.168768,0.5,0.166667,0
2,0.512135,0.866667,0.666667,0
3,0.066743,0.5,0.3,0
4,-0.02687,0.883333,0.428571,0


In [91]:
df['labels'].value_counts()

labels
0    10147
1     2410
Name: count, dtype: int64

In [107]:
numeric_features = ['embedding_similarity', 'phonetic_similarity', 
                    'jaccard_similarity']

X = df[numeric_features]  # Features
y = df['labels']  # The target variable

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23456)


In [156]:
from sklearn.ensemble import RandomForestClassifier

feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest = RandomForestClassifier(n_estimators=100,
    criterion='gini',
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=10,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
    monotonic_cst=None,)
forest.fit(X_train, y_train)

In [157]:
importances = forest.feature_importances_

In [158]:
importances

array([0.28646091, 0.12301063, 0.59052846])

In [159]:
forest_importances = pd.Series(importances, index=['embedding_similarity', 'phonetic_similarity', 'jaccard_similarity'])


In [160]:
forest_importances

embedding_similarity    0.286461
phonetic_similarity     0.123011
jaccard_similarity      0.590528
dtype: float64

In [161]:
from sklearn.metrics import roc_auc_score  # Import roc_auc_score


In [162]:


# Predict probabilities for the positive class (y=1) on the test set
y_test_pred_prob = forest.predict_proba(X_test[numeric_features])[:, 1]
y_train_pred_prob = forest.predict_proba(X_train[numeric_features])[:, 1]

# Calculate AUC (Area Under the ROC Curve)
auc_test = roc_auc_score(y_test, y_test_pred_prob)
auc_train = roc_auc_score(y_train, y_train_pred_prob)

print('train',auc_train)
print('test',auc_test)
print('overfit',auc_train-auc_test)

train 0.965315341994219
test 0.9620298492625795
overfit 0.003285492731639472


In [163]:
0.9653808013970918-0.9620223937246963

0.0033584076723954803

In [133]:
forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 10,
 'min_samples_split': 20,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [67]:
df1=pd.read_csv("second_layer_score.csv")

In [68]:
df1['labels'].value_counts()/df1.shape[0]

labels
0    0.721572
1    0.278428
Name: count, dtype: float64

In [69]:
df1.shape

(7176, 30)

In [70]:
df1.columns

Index(['id', 'merchant_type', 'created_at', 'status', 'docType', 'name',
       'pan_createdAt', 'pan_status', 'beneficiary_name', 'bank_status',
       'account_number', 'szFlag', 'hvFlag', 'name1', 'name2', 'dsScore',
       'flFlag', 'dsFlag', 'szScore', 'flScore', 'SCORE', 'labels',
       'First_Layer_Score', 'First_Layer_Pass', 'Prediction',
       'embedding_similarity', 'levenshtein_similarity', 'phonetic_similarity',
       'jaccard_similarity', 'fuzzy_similarity'],
      dtype='object')

In [71]:

numeric_features = ['embedding_similarity', 'phonetic_similarity', 
                    'jaccard_similarity']

# Predict probabilities for the positive class (y=1) on the test set
pred_prob_df1 = forest.predict_proba(df1[numeric_features])[:, 1]
# y_train_pred_prob = forest.predict_proba(df1[numeric_features])[:, 1]

# Calculate AUC (Area Under the ROC Curve)
auc_df1 = roc_auc_score(df1['labels'], pred_prob_df1)
# auc_train = roc_auc_score(y_train, y_train_pred_prob)

print('Validation',auc_df1)
# print('test',auc_test)

Validation 0.9730963099058889


In [72]:
1-0.9730963099058889

0.026903690094111088

## to calcte the score column for layer 2 

In [None]:
 # return {
 #        "embedding_similarity": embedding_similarity,
 #        "levenshtein_similarity": levenshtein_similarity,
 #        "phonetic_similarity": phonetic_similarity,
 #        "jaccard_similarity": jaccard_similarity,
 #        "fuzzy_similarity": fuzzy_similarity
 #    }

In [None]:
# def process_name_matching(file_path):
#     # Load data for first layer
#     df_layer1 = pd.read_csv(file_path)
#     threshold = 0.80
#     threshold1 = 0.65

#     # First Layer: Calculate Fuzzy Similarity
#     df_layer1["First_Layer_Score"] = df_layer1.apply(lambda x: calculate_fuzzy_similarity(x['name1'], x['name2']), axis=1)
#     df_layer1["First_Layer_Pass"] = df_layer1["First_Layer_Score"] >= threshold
#     df_layer1["Prediction"] = df_layer1["First_Layer_Pass"]
    
#     # Save first layer result (including similarity scores)
#     df_layer1.to_csv("first_layer_results.csv", index=False)

#     # Second Layer: Filter out failed predictions from the first layer
#     df_layer2 = df_layer1[df_layer1["Prediction"] == False]  # Rows where Prediction == False

#     # Calculate similarity scores for Layer 2 cases using name_match function
#     similarity_scores = df_layer2.apply(lambda x: name_match(x['name1'], x['name2']), axis=1)

#     # Flatten the similarity scores and merge them into the DataFrame
#     df_layer2["embedding_similarity"] = similarity_scores.apply(lambda x: x["embedding_similarity"])
#     df_layer2["levenshtein_similarity"] = similarity_scores.apply(lambda x: x["levenshtein_similarity"])
#     df_layer2["phonetic_similarity"] = similarity_scores.apply(lambda x: x["phonetic_similarity"])
#     df_layer2["jaccard_similarity"] = similarity_scores.apply(lambda x: x["jaccard_similarity"])
#     df_layer2["fuzzy_similarity"] = similarity_scores.apply(lambda x: x["fuzzy_similarity"])

    
#     # Save second layer result (including all similarity scores)
#     df_layer2.to_csv("second_layer_results.csv", index=False)

#     # Combine both layers results into a final DataFrame for cases passing the second layer
#     # Combine both layer results (including predictions from layer 1 and similarity scores from layer 2)
#     df_combined = pd.concat([df_layer1, df_layer2])

#     # Save the final combined results (including similarity scores from both layers)
#     df_combined.to_csv("final_results_with_similarity_scores.csv", index=False)

#     return df_combined

# if __name__ == "__main__":
#     # Assuming your file is named "combined_data_10k.csv"
#     combined_data = process_name_matching("combined_data_10k.csv")
#     print("Process completed. The results are saved in 'final_results_with_similarity_scores.csv'.")
