# Fuzzy matching for Indian Builtwith websites and Aberdeen names

In [1]:
# Import relevant libraries
import pandas as pd 
from thefuzz import fuzz
from thefuzz import process
import numpy as np
import json

In [2]:
# Read data of builtwith website
mult_builtwith_ind = pd.read_csv("../../Data/India/processed_data/new_ids_cases_for_fuzzy_builtwith_IND.csv")
mult_builtwith_ind.head(30)

Unnamed: 0,old_id,new_id,panjiva_raw_firm_name,builtwith_website,different_old_ids,n_url,n_total_old_ids,mode_url_count,share_url_mode,obs_has_mode,is_multimodal,unimodal,only_one_url_retrieved
0,IND121408,IND100726,Universal Imp. Exp. And Hospitality Private Li,hotelairlink.com,True,1,2,1,,1,1,0,0
1,IND201797,IND100726,Universal Exporters,universalexporter.in,True,1,2,1,,1,1,0,0
2,IND152668,IND100901,Aadhunik Industrie,aadhunikindustries.com,True,1,2,1,,1,1,0,0
3,IND24316,IND100901,Aadhunik Industries,adhunikindustries.com,True,1,2,1,,1,1,0,0
4,IND346129,IND10134,Naturs Basket,naturesbasket.co.in,True,1,2,1,,1,1,0,0
5,IND238891,IND10134,My Natures Basket,supermunchies.com,True,1,2,1,,1,1,0,0
6,IND78,IND101482,Skf India Ltd.,skf.com,True,1,2,1,,1,1,0,0
7,IND117759,IND101482,Skff (India) Private Ltd.,skff.in,True,1,2,1,,1,1,0,0
8,IND283742,IND101718,L And T Metro Rail Hyderabad Ltd.,ltmetro.com,True,1,2,1,,1,1,0,0
9,IND22370,IND101718,L&T Metro Rail (Hyderabad) Ltd.,ltmetro.in,True,1,2,1,,1,1,0,0


In [3]:
# Empty dictionary to store results 
scores_builtwith = {}
# Array with new IDs
new_ids = mult_builtwith_ind["new_id"].unique()
for new_id in new_ids: 
    df_filtered = mult_builtwith_ind[mult_builtwith_ind["new_id"] == new_id]
    scores_builtwith[new_id] = {}
    best_websites = []
    for old_id, raw_name in zip(df_filtered.old_id.to_numpy(), df_filtered.panjiva_raw_firm_name.to_numpy()):
        scores_builtwith[new_id][old_id] = {}
        scores_builtwith[new_id][old_id][raw_name] = {}  
        
        max_avg_score = 0
        best_website = None
        for website in df_filtered.builtwith_website.to_numpy():
            
            score_ratio = fuzz.ratio(raw_name, website)
            score_partial_ratio = fuzz.partial_ratio(raw_name, website)
            score_token_sort_ratio = fuzz.token_sort_ratio(raw_name, website)
            avg_score = (score_ratio + score_partial_ratio + score_token_sort_ratio) / 3
            sum_scores = (score_ratio + score_partial_ratio + score_token_sort_ratio)
            
            scores_builtwith[new_id][old_id][raw_name][website] = {
                "score_ratio": score_ratio,
                "score_partial_ratio": score_partial_ratio,
                "score_token_sort_ratio": score_token_sort_ratio,
                "avg_score": avg_score, 
                "sum_scores": sum_scores
            } 
            
            # Check if this website has the highest average score so far
            if avg_score > max_avg_score:
                max_avg_score = avg_score
                best_website = website
                
        # Store the best website and its score for the current old_id
        scores_builtwith[new_id][old_id]['best_website'] = {
            "website": best_website,
            "avg_score": max_avg_score
        }
        
        # Add the best website to the list 
        best_websites.append(best_website)
        
    if len(set(best_websites)) == 1:
        scores_builtwith[new_id]["share_best_website"] = 1     
    else:
        scores_builtwith[new_id]["share_best_website"] = 0     

In [4]:
print(json.dumps(scores_builtwith, indent=4, sort_keys=True))

{
    "IND100726": {
        "IND121408": {
            "Universal Imp. Exp. And Hospitality Private Li": {
                "hotelairlink.com": {
                    "avg_score": 29.0,
                    "score_partial_ratio": 38,
                    "score_ratio": 19,
                    "score_token_sort_ratio": 30
                },
                "universalexporter.in": {
                    "avg_score": 46.0,
                    "score_partial_ratio": 59,
                    "score_ratio": 45,
                    "score_token_sort_ratio": 34
                }
            },
            "best_website": {
                "avg_score": 46.0,
                "website": "universalexporter.in"
            }
        },
        "IND201797": {
            "Universal Exporters": {
                "hotelairlink.com": {
                    "avg_score": 31.0,
                    "score_partial_ratio": 36,
                    "score_ratio": 23,
                    "score_token_sort_ratio": 34


In [14]:
# Create an empty list to store data for DataFrame
data_list = []

# Iterate through the dictionary to extract required information
for new_id, old_ids in scores_builtwith.items():
    for old_id, contents in old_ids.items():
        if old_id == "share_best_website":
            continue
        for panjiva_raw_name, websites in contents.items():
            if panjiva_raw_name == "best_website":
                continue
            for website, scores in websites.items():
                data_list.append({
                    "new_id": new_id,
                    "old_id": old_id,
                    "panjiva_raw_name": panjiva_raw_name,
                    "website": website,
                    "avg_score": scores["avg_score"],
                    "score_partial_ratio": scores["score_partial_ratio"],
                    "score_ratio": scores["score_ratio"],
                    "score_token_sort_ratio": scores["score_token_sort_ratio"],
                    "website_highest_score": contents["best_website"]["website"],
                    "share_best_website": old_ids["share_best_website"]
                })

# Create a DataFrame from the list of dictionaries
df_builtwith_multimodal = pd.DataFrame(data_list)
df_builtwith_multimodal

Unnamed: 0,new_id,old_id,panjiva_raw_name,website,avg_score,score_partial_ratio,score_ratio,score_token_sort_ratio,website_highest_score,share_best_website
0,IND100726,IND121408,Universal Imp. Exp. And Hospitality Private Li,hotelairlink.com,29.000000,38,19,30,universalexporter.in,1
1,IND100726,IND121408,Universal Imp. Exp. And Hospitality Private Li,universalexporter.in,46.000000,59,45,34,universalexporter.in,1
2,IND100726,IND201797,Universal Exporters,hotelairlink.com,31.000000,36,23,34,universalexporter.in,1
3,IND100726,IND201797,Universal Exporters,universalexporter.in,70.333333,83,77,51,universalexporter.in,1
4,IND100901,IND152668,Aadhunik Industrie,aadhunikindustries.com,82.000000,86,75,85,adhunikindustries.com,1
...,...,...,...,...,...,...,...,...,...,...
2879,IND98715,IND26731,R.G.Apparel,rgapparel.in,57.666667,60,52,61,rgapparel.in,1
2880,IND99430,IND152330,Aarchem Corporatio,aarchem.com,61.333333,67,48,69,aarchem.com,0
2881,IND99430,IND152330,Aarchem Corporatio,archem.co.in,61.333333,64,53,67,aarchem.com,0
2882,IND99430,IND52519,Aarchem Corporation,aarchem.com,60.333333,67,47,67,archem.co.in,0


In [23]:
# Share of new ids sharing the best website
df_builtwith_multimodal.drop_duplicates("new_id").share_best_website.value_counts()/df_builtwith_multimodal.drop_duplicates("new_id").share_best_website.value_counts().sum()

1    0.758089
0    0.241911
Name: share_best_website, dtype: float64

In [33]:
67 + 48 + 69

184