<a href="https://colab.research.google.com/github/searchsolved/DeepImageSearch/blob/main/bert-semantic-interlinker/python-source-pairwise-matching/BERT_Semantic_Interlinker_11th_December_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Interlinker - Pairwise Matching - 11th December 2023
### by @LeeFootSEO | https://leefoot.co.uk

## How to use:
1.  Upload a crawl file from Screaming Frog. The only two mandatory columns are Address and H1-1. (You can use other crawlers / URL lists, as long as they contain an H1 and the columns are named as above.)
2.  Remember to choose a GPU runtime (Runtime > Change Runtime Type) or be prepared for a LONG wait!
3. This script uses pair-wise matching: every row is compared against the entire dataset, so the workload grows quadratically with the size of the input file (doubling the number of rows roughly quadruples the work).
4. Reach out if you need this running as a managed service either via my website or hello@leefoot.co.uk

In [1]:
!pip install sentence_transformers
!pip install tqdm

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m61.4/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_trans

In [13]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from google.colab import files
import ipywidgets as widgets
from IPython.display import display
import warnings
from termcolor import colored

# Use the drop-down menu below to choose the desired Sentence Transformer.
The default is 'all-MiniLM-L6-v2' which is a good balance between speed and performance.

In [3]:
# (model name, average performance score, speed) for every selectable model.
# The first entry is what the dropdown shows by default.
_MODEL_STATS = [
    ('all-MiniLM-L6-v2', '58.80', '14200'),  # good balance of semantic score and speed
    ('all-mpnet-base-v2', '63.30', '2800'),
    ('multi-qa-mpnet-base-dot-v1', '62.18', '2800'),
    ('all-distilroberta-v1', '59.84', '4000'),
    ('all-MiniLM-L12-v2', '59.76', '7500'),
    ('multi-qa-distilbert-cos-v1', '59.41', '4000'),
    ('multi-qa-MiniLM-L6-cos-v1', '58.08', '14200'),
    ('paraphrase-multilingual-mpnet-base-v2', '53.75', '2500'),
    ('paraphrase-albert-small-v2', '52.25', '5000'),
    ('paraphrase-multilingual-MiniLM-L12-v2', '51.72', '7500'),
    ('paraphrase-MiniLM-L3-v2', '50.74', '19000'),
    ('distiluse-base-multilingual-cased-v1', '45.59', '4000'),
    ('distiluse-base-multilingual-cased-v2', '43.77', '4000'),
]

# Lookup table keyed by model name, in the same order as the list above.
models_with_scores_and_speed = {
    name: {'score': score, 'speed': speed}
    for name, score, speed in _MODEL_STATS
}

# Each dropdown option is a (label shown to the user, value returned by .value) pair;
# the value is the bare model name that is later handed to SentenceTransformer.
dropdown_options = [
    (f"{name} (Score: {stats['score']}, Speed: {stats['speed']})", name)
    for name, stats in models_with_scores_and_speed.items()
]

model_dropdown = widgets.Dropdown(options=dropdown_options, description='Model:')

display(model_dropdown)

Dropdown(description='Model:', options=(('all-MiniLM-L6-v2 (Score: 58.80, Speed: 14200)', 'all-MiniLM-L6-v2'),…

# Set the similarity score cutoff and the maximum number of suggestions per page using the slider below.

In [None]:
# Presentation settings shared by both sliders: let the description take the
# width it needs and make each slider span half of the output area.
shared_slider_options = {
    'style': {'description_width': 'initial'},
    'layout': {'width': '50%'},
}

# Minimum similarity, chosen as a percentage between 0 and 100 (default 80).
min_similarity_slider = widgets.FloatSlider(
    value=80,
    min=0,
    max=100,
    step=1,
    description='Min Similarity (%):',
    **shared_slider_options,
)

# Maximum number of suggestions per page, between 1 and 50 (default 10).
max_suggestions_slider = widgets.IntSlider(
    value=10,
    min=1,
    max=50,
    step=1,
    description='Max Suggestions/Page:',
    **shared_slider_options,
)

display(min_similarity_slider, max_suggestions_slider)

In [None]:
# Snapshot the current slider positions into constants. Moving a slider
# afterwards has no effect until this cell is run again.
MAX_SUGGESTIONS_PER_PAGE = max_suggestions_slider.value
# The slider works in percent; the similarity scores it is compared with are fractions.
MIN_SIMILARITY = min_similarity_slider.value / 100.0

In [14]:
# Prefer the GPU whenever PyTorch can see a CUDA device; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
    # Announce the fallback twice: once as red text in the cell output and
    # once through the standard warnings machinery.
    warning_message = "Warning: CUDA is not available. The script will run on the CPU, which may be much slower."
    print(colored(warning_message, 'red'))
    warnings.warn(warning_message)

print(f"Using device: {DEVICE}")

Using device: cuda


In [5]:
# Ask the user for the crawl export. The result is iterated below to obtain
# the uploaded file name (presumably a mapping of file name -> contents; see
# the google.colab `files.upload` documentation).
uploaded = files.upload()

# A cancelled upload leaves nothing to iterate; fail with a clear message
# instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose a crawl CSV file.")

# Only the first uploaded file is used.
input_filename = next(iter(uploaded))

Saving live_demo.csv to live_demo.csv


# Function Definitions

In [12]:
def read_and_clean_data(filepath):
    """
    Reads and cleans a DataFrame from a specified CSV file.

    This function reads a CSV file into a DataFrame, checks that the mandatory
    'Address' and 'H1-1' columns are present, retains rows where the 'H1-1'
    column is not NaN, and removes rows where the 'H1-1' column starts with "All ".

    The 'H1-1' column is always read as text, so headings that look like numbers
    (e.g. "2023") are kept as strings instead of breaking the string filter.

    Args:
    filepath (str): The file path of the CSV file to be read.

    Returns:
    DataFrame: A cleaned DataFrame.

    Raises:
    ValueError: If the file does not contain the 'Address' or 'H1-1' column.
    """
    required_columns = ("Address", "H1-1")

    # Force H1-1 to text: with type inference, a purely numeric (or entirely
    # empty) H1 column is not read as strings and the `.str` accessor below
    # would raise AttributeError.
    df = pd.read_csv(filepath, encoding="utf-8", dtype={"H1-1": str})

    # Fail early with an actionable message rather than an opaque KeyError later on.
    missing = [column for column in required_columns if column not in df.columns]
    if missing:
        raise ValueError(
            f"Input file is missing required column(s): {', '.join(missing)}. "
            f"Found columns: {', '.join(map(str, df.columns))}"
        )

    df = df[df["H1-1"].notna()]
    # Drop headings that begin with the word "All" followed by a space.
    df = df[~df["H1-1"].str.contains("^All ", na=False, regex=True)]
    return df


def precompute_embeddings(df, embedding_model=None):
    """
    Computes embeddings for text data in a DataFrame using a transformer model.

    This function encodes the 'H1-1' column of a DataFrame. When no model is
    supplied, one is created from the global model_dropdown selection on the
    global DEVICE, exactly as before. Passing an already loaded model avoids
    loading the same weights a second time.

    Args:
    df (DataFrame): The DataFrame containing the text data in its 'H1-1' column.
    embedding_model (optional): An object exposing encode(list_of_texts), such as a
        SentenceTransformer. Defaults to None, meaning a new model is created.

    Returns:
    Tuple: A tuple containing the computed embeddings (whatever encode returns)
    and the list of original texts, in the same order.
    """
    if embedding_model is None:
        # Backward-compatible default: build the model chosen in the dropdown.
        embedding_model = SentenceTransformer(model_dropdown.value, device=DEVICE)
    to_list = list(df['H1-1'])
    # One batched call for the whole column.
    to_embeddings = embedding_model.encode(to_list)
    return to_embeddings, to_list


def find_matches(from_list, to_list, to_embeddings, embedding_model, skip_identical=True):
    """
    Finds matches for each item in the from_list against the to_list based on cosine similarity.

    All items of from_list are encoded in a single batched call, then each one is
    compared with every item of to_list. Candidates whose similarity is at least
    the global MIN_SIMILARITY are ranked from most to least similar and the top
    MAX_SUGGESTIONS_PER_PAGE (global) are kept.

    Args:
    from_list (list): A list of strings to find matches for.
    to_list (list): A list of strings to match against.
    to_embeddings (ndarray): The precomputed embeddings for the to_list.
    embedding_model (SentenceTransformer): The transformer model used for generating embeddings.
    skip_identical (bool): When True (default), candidates whose text is identical to the
        item being matched are ignored before the per-item cap is applied. Matching a list
        against itself otherwise spends one of the MAX_SUGGESTIONS_PER_PAGE slots on the
        item itself, which is discarded later as a self-link. Pass False to keep
        identical-text matches.

    Returns:
    DataFrame: A DataFrame with columns 'From', 'To', and 'Similarity' for each match found.
    The columns are present even when no match is found.
    """
    def _empty_result():
        # Keep the schema stable so downstream merges do not fail on a column-less frame.
        return pd.DataFrame({
            'From': pd.Series(dtype='object'),
            'To': pd.Series(dtype='object'),
            'Similarity': pd.Series(dtype='float64'),
        })

    if len(from_list) == 0:
        return _empty_result()

    # Encode everything once: a single batched call replaces one model call per item.
    # Assumes encode returns a 2-D array with one row per input text (the library default).
    from_embeddings = embedding_model.encode(list(from_list))

    from_column, to_column, similarity_chunks = [], [], []
    with tqdm(total=len(from_list), desc="Finding Matches") as pbar:
        for row, kw in enumerate(from_list):
            # Slice with row:row + 1 to keep the 2-D shape cosine_similarity expects.
            similarities = cosine_similarity(from_embeddings[row:row + 1], to_embeddings)[0]
            candidates = np.where(similarities >= MIN_SIMILARITY)[0]
            # Highest similarity first.
            ranked = candidates[similarities[candidates].argsort()[::-1]]
            if skip_identical:
                ranked = [j for j in ranked if to_list[j] != kw]
            top = list(ranked[:MAX_SUGGESTIONS_PER_PAGE])
            if top:
                from_column.extend([kw] * len(top))
                to_column.extend(to_list[j] for j in top)
                similarity_chunks.append(similarities[top])
            pbar.update(1)

    if not from_column:
        return _empty_result()

    # Build the frame once instead of concatenating one small frame per item.
    return pd.DataFrame({
        'From': from_column,
        'To': to_column,
        'Similarity': np.concatenate(similarity_chunks),
    })


def merge_url_data(df_final, df_h1_urls):
    """
    Merges URL data into the final DataFrame.

    This function adds 'Source URL' and 'Destination URL' columns to the final DataFrame
    by left-merging with the df_h1_urls DataFrame twice: once on 'From' and once on 'To',
    each time against the 'H1-1' column. Because the join key is the heading text, a
    heading shared by several addresses produces one output row per address.

    Neither input DataFrame is modified.

    Args:
    df_final (DataFrame): The DataFrame containing the matching results
        ('From', 'To' and 'Similarity' columns). May be empty.
    df_h1_urls (DataFrame): The DataFrame containing the URL data
        ('Address' and 'H1-1' columns).

    Returns:
    DataFrame: The merged DataFrame with added URL columns. When df_final is empty,
    an empty DataFrame with the columns 'From', 'To', 'Similarity', 'Source URL'
    and 'Destination URL' is returned.
    """
    if df_final.empty:
        # No matches: an empty frame may not even have a 'From' column, which would
        # make the merge raise KeyError. Return the expected schema instead.
        return pd.DataFrame({
            "From": pd.Series(dtype="object"),
            "To": pd.Series(dtype="object"),
            "Similarity": pd.Series(dtype="float64"),
            "Source URL": pd.Series(dtype="object"),
            "Destination URL": pd.Series(dtype="object"),
        })

    # Rename 'Address' up front so each merge lands under its final column name.
    source_lookup = df_h1_urls.rename(columns={"Address": "Source URL"})
    destination_lookup = df_h1_urls.rename(columns={"Address": "Destination URL"})

    merged = (
        df_final
        .merge(source_lookup, left_on="From", right_on="H1-1", how="left")
        .drop(columns="H1-1")  # the join key duplicates 'From'
        .merge(destination_lookup, left_on="To", right_on="H1-1", how="left")
        .drop(columns="H1-1")  # the join key duplicates 'To'
    )
    return merged


def process_final_df(df_final, min_similarity=None, max_suggestions=None):
    """
    Processes the final DataFrame to format and filter the data.

    Steps, in order:
      1. Drop repeated (Source URL, Destination URL) pairs, keeping the first.
      2. Rename 'From'/'To' to 'Source H1'/'Destination H1' and keep only the output columns.
      3. Drop rows whose source and destination headings are identical (self-links).
      4. Drop rows whose similarity is below the threshold (the threshold itself is kept,
         matching the >= comparison used when matches are found).
      5. Sort by heading, then by descending similarity, and keep the top rows per heading.
      6. Round the similarity to two decimals.

    Self-links are removed before the per-heading cap is applied, so they no longer
    consume one of the suggestion slots. The input DataFrame is not modified.

    Args:
    df_final (DataFrame): The DataFrame to be processed. Must contain the columns
        'From', 'To', 'Similarity', 'Source URL' and 'Destination URL'.
    min_similarity (float, optional): Minimum similarity to keep. Defaults to the
        global MIN_SIMILARITY.
    max_suggestions (int, optional): Maximum rows kept per source heading. Defaults
        to the global MAX_SUGGESTIONS_PER_PAGE.

    Returns:
    DataFrame: The processed DataFrame.
    """
    if min_similarity is None:
        min_similarity = MIN_SIMILARITY
    if max_suggestions is None:
        max_suggestions = MAX_SUGGESTIONS_PER_PAGE

    output_columns = ["Source H1", "Destination H1", "Similarity", "Source URL", "Destination URL"]

    # Non-mutating chain: the caller's frame is left exactly as it was passed in.
    result = (
        df_final
        .drop_duplicates(subset=["Source URL", "Destination URL"], keep="first")
        .rename(columns={"From": "Source H1", "To": "Destination H1"})
        .loc[:, output_columns]
    )

    # Filter first, cap second: a self-link must not use up a suggestion slot.
    result = result[result["Source H1"] != result["Destination H1"]]
    result = result[result["Similarity"] >= min_similarity]

    result = (
        result
        .sort_values(["Source H1", "Similarity"], ascending=[True, False])
        .groupby("Source H1")
        .head(max_suggestions)
        .copy()  # own the data before assigning to a column below
    )
    result["Similarity"] = result["Similarity"].round(2)
    return result

# Find Matches
## Remember to enable GPU processing or this will take an eternity to complete!

In [None]:
# Load the uploaded crawl file and apply the H1 clean-up rules.
df = read_and_clean_data(input_filename)

# Address/H1 lookup table, used afterwards to attach URLs to every match.
df_h1_urls = df.loc[:, ['Address', 'H1-1']]

# The headings are matched against themselves: this is the "from" side...
from_list = df['H1-1'].tolist()

# ...and this is the "to" side, embedded up front.
to_embeddings, to_list = precompute_embeddings(df)

# Model that find_matches uses to embed the "from" headings.
embedding_model = SentenceTransformer(model_dropdown.value, device=DEVICE)

# Pairwise comparison of every "from" heading with all "to" headings.
df_matches = find_matches(
    from_list,
    to_list,
    to_embeddings,
    embedding_model,
)

In [11]:
# Name of the CSV written to the runtime's working directory and offered for download.
output_filename = 'bert_clustered_results.csv'

# Attach source/destination URLs to every match, then filter and format the result.
df_final = merge_url_data(df_matches, df_h1_urls)
df_final_processed = process_final_df(df_final)

# Write without the index column, then hand the file to the browser for download.
df_final_processed.to_csv(output_filename, index=False)
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>