<a href="https://colab.research.google.com/github/simodepth96/Backlink-Analysis/blob/main/Backlink_Quality_Assessment_using_Cosine_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers
!pip install -U transformers



In [None]:
#@title JINA but it's too long

import pandas as pd
import re
import numpy as np
from transformers import AutoTokenizer, AutoModel  # Fixed typo: "transdformers" -> "transformers"
import torch
from scipy.spatial.distance import cosine

# Read the backlink export that was uploaded to the Colab session
file_path = "/content/backlink profile twinset.xlsx"
df = pd.read_excel(file_path)

# Both URL columns are required below; abort on the first one that is absent
url_columns = ['Referring page URL', 'Target URL']
for col in url_columns:
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column: {col}")

# Load the Jina embedding model, preferring v2 and falling back to v4.
# NOTE(review): trust_remote_code=True runs model code downloaded from the
# Hugging Face Hub; consider pinning a revision so that code cannot change
# silently between runs.
try:
    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)
    print("Using Jina v2 model for better compatibility")
except Exception as exc:
    # Catch Exception rather than using a bare except, so interrupting the
    # download (KeyboardInterrupt) stops the cell instead of starting a second
    # download, and report why v2 could not be used.
    print(f"Could not load Jina v2 ({exc!r}); falling back to v4")
    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v4")
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)
    print("Using Jina v4 model")

# Function to tokenize URLs semantically
def tokenize_url(url):
    """Split a URL into lowercase word-like tokens.

    The ``http://`` / ``https://`` scheme is removed, then the remainder is
    split on runs of the separators ``/ . - ? = _ &``. Empty pieces (for
    example from a trailing slash) are dropped.

    Args:
        url: The URL to tokenize. Any value that is not a ``str`` (``None``,
            NaN from an empty spreadsheet cell, numbers, list-likes) is
            treated as "no URL".

    Returns:
        A list of lowercase tokens; ``[]`` when ``url`` is not a string.
    """
    # Check the type first: NaN and None are not str, so this one test covers
    # missing values, and it avoids calling pd.isna() on a list-like, which
    # returns an array whose truth value is ambiguous.
    if not isinstance(url, str):
        return []
    without_scheme = re.sub(r"https?://", "", url)
    pieces = re.split(r"[\/\.\-\?\=\_\&]+", without_scheme)
    return [piece.lower() for piece in pieces if piece]

# Function to compute average embedding from token list
def get_average_embedding(tokens):
    """Embed a list of URL tokens as a single vector.

    The tokens are joined with spaces and passed through the module-level
    ``tokenizer`` and ``model`` as one text. If the model returns one vector
    per sequence position, those vectors are averaged.

    Args:
        tokens: List of string tokens; may be empty.

    Returns:
        numpy.ndarray: The embedding, or an all-zero vector of length 768
        when ``tokens`` is empty.
    """
    if not tokens:
        # All-zero vector marks "nothing to embed".
        # NOTE(review): 768 is presumably the v2 model's hidden size; the v4
        # fallback may use a different width - confirm.
        return np.zeros(768)

    # Join tokens into a single string for embedding
    text = " ".join(tokens)

    with torch.no_grad():
        # Tokenize the text
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)

        # Forward pass - handle both v2 and v4 models
        try:
            # Try v4 approach first
            outputs = model(**inputs, task_label="text-matching")
        except Exception:
            # Fallback to v2 approach. Exception (not a bare except) so that
            # interrupting a long run is not swallowed and retried.
            outputs = model(**inputs)

        # Get embeddings - try different possible attributes
        if hasattr(outputs, 'embeddings'):
            embeddings = outputs.embeddings
        elif hasattr(outputs, 'last_hidden_state'):
            embeddings = outputs.last_hidden_state
        elif hasattr(outputs, 'pooler_output'):
            embeddings = outputs.pooler_output
        else:
            # If none of the above, try to get the first element if it's a tuple/list
            embeddings = outputs[0] if isinstance(outputs, (tuple, list)) else outputs

        # Mean pooling if we have sequence embeddings
        if len(embeddings.shape) == 3:  # [batch_size, seq_len, hidden_size]
            embeddings = embeddings.mean(dim=1)

        # Convert to numpy and drop the size-1 dimensions
        return embeddings.squeeze().numpy()

# Function to compute cosine similarity between two URLs
def compute_token_based_similarity(ref_url, tgt_url):
    """Return the cosine similarity between the embeddings of two URLs.

    Each URL is tokenized with ``tokenize_url`` and embedded with
    ``get_average_embedding``. If either embedding is all zeros the result
    is ``np.nan``; otherwise it is ``1 - cosine distance``.
    """
    ref_tokens, tgt_tokens = tokenize_url(ref_url), tokenize_url(tgt_url)
    ref_vec = get_average_embedding(ref_tokens)
    tgt_vec = get_average_embedding(tgt_tokens)

    # A zero vector means there was nothing to embed, so no score is defined
    for vec in (ref_vec, tgt_vec):
        if np.all(vec == 0):
            return np.nan

    distance = cosine(ref_vec, tgt_vec)
    return 1 - distance

# Score every backlink row and keep three decimals for readability
print("Computing cosine similarities...")


def _score_row(row):
    """Return the similarity between the row's referring and target URL."""
    return compute_token_based_similarity(row['Referring page URL'], row['Target URL'])


raw_scores = df.apply(_score_row, axis=1)
df['Cosine Similarity'] = raw_scores.round(3)

# Preview the first rows
preview_columns = ['Referring page URL', 'Target URL', 'Cosine Similarity']
print("\nSample results:")
print(df[preview_columns].head())

# Write the scored table to a new workbook
output_path = "/content/backlink_profile_with_similarity.xlsx"
df.to_excel(output_path, index=False)
print(f"\nResults saved to: {output_path}")

# Summary statistics (NaN scores are skipped by mean/min/max)
scores = df['Cosine Similarity']
print("\nStatistics:")
print(f"Total rows processed: {len(df)}")
print(f"Valid similarities: {scores.notna().sum()}")
print(f"Average similarity: {scores.mean():.3f}")
print(f"Min similarity: {scores.min():.3f}")
print(f"Max similarity: {scores.max():.3f}")

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

Using Jina v2 model for better compatibility
Computing cosine similarities...

Sample results:
                                  Referring page URL  \
0                                https://twinset.ru/   
1                            https://metapic.com/it/   
2                        https://azora.store/brands/   
3  https://www.pleasedontbuy.com/it-it/diciottesimo/   
4      https://www.modiseh.com/blog/new-short-manto/   

                                          Target URL  Cosine Similarity  
0                     https://www.twinset.com/en-pt/              0.827  
1                           https://www.twinset.com/              0.759  
2                           https://www.twinset.com/              0.674  
3                     https://www.twinset.com/it-it/              0.801  
4  https://www.twinset.com/en-lu/search?cgid=abbi...              0.749  

Results saved to: /content/backlink_profile_with_similarity.xlsx

Statistics:
Total rows processed: 2783
Valid similarities

In [None]:
#@title Option 2- all-MiniLM-L6-v2
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Read the backlink export that was uploaded to the Colab session
file_path = "/content/backlink profile twinset.xlsx"
df = pd.read_excel(file_path)

# Fail fast when one of the two URL columns is not in the sheet
url_columns = ['Referring page URL', 'Target URL']
for col in url_columns:
    column_present = col in df.columns
    if not column_present:
        raise ValueError(f"Missing required column: {col}")

# Sentence-Transformer used for this option; the author's pick as the most
# convenient lightweight model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to tokenize URLs semantically
def tokenize_url(url):
    """Split a URL into lowercase word-like tokens.

    The ``http://`` / ``https://`` scheme is removed, then the remainder is
    split on runs of the separators ``/ . - ? = _ &``. Empty pieces (for
    example from a trailing slash) are dropped.

    Args:
        url: The URL to tokenize. Any value that is not a ``str`` (``None``,
            NaN from an empty spreadsheet cell, numbers, list-likes) is
            treated as "no URL".

    Returns:
        A list of lowercase tokens; ``[]`` when ``url`` is not a string.
    """
    # Check the type first: NaN and None are not str, so this one test covers
    # missing values, and it avoids calling pd.isna() on a list-like, which
    # returns an array whose truth value is ambiguous.
    if not isinstance(url, str):
        return []
    # Remove protocol
    without_scheme = re.sub(r"https?://", "", url)
    # Split by separators
    pieces = re.split(r"[\/\.\-\?\=\_\&]+", without_scheme)
    return [piece.lower() for piece in pieces if piece]

# Function to get average embedding for a list of tokens
def get_average_embedding(tokens):
    """Encode each token with the module-level ``model`` and average the results.

    Returns a zero vector of the model's sentence-embedding dimension when
    ``tokens`` is empty.
    """
    if tokens:
        # Average along the first axis of whatever encode() returns
        # (presumably one embedding row per token)
        return np.mean(model.encode(tokens), axis=0)
    return np.zeros(model.get_sentence_embedding_dimension())

# Function to compute semantic cosine similarity between two URLs
def compute_token_based_similarity(ref_url, tgt_url):
    """Return the cosine similarity between the averaged token embeddings of two URLs.

    Yields ``np.nan`` when either URL produces an all-zero embedding.
    """
    ref_tokens, tgt_tokens = tokenize_url(ref_url), tokenize_url(tgt_url)
    ref_vec = get_average_embedding(ref_tokens)
    tgt_vec = get_average_embedding(tgt_tokens)

    # A zero vector means there was nothing to embed, so no score is defined
    for vec in (ref_vec, tgt_vec):
        if np.all(vec == 0):
            return np.nan

    return 1 - cosine(ref_vec, tgt_vec)

# Score every row, then keep three decimals for readability
def _score_row(row):
    """Return the similarity between the row's referring and target URL."""
    return compute_token_based_similarity(row['Referring page URL'], row['Target URL'])


raw_scores = df.apply(_score_row, axis=1)
df['Cosine Similarity'] = raw_scores.round(3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Normalise the two numeric columns used in the report.
# astype(int) already yields whole numbers, so the former .round(1) was a no-op.
df['Domain rating'] = df['Domain rating'].astype(int)
df['Cosine Similarity'] = df['Cosine Similarity'].astype(float).round(2)

# Show sample. A bare expression is only rendered when it is the last
# statement of a notebook cell, so print it explicitly.
print(df[['Referring page URL', 'Target URL', 'Domain rating', 'Cosine Similarity']].head())

# Save result
df.to_excel("semantic_url_similarity.xlsx", index=False)

In [None]:
df['Domain rating'] = df['Domain rating'].astype(int)  # Already in 0–100 scale

# Scale cosine similarity to 0–100 and convert to integer.
# NOTE(review): this overwrites the column in place, so running the cell a
# second time multiplies by 100 again.
scaled_similarity = (df['Cosine Similarity'] * 100).round()
if scaled_similarity.isna().any():
    # Rows without a usable URL carry NaN, which a plain int column cannot
    # hold; the nullable Int64 dtype keeps them as <NA> instead of crashing.
    df['Cosine Similarity'] = scaled_similarity.astype('Int64')
else:
    df['Cosine Similarity'] = scaled_similarity.astype(int)
df.head()

Unnamed: 0,Referring page URL,Referring page HTTP code,Target URL,Anchor,Domain rating,Referring domains,Linked domains,External links,Cosine Similarity
0,https://twinset.ru/,200,https://www.twinset.com/en-pt/,Португалия,26,106,6,38,83
1,https://metapic.com/it/,200,https://www.twinset.com/,Twinset,44,5,61,87,76
2,https://azora.store/brands/,200,https://www.twinset.com/,TWIN-SET,13,1,193,333,67
3,https://www.pleasedontbuy.com/it-it/diciottesimo/,200,https://www.twinset.com/it-it/,twinset.com,26,0,5,6,80
4,https://www.modiseh.com/blog/new-short-manto/,200,https://www.twinset.com/en-lu/search?cgid=abbi...,twinset,55,0,3,4,75


In [None]:
!pip install plotly
import plotly.express as px

# One point per backlink: similarity on the x-axis, domain rating on the y-axis
fig = px.scatter(
    data_frame=df,
    x='Cosine Similarity',
    y='Domain rating',
    title='Domain Rating vs Cosine Similarity',
)
fig.show()



In [None]:
# prompt: create a bar chart with the top 10 Referring Page URLs by Cosine Similarity score (ascending=False) from df using plotly

# Average the similarity per referring page so that each URL appears once
scores_by_page = df.groupby('Referring page URL')['Cosine Similarity']
df_agg = scores_by_page.mean().reset_index()

# Rank pages from most to least similar and keep the ten best
ranked_pages = df_agg.sort_values(by='Cosine Similarity', ascending=False)
top_10_referring_pages = ranked_pages.head(10)

# Bar chart: similarity on the x-axis, one bar per referring page
fig = px.bar(
    data_frame=top_10_referring_pages,
    x='Cosine Similarity',
    y='Referring page URL',
    title='Top 10 Backlinks by Cosine Similarity',
)

# Tilt the x-axis tick labels and cap the number of ticks
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis={'tickmode': 'auto', 'nticks': 10},
)

fig.show()

In [None]:
#@title Option 2 - all-mpnet-base-v2 (most accurate but a tad slow)
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Read the backlink export that was uploaded to the Colab session
file_path = "/content/backlink profile twinset.xlsx"
df = pd.read_excel(file_path)

# Both URL columns are required below; abort on the first one that is absent
url_columns = ['Referring page URL', 'Target URL']
for col in url_columns:
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column: {col}")

# Sentence-Transformer used for this option; the author expects it to be the
# most accurate one, at the cost of some speed
model = SentenceTransformer('all-mpnet-base-v2')

# Function to tokenize URLs semantically
def tokenize_url(url):
    """Split a URL into lowercase word-like tokens.

    The ``http://`` / ``https://`` scheme is removed, then the remainder is
    split on runs of the separators ``/ . - ? = _ &``. Empty pieces (for
    example from a trailing slash) are dropped.

    Args:
        url: The URL to tokenize. Any value that is not a ``str`` (``None``,
            NaN from an empty spreadsheet cell, numbers, list-likes) is
            treated as "no URL".

    Returns:
        A list of lowercase tokens; ``[]`` when ``url`` is not a string.
    """
    # Check the type first: NaN and None are not str, so this one test covers
    # missing values, and it avoids calling pd.isna() on a list-like, which
    # returns an array whose truth value is ambiguous.
    if not isinstance(url, str):
        return []
    # Remove protocol
    without_scheme = re.sub(r"https?://", "", url)
    # Split by separators
    pieces = re.split(r"[\/\.\-\?\=\_\&]+", without_scheme)
    return [piece.lower() for piece in pieces if piece]

# Function to get average embedding for a list of tokens
def get_average_embedding(tokens):
    """Average the embeddings that the module-level ``model`` gives for ``tokens``.

    An empty token list yields a zero vector of the model's sentence-embedding
    dimension.
    """
    if len(tokens) == 0:
        embedding_size = model.get_sentence_embedding_dimension()
        return np.zeros(embedding_size)
    # Average along the first axis of whatever encode() returns
    # (presumably one embedding row per token)
    token_embeddings = model.encode(tokens)
    return np.mean(token_embeddings, axis=0)

# Function to compute semantic cosine similarity between two URLs
def compute_token_based_similarity(ref_url, tgt_url):
    """Return ``1 - cosine distance`` between the two URLs' averaged embeddings.

    ``np.nan`` is returned when either embedding is entirely zero.
    """
    ref_tokens = tokenize_url(ref_url)
    tgt_tokens = tokenize_url(tgt_url)
    ref_vec, tgt_vec = get_average_embedding(ref_tokens), get_average_embedding(tgt_tokens)

    ref_is_zero = np.all(ref_vec == 0)
    if ref_is_zero or np.all(tgt_vec == 0):
        return np.nan

    distance = cosine(ref_vec, tgt_vec)
    return 1 - distance

# Score every row, then keep three decimals for readability
def _score_row(row):
    """Return the similarity between the row's referring and target URL."""
    return compute_token_based_similarity(row['Referring page URL'], row['Target URL'])


raw_scores = df.apply(_score_row, axis=1)
df['URL Cosine Similarity'] = raw_scores.round(3)

# Highest-scoring backlinks first
df_sorted = df.sort_values(by='URL Cosine Similarity', ascending=False)

# Write the ranked table to disk
df_sorted.to_excel("semantic_url_similarity.xlsx", index=False)

# Last expression of the cell, so the notebook renders the top rows
sample_columns = ['Referring page URL', 'Target URL', 'URL Cosine Similarity']
df_sorted[sample_columns].head()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
#@title Option 3 - paraphrase-mpnet-base-v2 (Very Slow and resource-intensive)
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Read the backlink export that was uploaded to the Colab session
file_path = "/content/seodepths blk.xlsx"
df = pd.read_excel(file_path)

# Fail fast when one of the two URL columns is not in the sheet
url_columns = ['Referring page URL', 'Target URL']
for col in url_columns:
    column_present = col in df.columns
    if not column_present:
        raise ValueError(f"Missing required column: {col}")

# Sentence-Transformer used for this option; per the cell title it is very
# slow and resource-intensive
model = SentenceTransformer('paraphrase-mpnet-base-v2')

# Function to tokenize URLs semantically
def tokenize_url(url):
    """Split a URL into lowercase word-like tokens.

    The ``http://`` / ``https://`` scheme is removed, then the remainder is
    split on runs of the separators ``/ . - ? = _ &``. Empty pieces (for
    example from a trailing slash) are dropped.

    Args:
        url: The URL to tokenize. Any value that is not a ``str`` (``None``,
            NaN from an empty spreadsheet cell, numbers, list-likes) is
            treated as "no URL".

    Returns:
        A list of lowercase tokens; ``[]`` when ``url`` is not a string.
    """
    # Check the type first: NaN and None are not str, so this one test covers
    # missing values, and it avoids calling pd.isna() on a list-like, which
    # returns an array whose truth value is ambiguous.
    if not isinstance(url, str):
        return []
    # Remove protocol
    without_scheme = re.sub(r"https?://", "", url)
    # Split by separators
    pieces = re.split(r"[\/\.\-\?\=\_\&]+", without_scheme)
    return [piece.lower() for piece in pieces if piece]

# Function to get average embedding for a list of tokens
def get_average_embedding(tokens):
    """Encode each token with the module-level ``model`` and average the results.

    Returns a zero vector of the model's sentence-embedding dimension when
    ``tokens`` is empty.
    """
    if tokens:
        # Average along the first axis of whatever encode() returns
        # (presumably one embedding row per token)
        return np.mean(model.encode(tokens), axis=0)
    return np.zeros(model.get_sentence_embedding_dimension())

# Function to compute semantic cosine similarity between two URLs
def compute_token_based_similarity(ref_url, tgt_url):
    """Return the cosine similarity between the averaged token embeddings of two URLs.

    Yields ``np.nan`` when either URL produces an all-zero embedding.
    """
    ref_tokens, tgt_tokens = tokenize_url(ref_url), tokenize_url(tgt_url)
    ref_vec = get_average_embedding(ref_tokens)
    tgt_vec = get_average_embedding(tgt_tokens)

    # A zero vector means there was nothing to embed, so no score is defined
    for vec in (ref_vec, tgt_vec):
        if np.all(vec == 0):
            return np.nan

    return 1 - cosine(ref_vec, tgt_vec)

# Score every row, then keep three decimals for readability
def _score_row(row):
    """Return the similarity between the row's referring and target URL."""
    return compute_token_based_similarity(row['Referring page URL'], row['Target URL'])


raw_scores = df.apply(_score_row, axis=1)
df['URL Cosine Similarity'] = raw_scores.round(3)

# Highest-scoring backlinks first
df_sorted = df.sort_values(by='URL Cosine Similarity', ascending=False)

# Write the ranked table to disk
df_sorted.to_excel("semantic_url_similarity.xlsx", index=False)

# Last expression of the cell, so the notebook renders the top rows
sample_columns = ['Referring page URL', 'Target URL', 'URL Cosine Similarity']
df_sorted[sample_columns].head()


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,Referring page URL,Target URL,URL Cosine Similarity
53,https://www.modernostrategies.com/resources/te...,https://seodepths.com/seo-research/web-fonts-i...,0.835
33,https://askai.glarity.app/search/How-do-the-us...,https://seodepths.com/seo-research/search-engi...,0.831
21,https://news.seofomo.co/story-category/technic...,https://seodepths.com/seo-research/the-role-of...,0.821
30,https://www.jcchouinard.com/projects-and-resou...,https://seodepths.com/python-for-seo/canonical...,0.814
1,https://seotistics.kit.com/posts/my-seo-stack-...,https://seodepths.com/seo-research/robots-bloc...,0.807


In [None]:
#@title Option 5- distiluse-base-multilingual-cased-v2 (Lightweight multilingual universal encoder)
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Read the backlink export that was uploaded to the Colab session
file_path = "/content/seodepths blk.xlsx"
df = pd.read_excel(file_path)

# Both URL columns are required below; abort on the first one that is absent
url_columns = ['Referring page URL', 'Target URL']
for col in url_columns:
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column: {col}")

# Sentence-Transformer used for this option: a lightweight multilingual
# universal encoder
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Function to tokenize URLs semantically
def tokenize_url(url):
    """Split a URL into lowercase word-like tokens.

    The ``http://`` / ``https://`` scheme is removed, then the remainder is
    split on runs of the separators ``/ . - ? = _ &``. Empty pieces (for
    example from a trailing slash) are dropped.

    Args:
        url: The URL to tokenize. Any value that is not a ``str`` (``None``,
            NaN from an empty spreadsheet cell, numbers, list-likes) is
            treated as "no URL".

    Returns:
        A list of lowercase tokens; ``[]`` when ``url`` is not a string.
    """
    # Check the type first: NaN and None are not str, so this one test covers
    # missing values, and it avoids calling pd.isna() on a list-like, which
    # returns an array whose truth value is ambiguous.
    if not isinstance(url, str):
        return []
    # Remove protocol
    without_scheme = re.sub(r"https?://", "", url)
    # Split by separators
    pieces = re.split(r"[\/\.\-\?\=\_\&]+", without_scheme)
    return [piece.lower() for piece in pieces if piece]

# Function to get average embedding for a list of tokens
def get_average_embedding(tokens):
    """Average the embeddings that the module-level ``model`` gives for ``tokens``.

    An empty token list yields a zero vector of the model's sentence-embedding
    dimension.
    """
    if len(tokens) == 0:
        embedding_size = model.get_sentence_embedding_dimension()
        return np.zeros(embedding_size)
    # Average along the first axis of whatever encode() returns
    # (presumably one embedding row per token)
    token_embeddings = model.encode(tokens)
    return np.mean(token_embeddings, axis=0)

# Function to compute semantic cosine similarity between two URLs
def compute_token_based_similarity(ref_url, tgt_url):
    """Return ``1 - cosine distance`` between the two URLs' averaged embeddings.

    ``np.nan`` is returned when either embedding is entirely zero.
    """
    ref_tokens = tokenize_url(ref_url)
    tgt_tokens = tokenize_url(tgt_url)
    ref_vec, tgt_vec = get_average_embedding(ref_tokens), get_average_embedding(tgt_tokens)

    ref_is_zero = np.all(ref_vec == 0)
    if ref_is_zero or np.all(tgt_vec == 0):
        return np.nan

    distance = cosine(ref_vec, tgt_vec)
    return 1 - distance

# Score every row, then keep three decimals for readability
def _score_row(row):
    """Return the similarity between the row's referring and target URL."""
    return compute_token_based_similarity(row['Referring page URL'], row['Target URL'])


raw_scores = df.apply(_score_row, axis=1)
df['URL Cosine Similarity'] = raw_scores.round(3)

# Highest-scoring backlinks first
df_sorted = df.sort_values(by='URL Cosine Similarity', ascending=False)

# Write the ranked table to disk
df_sorted.to_excel("semantic_url_similarity.xlsx", index=False)

# Last expression of the cell, so the notebook renders the top rows
sample_columns = ['Referring page URL', 'Target URL', 'URL Cosine Similarity']
df_sorted[sample_columns].head()


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Unnamed: 0,Referring page URL,Target URL,URL Cosine Similarity
33,https://askai.glarity.app/search/How-do-the-us...,https://seodepths.com/seo-research/search-engi...,0.943
43,https://www.seo-praktik.si/tedensko-azuriran-s...,https://seodepths.com/seo-research/nlp-seo-gui...,0.915
30,https://www.jcchouinard.com/projects-and-resou...,https://seodepths.com/python-for-seo/canonical...,0.898
1,https://seotistics.kit.com/posts/my-seo-stack-...,https://seodepths.com/seo-research/robots-bloc...,0.881
65,https://tridentseo.biz/pagination-best-seo-pra...,https://seodepths.com/seo-research/seo-paginat...,0.875


In [None]:
!pip install gensim



In [None]:
#@title Option 6 - Word2vec to create embeddings and cosine similarity (Token and n-gram-based approach and therefore it's lightweight and fast but not very accurate)

import pandas as pd
import re
from gensim.models import Word2Vec
from numpy import mean
from numpy.linalg import norm
import numpy as np

# Backlink export to analyse; change the path if the file lives elsewhere
file_path = "/content/seodepths blk.xlsx"
df = pd.read_excel(io=file_path)

# Tokenize URL
def tokenize_url(url):
    """Split a URL into lowercase word-like tokens.

    The ``http://`` / ``https://`` scheme is removed, then the remainder is
    split on runs of the separators ``/ . - ? = _ &``. Empty pieces (for
    example from a trailing slash) are dropped.

    Args:
        url: The URL to tokenize. Any value that is not a ``str`` (``None``,
            NaN from an empty spreadsheet cell, numbers, list-likes) is
            treated as "no URL".

    Returns:
        A list of lowercase tokens; ``[]`` when ``url`` is not a string.
    """
    # Check the type first: NaN and None are not str, so this one test covers
    # missing values, and it avoids calling pd.isna() on a list-like, which
    # returns an array whose truth value is ambiguous.
    if not isinstance(url, str):
        return []
    without_scheme = re.sub(r"https?://", "", url)
    pieces = re.split(r"[\/\.\-\?\=\_\&]+", without_scheme)
    return [piece.lower() for piece in pieces if piece]

# Build the training corpus: one token list per non-empty URL cell
url_columns = ['Referring page URL', 'Target URL']
token_lists = []

for col in url_columns:
    if col not in df.columns:
        # A missing column is only reported here; it does not stop the run
        print(f"Warning: Column '{col}' not found.")
        continue
    token_lists += [tokenize_url(cell) for cell in df[col].dropna()]

# Fit Word2Vec on the URL tokens of this file
model = Word2Vec(
    sentences=token_lists,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

# Get average embedding for a URL
def get_average_vector(tokens):
    """Average the Word2Vec vectors of the tokens that the model knows.

    Tokens missing from ``model.wv`` are ignored. When none is known, a zero
    vector of length ``model.vector_size`` is returned.
    """
    known_vectors = [model.wv[tok] for tok in tokens if tok in model.wv]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return mean(known_vectors, axis=0)

# Cosine similarity function
def cosine_sim(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2`` as a Python float.

    If either vector has zero length the result is ``0.0`` rather than a
    division by zero.
    """
    len1, len2 = norm(v1), norm(v2)
    if len1 == 0 or len2 == 0:
        return 0.0
    similarity = np.dot(v1, v2) / (len1 * len2)
    return float(similarity)

# One similarity per row: tokenize both URLs, average their vectors, compare.
# row.get() yields None for an absent column, which tokenizes to no tokens.
similarities = [
    cosine_sim(
        get_average_vector(tokenize_url(row.get('Referring page URL'))),
        get_average_vector(tokenize_url(row.get('Target URL'))),
    )
    for _, row in df.iterrows()
]

# Attach the scores to the dataframe
df['URL Cosine Similarity'] = similarities

# Highest-scoring backlinks first
df_sorted = df.sort_values(by='URL Cosine Similarity', ascending=False)

# Display top rows
sample_columns = ['Referring page URL', 'Target URL', 'URL Cosine Similarity']
df_sorted[sample_columns].head()

# Optional: save to Excel or CSV
#df.to_excel("url_similarity_output.xlsx", index=False)


Unnamed: 0,Referring page URL,Target URL,URL Cosine Similarity
65,https://tridentseo.biz/pagination-best-seo-pra...,https://seodepths.com/seo-research/seo-paginat...,0.645092
43,https://www.seo-praktik.si/tedensko-azuriran-s...,https://seodepths.com/seo-research/nlp-seo-gui...,0.633962
30,https://www.jcchouinard.com/projects-and-resou...,https://seodepths.com/python-for-seo/canonical...,0.629569
66,https://wecanrankanything.com/tag/seo/page/3674/,https://seodepths.com/seo-research/nlp-seo-gui...,0.609712
1,https://seotistics.kit.com/posts/my-seo-stack-...,https://seodepths.com/seo-research/robots-bloc...,0.589976
