Landmark Similarity

In [1]:
%pip install pandas requests sentence-transformers hdbscan umap-learn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import time

import os

# Load the geocoded rows. Later statements in this cell read the columns
# latitude/longitude and new_latitude/new_longitude from this frame.
df = pd.read_csv("output_with_geocoding.csv")
df = df.head(10)  # Process only first 10 rows for testing

# Prefer a key supplied through the environment so a real credential is never
# saved inside the notebook; the literal placeholder remains as the fallback.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "API_KEY")  # or replace this

def get_landmarks(lat, lon, radius=50):
    """Return the names of places near a coordinate (Google Places Nearby Search).

    Args:
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        radius: Value sent as the API's ``radius`` parameter (metres, per the
            Nearby Search documentation).

    Returns:
        list[str]: Place names from the response. An empty list is returned
        when nothing is found or when the request fails; failures are printed
        rather than raised so a row-wise ``apply`` keeps going.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "location": f"{lat},{lon}",
        "radius": radius,
        "key": API_KEY
    }
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Places reports quota/auth problems in the JSON body with HTTP 200,
        # so the body's status must be checked as well.
        status = data.get("status")
        if status not in (None, "OK", "ZERO_RESULTS"):
            print(f"Error: Places API status {status}: {data.get('error_message', 'no details')}")
            return []

        # Skip entries lacking a name instead of discarding the whole result.
        return [place["name"] for place in data.get("results", []) if "name" in place]
    except Exception as e:
        print(f"Error: {e}")
        return []

# Register `progress_apply` on pandas objects so row-wise work shows a bar.
tqdm.pandas()

# Nearby place names for the latitude/longitude pair of every row.
df["orig_landmarks"] = df.progress_apply(
    lambda rec: get_landmarks(rec["latitude"], rec["longitude"]),
    axis=1,
)
# Nearby place names for the new_latitude/new_longitude pair of every row.
df["rev_landmarks"] = df.progress_apply(
    lambda rec: get_landmarks(rec["new_latitude"], rec["new_longitude"]),
    axis=1,
)

# Sentence-embedding model that compute_similarity encodes the name lists with.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def compute_similarity(orig_list, rev_list):
    """Score how alike two lists of landmark names are.

    Each list is joined into one comma-separated string, both strings are
    embedded with the module-level ``model``, and the cosine similarity of the
    two embeddings is returned as a float.

    Returns 0.0 when either list is empty/falsy, and also when encoding raises
    (the error is printed, not propagated).
    """
    if not (orig_list and rev_list):
        return 0.0
    try:
        # Build both texts first, then embed them in the same order.
        left_text, right_text = (", ".join(names) for names in (orig_list, rev_list))

        left_vec = model.encode(left_text, convert_to_tensor=True)
        right_vec = model.encode(right_text, convert_to_tensor=True)

        return util.pytorch_cos_sim(left_vec, right_vec).item()
    except Exception as e:
        print(f"Error: {e}")
        return 0.0

# Similarity score for each row's pair of landmark lists.
df["landmark_similarity"] = df.progress_apply(
    lambda rec: compute_similarity(rec["orig_landmarks"], rec["rev_landmarks"]),
    axis=1,
)

# Rows scoring strictly above 0.75 are reported as high-similarity pairs.
high_sim_matches = df.loc[df["landmark_similarity"].gt(0.75)]
print(f"✔️ High similarity pairs (landmarks): {len(high_sim_matches)} out of {len(df)}")

# Persist the sample frame, including the landmark and score columns.
df.to_csv("landmark_similarity_results_sample10.csv", index=False)


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:05<00:00,  1.69it/s]
100%|██████████| 10/10 [00:05<00:00,  1.92it/s]
100%|██████████| 10/10 [00:03<00:00,  2.69it/s]

✔️ High similarity pairs (landmarks): 7 out of 10





In [None]:
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import time

import os

# Load every geocoded row. Later statements in this cell read the columns
# latitude/longitude and new_latitude/new_longitude from this frame.
df = pd.read_csv("output_with_geocoding.csv")

# Prefer a key supplied through the environment so a real credential is never
# saved inside the notebook; the literal placeholder remains as the fallback.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "API_KEY")  # or replace this


def get_landmarks(lat, lon, radius=50, retries=3):
    """Return the names of places near a coordinate, retrying transient failures.

    Args:
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        radius: Value sent as the API's ``radius`` parameter (metres, per the
            Nearby Search documentation).
        retries: Maximum number of request attempts.

    Returns:
        list[str]: Place names from the first successful response. An empty
        list is returned when nothing is found, when the API reports a
        permanent error, or when every attempt fails. Problems are printed,
        never raised.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "location": f"{lat},{lon}",
        "radius": radius,
        "key": API_KEY
    }
    # Places statuses that may succeed on a later attempt. They arrive with
    # HTTP 200 and an empty "results" list, so they must be detected from the
    # body or the retry loop would never see them.
    transient = {"OVER_QUERY_LIMIT", "UNKNOWN_ERROR"}

    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, timeout=10)
            if response.status_code == 200:
                data = response.json()
                status = data.get("status")
                if status in transient:
                    print(f"Places API status: {status}")
                elif status not in (None, "OK", "ZERO_RESULTS"):
                    # e.g. REQUEST_DENIED / INVALID_REQUEST: retrying the same
                    # request cannot help, so stop immediately.
                    print(f"Places API status: {status} ({data.get('error_message', 'no details')})")
                    return []
                elif "results" in data:
                    return [place["name"] for place in data["results"]]
            else:
                print(f"Status code: {response.status_code}, sleeping...")
        except Exception as e:
            print(f"API error: {e}")
        # Only wait when another attempt will actually follow.
        if attempt < retries - 1:
            time.sleep(2)  # Retry delay
    return []


# Register `progress_apply` on pandas objects with a labelled progress bar.
tqdm.pandas(desc="Fetching Landmarks")

# Nearby place names for the latitude/longitude pair of every row.
df["orig_landmarks"] = df.progress_apply(
    lambda rec: get_landmarks(rec["latitude"], rec["longitude"]),
    axis=1,
)
# Nearby place names for the new_latitude/new_longitude pair of every row.
df["rev_landmarks"] = df.progress_apply(
    lambda rec: get_landmarks(rec["new_latitude"], rec["new_longitude"]),
    axis=1,
)

# Sentence-embedding model that compute_similarity encodes the name lists with.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def compute_similarity(orig_list, rev_list):
    """Cosine similarity between the embeddings of two landmark-name lists.

    Each list is joined with ", " and embedded with the module-level ``model``.
    Returns 0.0 when either list is empty/falsy, and also when encoding raises
    (the error is printed, not propagated).
    """
    if orig_list and rev_list:
        try:
            orig_text = ", ".join(orig_list)
            orig_vec = model.encode(orig_text, convert_to_tensor=True)
            rev_text = ", ".join(rev_list)
            rev_vec = model.encode(rev_text, convert_to_tensor=True)
            score = util.pytorch_cos_sim(orig_vec, rev_vec)
            return score.item()
        except Exception as e:
            print(f"Encoding error: {e}")
    # Reached for empty input or after a reported encoding failure.
    return 0.0

# Re-register the pandas hook: the earlier tqdm.pandas(desc="Fetching Landmarks")
# call applies to every later progress_apply, which would mislabel this bar.
tqdm.pandas(desc="Computing Similarity")

# Similarity score for each row's pair of landmark lists.
df["landmark_similarity"] = df.progress_apply(
    lambda row: compute_similarity(row["orig_landmarks"], row["rev_landmarks"]), axis=1
)

# Rows scoring strictly above 0.75 are reported as high-similarity matches.
high_sim_matches = df[df["landmark_similarity"] > 0.75]
print(f"✔️ High similarity matches: {len(high_sim_matches)} / {len(df)}")

# Save Results
df.to_csv("landmark_similarity_results.csv", index=False)
print("✅ Results saved to landmark_similarity_results.csv")

In [None]:
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import time


import os

# Load every geocoded row. Later statements in this cell read the columns
# latitude/longitude and new_latitude/new_longitude from this frame.
df = pd.read_csv("output_with_geocoding.csv")

# Prefer a key supplied through the environment so a real credential is never
# saved inside the notebook; the literal placeholder remains as the fallback.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "API_KEY")  # or replace this


def get_landmarks(lat, lon, radius=50):
    """Return the names of places near a coordinate (Google Places Nearby Search).

    Args:
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        radius: Value sent as the API's ``radius`` parameter (metres, per the
            Nearby Search documentation).

    Returns:
        list[str]: Place names from the response. An empty list is returned
        when nothing is found or when the request fails; failures are printed
        rather than raised so a row-wise ``apply`` keeps going.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        "location": f"{lat},{lon}",
        "radius": radius,
        "key": API_KEY
    }
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Places reports quota/auth problems in the JSON body with HTTP 200,
        # so the body's status must be checked as well.
        status = data.get("status")
        if status not in (None, "OK", "ZERO_RESULTS"):
            print(f"Error: Places API status {status}: {data.get('error_message', 'no details')}")
            return []

        # Skip entries lacking a name instead of discarding the whole result.
        return [place["name"] for place in data.get("results", []) if "name" in place]
    except Exception as e:
        print(f"Error: {e}")
        return []

# Nearby place names for the latitude/longitude pair of every row.
df["orig_landmarks"] = df.apply(
    lambda rec: get_landmarks(rec["latitude"], rec["longitude"]),
    axis=1,
)
# Nearby place names for the new_latitude/new_longitude pair of every row.
df["rev_landmarks"] = df.apply(
    lambda rec: get_landmarks(rec["new_latitude"], rec["new_longitude"]),
    axis=1,
)

# Sentence-embedding model that compute_similarity encodes the name lists with.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def compute_similarity(orig_list, rev_list):
    """Score how alike two lists of landmark names are.

    Each list is joined into one comma-separated string, both strings are
    embedded with the module-level ``model``, and the cosine similarity of the
    two embeddings is returned as a float.

    Returns 0.0 when either list is empty/falsy, and also when encoding raises
    (the error is printed, not propagated).
    """
    if not (orig_list and rev_list):
        return 0.0
    try:
        # Build both texts first, then embed them in the same order.
        left_text, right_text = (", ".join(names) for names in (orig_list, rev_list))

        left_vec = model.encode(left_text, convert_to_tensor=True)
        right_vec = model.encode(right_text, convert_to_tensor=True)

        return util.pytorch_cos_sim(left_vec, right_vec).item()
    except Exception as e:
        print(f"Error: {e}")
        return 0.0

# progress_apply only exists after tqdm registers its pandas hook. This cell
# never did so itself, which raises AttributeError when it runs on its own.
tqdm.pandas()

# Similarity score for each row's pair of landmark lists.
df["landmark_similarity"] = df.progress_apply(lambda row: compute_similarity(row["orig_landmarks"], row["rev_landmarks"]), axis=1)

# Rows scoring strictly above 0.75 are reported as high-similarity pairs.
high_sim_matches = df[df["landmark_similarity"] > 0.75]
print(f"✔️ High similarity pairs (landmarks): {len(high_sim_matches)} out of {len(df)}")

# Persist the frame, including the landmark and score columns.
df.to_csv("landmark_similarity_results.csv", index=False)