In [1]:
!pip install pymongo sentence-transformers xgboost lightgbm

Collecting pymongo
  Downloading pymongo-4.13.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cubl

In [2]:

from pymongo import MongoClient
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import joblib


In [3]:
# ============================ #
#      2. Connect to MongoDB   #
# ============================ #

username = "salma"
password = "f3ekNjEhzec0cYkx"
cluster = "production.g8vjv.mongodb.net"
uri = f"mongodb+srv://{username}:{password}@{cluster}/?retryWrites=true&w=majority"

client = MongoClient(uri)
db = client["heroku_v801wdr2"]
collection = db["addresses"]

In [4]:


# ============================ #
#    3. Data Cleaning/Prep     #
# ============================ #

ABBREVIATION_MAP = {
    "st": "street", "rd": "road", "ave": "avenue", "blvd": "boulevard",
    "blk": "block", "bldg": "building", "apt": "apartment", "flr": "floor",
    "mohd": "mohammed", "bn": "bin", "hwy": "highway"
}

def normalize_text(text):
    words = text.lower().split()
    return " ".join([ABBREVIATION_MAP.get(word, word) for word in words])

def build_address(row):
    parts = [
        row.get("country", ""), row.get("area") or row.get("city", ""),
        f"block {row.get('block')}" if row.get("block") else "",
        row.get("street", ""), row.get("buildingNumber", ""),
        row.get("apartment", ""), row.get("floor", "")
    ]
    return normalize_text(" ".join([str(p) for p in parts if p]))

def get_lat_lon(loc):
    return loc.get("latitude"), loc.get("longitude") if isinstance(loc, dict) else (None, None)


In [5]:

# ========== Download and Clean Data ========= #
def load_data(collection, limit=100_000):
    cursor = collection.find({
        "deleted": False,
        "rawStreet": False,
        "location.latitude": {"$exists": True},
        "location.longitude": {"$exists": True}
    }, {
        "_id": 0, "country": 1, "area": 1, "city": 1, "block": 1,
        "street": 1, "buildingNumber": 1, "apartment": 1, "floor": 1,
        "location": 1
    }).limit(limit)

    df = pd.DataFrame(list(cursor))
    df["input_text"] = df.apply(build_address, axis=1)
    df[["latitude", "longitude"]] = df["location"].apply(lambda x: pd.Series(get_lat_lon(x)))
    df = df[["input_text", "latitude", "longitude"]].dropna().drop_duplicates()
    return df

df = load_data(collection)
print("📊 Data loaded:", df.shape)


📊 Data loaded: (77797, 3)


model_name = "all-MiniLM-L6-v2"  # ≈2x faster, still good quality


In [6]:
# 6. Sentence embeddings
model_name = "paraphrase-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

batch_size = 512
embeddings = []
for i in tqdm(range(0, len(df), batch_size)):
    batch = df["input_text"].iloc[i:i+batch_size].tolist()
    emb = embedder.encode(batch, batch_size=64, show_progress_bar=False)
    embeddings.append(emb)
X = np.vstack(embeddings)
y = df[["latitude", "longitude"]].values

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 152/152 [13:17<00:00,  5.25s/it]


In [7]:
# 7. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
#Haversine Evaluation Function

from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    return 6371000 * 2 * asin(sqrt(a))  # meters

def eval_model(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    distances = [haversine(a[0], a[1], b[0], b[1]) for a, b in zip(y_true, y_pred)]
    print(f"RMSE: {rmse:.5f}")
    print(f"MAE: {mae:.5f}")
    print(f"R²: {r2:.5f}")
    print(f"Mean distance: {np.mean(distances):.2f} meters")
    print(f"Median distance: {np.median(distances):.2f} meters")


#Training

In [9]:
xgb_model = MultiOutputRegressor(
    XGBRegressor(
        objective='reg:squarederror',
        n_estimators=300,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=42
    )
)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print("🔷 XGBoost Evaluation:")
eval_model(y_test, y_pred)


🔷 XGBoost Evaluation:
RMSE: 0.05021
MAE: 0.01474
R²: 0.90221
Mean distance: 2401.12 meters
Median distance: 1385.76 meters


In [10]:
lgbm_model = MultiOutputRegressor(
    LGBMRegressor(
        n_estimators=300,
        num_leaves=40,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1
    )
)
lgbm_model.fit(X_train, y_train)
y_pred = lgbm_model.predict(X_test)
print("🟢 LightGBM Evaluation:")
eval_model(y_test, y_pred)




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.729008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 62237, number of used features: 384
[LightGBM] [Info] Start training from score 29.263305




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.832377 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 62237, number of used features: 384
[LightGBM] [Info] Start training from score 48.006480




🟢 LightGBM Evaluation:
RMSE: 0.05773
MAE: 0.01707
R²: 0.87090
Mean distance: 2781.10 meters
Median distance: 1696.46 meters


In [11]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train_scaled, dtype=torch.float32)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_t = torch.tensor(y_test_scaled, dtype=torch.float32)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=128, shuffle=True)

class GeoNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 2)
        )
    def forward(self, x): return self.net(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GeoNN(X_train.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Training
model.train()
for epoch in range(30):
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    if epoch % 5 == 0 or epoch == 29:
        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader.dataset):.4f}")


Epoch 1: Loss = 0.4117
Epoch 6: Loss = 0.3302
Epoch 11: Loss = 0.3233
Epoch 16: Loss = 0.3120
Epoch 21: Loss = 0.3157
Epoch 26: Loss = 0.2913
Epoch 30: Loss = 0.3138


In [12]:
model.eval()
with torch.no_grad():
    pred_scaled = model(X_test_t.to(device)).cpu().numpy()
    preds = scaler_y.inverse_transform(pred_scaled)

print("🔴 DNN Evaluation:")
eval_model(y_test, preds)


🔴 DNN Evaluation:
RMSE: 0.03622
MAE: 0.00816
R²: 0.94978
Mean distance: 1337.08 meters
Median distance: 908.07 meters
