In [7]:
!pip install pymongo sentence-transformers xgboost lightgbm



In [8]:
# 2. Imports
from pymongo import MongoClient
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
import joblib

In [9]:
# ============================ #
#      2. Connect to MongoDB   #
# ============================ #

username = "salma"
password = "f3ekNjEhzec0cYkx"
cluster = "production.g8vjv.mongodb.net"
uri = f"mongodb+srv://{username}:{password}@{cluster}/?retryWrites=true&w=majority"

client = MongoClient(uri)
db = client["heroku_v801wdr2"]
collection = db["addresses"]

In [10]:


# ============================ #
#    3. Data Cleaning/Prep     #
# ============================ #

ABBREVIATION_MAP = {
    "st": "street", "rd": "road", "ave": "avenue", "blvd": "boulevard",
    "blk": "block", "bldg": "building", "apt": "apartment", "flr": "floor",
    "mohd": "mohammed", "bn": "bin", "hwy": "highway"
}

def normalize_text(text):
    words = text.lower().split()
    return " ".join([ABBREVIATION_MAP.get(word, word) for word in words])

def build_address(row):
    parts = [
        row.get("country", ""), row.get("area") or row.get("city", ""),
        f"block {row.get('block')}" if row.get("block") else "",
        row.get("street", ""), row.get("buildingNumber", ""),
        row.get("apartment", ""), row.get("floor", "")
    ]
    return normalize_text(" ".join([str(p) for p in parts if p]))

def get_lat_lon(loc):
    return loc.get("latitude"), loc.get("longitude") if isinstance(loc, dict) else (None, None)


In [11]:

# ========== Download and Clean Data ========= #
def load_data(collection, limit=100_000):
    cursor = collection.find({
        "deleted": False,
        "rawStreet": False,
        "location.latitude": {"$exists": True},
        "location.longitude": {"$exists": True}
    }, {
        "_id": 0, "country": 1, "area": 1, "city": 1, "block": 1,
        "street": 1, "buildingNumber": 1, "apartment": 1, "floor": 1,
        "location": 1
    }).limit(limit)

    df = pd.DataFrame(list(cursor))
    df["input_text"] = df.apply(build_address, axis=1)
    df[["latitude", "longitude"]] = df["location"].apply(lambda x: pd.Series(get_lat_lon(x)))
    df = df[["input_text", "latitude", "longitude"]].dropna().drop_duplicates()
    return df

df = load_data(collection)
print("ðŸ“Š Data loaded:", df.shape)


ðŸ“Š Data loaded: (77797, 3)


In [12]:
# 6. Sentence embeddings
model_name = "paraphrase-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

batch_size = 512
embeddings = []
for i in tqdm(range(0, len(df), batch_size)):
    batch = df["input_text"].iloc[i:i+batch_size].tolist()
    emb = embedder.encode(batch, batch_size=64, show_progress_bar=False)
    embeddings.append(emb)
X = np.vstack(embeddings)
y = df[["latitude", "longitude"]].values

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 152/152 [12:28<00:00,  4.92s/it]


In [13]:
# 7. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Training

In [14]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

xgb_model = MultiOutputRegressor(
    XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        n_jobs=-1,
        random_state=42
    )
)

xgb_model.fit(X_train, y_train)

# Predict on test set
y_pred = xgb_model.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("XGBoost results:")
print(f"RMSE: {rmse:.5f}")
print(f"MAE: {mae:.5f}")
print(f"R^2: {r2:.5f}")


XGBoost results:
RMSE: 0.05611
MAE: 0.01969
R^2: 0.87808


In [15]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
import numpy as np

lgbm_model = MultiOutputRegressor(
    LGBMRegressor(
        n_estimators=100,
        num_leaves=31,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1
    )
)

lgbm_model.fit(X_train, y_train)

# Predict on test set
y_pred = lgbm_model.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("LightGBM results:")
print(f"RMSE: {rmse:.5f}")
print(f"MAE: {mae:.5f}")
print(f"R^2: {r2:.5f}")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.940138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 62237, number of used features: 384
[LightGBM] [Info] Start training from score 29.263305




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.890514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 62237, number of used features: 384
[LightGBM] [Info] Start training from score 48.006480




LightGBM results:
RMSE: 0.06075
MAE: 0.02054
R^2: 0.85714


In [16]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

# Prepare data as tensors
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Define simple feedforward network
class GeoNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 2)
        )

    def forward(self, x):
        return self.net(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GeoNN(X_train.shape[1]).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop
epochs = 30
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss /= len(train_loader.dataset)
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.6f}")

# Evaluation
model.eval()
with torch.no_grad():
    preds = model(X_test_t.to(device)).cpu().numpy()

rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("Deep Neural Network results:")
print(f"RMSE: {rmse:.5f}")
print(f"MAE: {mae:.5f}")
print(f"R^2: {r2:.5f}")


Epoch 1/30 - Loss: 86.573105
Epoch 5/30 - Loss: 5.219129
Epoch 10/30 - Loss: 2.760385
Epoch 15/30 - Loss: 0.981253
Epoch 20/30 - Loss: 0.119535
Epoch 25/30 - Loss: 0.069499
Epoch 30/30 - Loss: 0.032681
Deep Neural Network results:
RMSE: 0.08091
MAE: 0.04089
R^2: 0.74502
