In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the full markup CSV and keep only the text and coordinate columns
# ('description', 'lat', 'lng').
df = pd.read_csv("full_markup_v2.csv")
df = df[['description', 'lat', 'lng']]


In [2]:
import pandas as pd

# Load the test markup CSV. Its first column is consumed as the index and then
# discarded, leaving a fresh 0..n-1 index.
test = pd.read_csv("test_markup.csv", index_col=0)
test = test.reset_index(drop=True)

In [42]:
# Wrap the pandas test frame in a `Dataset` so it can be processed with `.map`.
test_dataset = Dataset.from_pandas(test)

In [16]:
from transformers import BertTokenizer

# Tokenizer loaded from "bert-base-uncased"; the `tokenize` function reads this global.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [43]:
def tokenize(example):
    """Tokenize the ``description`` text to a fixed length of 128 tokens.

    Inputs are truncated and padded to ``max_length``. Relies on the
    module-level ``tokenizer``.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(example["description"], **encoding_options)


# Batched mapping: `tokenize` is called with a batch of rows at a time.
test_dataset = test_dataset.map(tokenize, batched=True)


Map:   0%|          | 0/1960 [00:00<?, ? examples/s]

In [44]:
import torch
def add_labels(example):
    """Store the example's ``[lat, lng]`` pair under ``labels`` as a float tensor.

    The example mapping is modified in place and also returned.
    """
    coords = [example["lat"], example["lng"]]
    example["labels"] = torch.tensor(coords, dtype=torch.float)
    return example

# Add the `labels` column one example at a time (no `batched=True` here).
test_dataset = test_dataset.map(add_labels)
# Expose the two model inputs as torch tensors.
# NOTE(review): `labels` is not in this column list, so the labels added above do not
# appear to be passed on to the Trainer — confirm this is intended for prediction-only use.
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/1960 [00:00<?, ? examples/s]

In [19]:
import os

# Restrict CUDA to device index 0 for this process.
# NOTE(review): this normally has to be set before CUDA is initialised — confirm this
# cell runs early enough on a fresh kernel.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [26]:
from transformers import TrainingArguments, Trainer, BertTokenizerFast, DataCollatorWithPadding, BertConfig, BertPreTrainedModel, BertModel
import torch
from torch import nn

class BoundedLatLonRegressor(nn.Module):
    """MLP head mapping a feature vector to a bounded (latitude, longitude) pair.

    The final ``Tanh`` keeps the raw outputs in [-1, 1]; ``forward`` rescales them
    to degrees, so latitude lies in [-90, 90] and longitude in [-180, 180].

    Args:
        input_dim: Size of the last dimension of the input features.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute name `mlp` and the layer order define the state-dict keys,
        # so both are kept as-is to stay loadable from existing checkpoints.
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
            nn.Tanh()  # Output in [-1, 1]
        )

    def forward(self, x):
        """Predict coordinates in degrees.

        Args:
            x: Float tensor of shape ``(..., input_dim)``.

        Returns:
            Tensor of shape ``(..., 2)`` holding ``[lat, lon]`` in degrees.
        """
        latlon = self.mlp(x)
        # Index and stack along the LAST axis so any number of leading batch
        # dimensions is handled; for the usual (batch, features) input this is
        # identical to indexing with `[:, k]` and stacking on dim=1.
        lat = latlon[..., 0] * 90     # [-1, 1] → [-90, 90]
        lon = latlon[..., 1] * 180    # [-1, 1] → [-180, 180]
        return torch.stack([lat, lon], dim=-1)

class BertForLatLonCosineLoss(BertPreTrainedModel):
    """BERT encoder with a bounded lat/lon regression head.

    The loss compares prediction and target as points on the unit sphere:
    ``1 - dot(pred_xyz, true_xyz)``, averaged over the batch.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.regressor = BoundedLatLonRegressor(config.hidden_size)
        self.init_weights()
        # Explicitly mark every encoder parameter as trainable.
        self.bert.requires_grad_(True)

    def latlon_to_xyz(self, latlon):
        """Convert ``(n, 2)`` ``[lat, lon]`` degrees into ``(n, 3)`` unit-sphere xyz."""
        lat = torch.deg2rad(latlon[:, 0])
        lon = torch.deg2rad(latlon[:, 1])
        components = [
            torch.cos(lat) * torch.cos(lon),
            torch.cos(lat) * torch.sin(lon),
            torch.sin(lat),
        ]
        return torch.stack(components, dim=1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode the inputs and regress coordinates from the pooled output.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional ``[lat, lon]`` targets in degrees, indexed as ``(n, 2)``.

        Returns:
            Dict with ``"loss"`` (``None`` when ``labels`` is not given) and
            ``"logits"`` (the predicted ``[lat, lon]`` in degrees).
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        latlon_pred = self.regressor(encoded.pooler_output)

        if labels is None:
            loss = None
        else:
            pred_xyz = self.latlon_to_xyz(latlon_pred)
            true_xyz = self.latlon_to_xyz(labels)
            per_example = 1 - (pred_xyz * true_xyz).sum(dim=1)
            loss = per_example.mean()

        return {"loss": loss, "logits": latlon_pred}


# Checkpoint directory; both the config and the model weights are read from it.
model_path = "./latlon_model_v4/checkpoint-30955"
config = BertConfig.from_pretrained(model_path)
# Instantiate the custom `BertForLatLonCosineLoss` model from the saved checkpoint.
model = BertForLatLonCosineLoss.from_pretrained(model_path, config=config)

# Arguments for using `Trainer` as an inference harness only.
training_args = TrainingArguments(
    output_dir='predicts',
    # `do_eval` is a boolean flag. The previous value, the string 'no', is truthy
    # and therefore said the opposite of what was meant.
    do_eval=False,
    # Keep dataset columns even if the model's `forward` does not accept them.
    remove_unused_columns=False,
    per_device_eval_batch_size=512,
)

# The tokenizer is needed here so the collator knows how to pad batches.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Pad each batch to its longest sequence, rounded up to a multiple of 8.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    max_length=256,
    padding="longest",
)

# Trainer wrapping the custom model; used for `trainer.predict`.
trainer = Trainer(
    args=training_args,
    data_collator=data_collator,
    model=model,
)

In [27]:
# Run inference over the whole test set; `.predictions` holds the model outputs.
predicts = trainer.predict(test_dataset)

In [28]:
# Predicted [lat, lon] pairs, one row per test example.
predicts.predictions

array([[  37.833477 , -120.98359  ],
       [   4.8686676,  -74.883484 ],
       [  24.378277 ,  110.39495  ],
       ...,
       [  44.338493 ,   11.458629 ],
       [  35.92471  ,  114.1914   ],
       [  20.514193 , -101.00489  ]], shape=(1960, 2), dtype=float32)

In [29]:
# Column 0 of the prediction array is latitude, column 1 is longitude.
for column_index, column_name in enumerate(('pred_lat', 'pred_lng')):
    test[column_name] = predicts.predictions[:, column_index]

In [34]:
# Persist the test frame together with the `pred_lat` / `pred_lng` columns; this file
# is read back as `our_preds` in the evaluation cells.
test.to_csv('test_preds.csv')

In [45]:
# Load the "k4tel/geo-bert-multilingual" model and its tokenizer directly from the hub.
# NOTE(review): `tokenizer` and `model` rebind the names used for the custom model above.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("k4tel/geo-bert-multilingual")
# NOTE(review): the prediction output recorded further down for this model is a tuple of
# high-dimensional arrays rather than an (n, 2) lat/lon array, so `AutoModel` presumably
# loads the encoder without a coordinate head — check the model card for the intended class.
model = AutoModel.from_pretrained("k4tel/geo-bert-multilingual")

In [46]:
# Rebuild the collator around the tokenizer that is current at this point.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    max_length=256,
    padding="longest",
)

# Rebuild the trainer around the model that is current at this point.
trainer = Trainer(
    args=training_args,
    data_collator=data_collator,
    model=model,
)

In [47]:
# Run inference again with the trainer that is current at this point; `predicts` is overwritten.
predicts = trainer.predict(test_dataset)

In [49]:
# NOTE(review): the output below shows `.predictions` as a tuple of arrays here,
# not a single (n, 2) array of coordinates.
predicts.predictions

(array([[[-0.14192955, -0.41838244,  0.11701819, ...,  0.7798312 ,
          -0.42147642,  0.43597618],
         [-0.15586838, -0.42848715,  0.14434335, ...,  0.77767444,
          -0.4396696 ,  0.41985932],
         [-0.15230465, -0.42344218,  0.1454553 , ...,  0.7737306 ,
          -0.42943978,  0.41905805],
         ...,
         [-0.14339434, -0.40997997,  0.13614056, ...,  0.7778407 ,
          -0.43589255,  0.42625588],
         [-0.14309828, -0.4170205 ,  0.1212741 , ...,  0.77945316,
          -0.4250329 ,  0.43297344],
         [-0.14310706, -0.41786584,  0.12008908, ...,  0.77878135,
          -0.4232025 ,  0.43329704]],
 
        [[-0.74678326, -1.0637821 ,  0.93283343, ..., -0.93808246,
          -0.20170006, -0.44278747],
         [-0.6879488 , -0.97236496,  0.7981638 , ..., -0.7724738 ,
          -0.36104044, -0.59423846],
         [-0.63795894, -0.9278943 ,  0.8627004 , ..., -0.84266746,
          -0.29186046, -0.4959944 ],
         ...,
         [-0.7379645 , -1.03387  

In [48]:
# NOTE(review): this cell fails. `predicts.predictions` is a tuple at this point, so
# `[:, 0]` raises the TypeError shown below. Lat/lon values have to be derived from the
# model output before these columns can be filled. Also, 'test_k4tel_preds.csv', which
# is read in the evaluation cells, is not written anywhere in the visible notebook.
test['pred_lat_k4tl'] = predicts.predictions[:, 0]
test['pred_lng_k4tl'] = predicts.predictions[:, 1]

TypeError: tuple indices must be integers or slices, not tuple

In [5]:
# Importing the geodesic module from the library
from geopy.distance import geodesic
import pandas as pd

# Prediction files for the three methods, followed by the ground-truth test markup.
our_preds, their_preds, gpt_preds, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'test_preds.csv',
        'test_k4tel_preds.csv',
        'gpt41nano_test.csv',
        'test_markup.csv',
    )
)

In [6]:
# Pull the predicted latitude / longitude series out of each prediction frame.
our_lat, our_lng = our_preds['pred_lat'], our_preds['pred_lng']
their_lat, their_lng = their_preds['pred_lat'], their_preds['pred_lng']
# The GPT file uses its own column names for the answers.
gpt_lat, gpt_lng = gpt_preds['ans_lat'], gpt_preds['ans_log']

In [7]:
def _distances_to_truth(pred_lat, pred_lng):
    """Return the geodesic distance from each prediction to the true point in ``test``.

    Rows are looked up by the labels 0..len(test)-1 in both the prediction series
    and ``test``.
    """
    return [
        geodesic(
            (pred_lat[row], pred_lng[row]),
            (test.loc[row, 'lat'], test.loc[row, 'lng']),
        )
        for row in range(len(test))
    ]


our_dists = _distances_to_truth(our_lat, our_lng)
their_dists = _distances_to_truth(their_lat, their_lng)
gpt_dists = _distances_to_truth(gpt_lat, gpt_lng)

In [8]:
import numpy as np
# Median error per method, in the order (ours, k4tel, GPT). The values are geopy
# `Distance` objects; presumably kilometres — confirm.
np.median(our_dists), np.median(their_dists), np.median(gpt_dists)

(Distance(342.7871302851296),
 Distance(474.16477008829736),
 Distance(284.9515490008634))

In [9]:
# Mean error per method, in the order (ours, k4tel, GPT). The values are geopy
# `Distance` objects; presumably kilometres — confirm.
np.mean(our_dists), np.mean(their_dists), np.mean(gpt_dists)

(Distance(1150.3924263127833),
 Distance(2129.9926089978476),
 Distance(1201.5600113365979))