In [2]:
#OPENING SPEEDBANDS data
import requests
import pandas as pd
url = "http://datamall2.mytransport.sg/ltaodataservice/v3/TrafficSpeedBands"

payload = {}
headers = {
  'AccountKey': '9jyOck1VT72vbXMlvCHlRw=='
}

try:
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()['value']
        
        # Extract relevant attributes from the data
        speed_data = []
        for item in data:
            LinkID = item['LinkID']
            RoadName = item['RoadName']
            RoadCategory = item['RoadCategory']
            SpeedBand = item['SpeedBand']
            MinimumSpeed = item['MinimumSpeed']
            MaximumSpeed = item['MaximumSpeed']
            StartLon = item['StartLon']
            StartLat = item['StartLat']
            EndLon = item['EndLon']
            EndLat = item['EndLat']
            speed_data.append({'LinkID': LinkID,
                                    'RoadName': RoadName,
                                    'RoadCategory': RoadCategory,
                                    'SpeedBand': SpeedBand,
                                    'MinimumSpeed': MinimumSpeed,
                                    'MaximumSpeed': MaximumSpeed,
                                    'StartLon': StartLon,
                                    'StartLat': StartLat,
                                    'EndLon': EndLon,
                                    'EndLat': EndLat})

        speed = pd.DataFrame(speed_data)
        print(speed)
    else:
        print("Error:", response.status_code)
except Exception as e:
    print("Error occurred:", str(e))

        LinkID             RoadName RoadCategory  SpeedBand MinimumSpeed  \
0    103000000            KENT ROAD            E          8           70   
1    103000010         BUCKLEY ROAD            E          3           20   
2    103000011         BUCKLEY ROAD            E          3           20   
3    103000014      SHREWSBURY ROAD            E          5           40   
4    103000015      SHREWSBURY ROAD            E          8           70   
..         ...                  ...          ...        ...          ...   
495  103001218       DRAYCOTT DRIVE            D          3           20   
496  103001219       DRAYCOTT DRIVE            D          3           20   
497  103001226      PADANG JERINGAU            D          4           30   
498  103001227      PADANG JERINGAU            D          5           40   
499  103001230  GUILLEMARD CRESCENT            E          4           30   

    MaximumSpeed            StartLon            StartLat              EndLon  \
0      

In [4]:
#ENCODE THE SPEEDBANDS DATA 
from sklearn.preprocessing import LabelEncoder
from transformers import GPT2Tokenizer
import torch

# Tokenize text data using GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize road names using GPT-2 tokenizer
tokenized_road_names = [tokenizer(RoadName)['input_ids'] for RoadName in speed['RoadName']]

# Tokenize road categories using GPT-2 tokenizer
tokenized_road_categories = [tokenizer(RoadCategory)['input_ids'] for RoadCategory in speed['RoadCategory']]

# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()
encoded_road_names = label_encoder.fit_transform(speed['RoadName'])
encoded_road_names = torch.tensor(encoded_road_names, dtype=torch.long)
encoded_road_categories = label_encoder.fit_transform(speed['RoadCategory'])
encoded_road_categories = torch.tensor(encoded_road_categories, dtype=torch.long)

# Combine encoded features with tokenized descriptions
encoded_data = [{'road_name': encoded_road_name, 
                 'road_category': encoded_road_category, 
                 'road_name_tokens': road_name_tokens,
                 'road_category_tokens': road_category_tokens}
                for encoded_road_name, encoded_road_category, road_name_tokens, road_category_tokens 
                in zip(encoded_road_names, encoded_road_categories, tokenized_road_names, tokenized_road_categories)]

print(encoded_data)

[{'road_name': tensor(66), 'road_category': tensor(4), 'road_name_tokens': [42, 3525, 15107, 2885], 'road_category_tokens': [36]}, {'road_name': tensor(21), 'road_category': tensor(4), 'road_name_tokens': [33, 16696, 25173, 15107, 2885], 'road_category_tokens': [36]}, {'road_name': tensor(21), 'road_category': tensor(4), 'road_name_tokens': [33, 16696, 25173, 15107, 2885], 'road_category_tokens': [36]}, {'road_name': tensor(112), 'road_category': tensor(4), 'road_name_tokens': [9693, 2200, 54, 16811, 4261, 56, 15107, 2885], 'road_category_tokens': [36]}, {'road_name': tensor(112), 'road_category': tensor(4), 'road_name_tokens': [9693, 2200, 54, 16811, 4261, 56, 15107, 2885], 'road_category_tokens': [36]}, {'road_name': tensor(50), 'road_category': tensor(4), 'road_name_tokens': [41, 1847, 1565, 509, 1581, 5673], 'road_category_tokens': [36]}, {'road_name': tensor(50), 'road_category': tensor(4), 'road_name_tokens': [41, 1847, 1565, 509, 1581, 5673], 'road_category_tokens': [36]}, {'roa

In [5]:
from transformers import DistilBertTokenizer, DistilBertModel

# Load pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Preprocess data and format it for input to the LLM
# Example: Encoding multiple columns from your dataset
road_names = speed['RoadName'].tolist()
road_categories = speed['RoadCategory'].tolist()  # Example additional column
# Encode each column separately
encoded_inputs_road_names = tokenizer(road_names, padding=True, truncation=True, return_tensors='pt')
encoded_inputs_road_categories = tokenizer(road_categories, padding=True, truncation=True, return_tensors='pt')

# Load pre-trained DistilBERT model
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Set the model to evaluation mode
model.eval()

# Input the encoded data into the LLM to extract meaningful insights and patterns from the text data
with torch.no_grad():
    outputs_road_names = model(**encoded_inputs_road_names)
    outputs_road_categories = model(**encoded_inputs_road_categories)

# Extract the representations of the input text from the model outputs
representations_road_names = outputs_road_names.last_hidden_state
representations_road_categories = outputs_road_categories.last_hidden_state
# Depending on your specific task, you may need to further process these representations
# For example, you can use the outputs as features for downstream tasks like classification or clustering
print(representations_road_names)
print(representations_road_categories)

tensor([[[-1.7321e-01, -2.0780e-02, -8.3952e-02,  ..., -6.1511e-02,
           1.9961e-02,  4.1058e-01],
         [-5.7101e-02,  8.0580e-02, -1.0489e-01,  ...,  1.4193e-01,
          -1.5675e-01, -1.3676e-01],
         [ 4.0371e-01,  8.4698e-02,  1.5834e-01,  ..., -2.4152e-02,
          -9.4326e-02, -2.9225e-01],
         ...,
         [-5.1021e-02,  1.3541e-01,  7.5495e-02,  ..., -2.3784e-02,
          -2.2781e-01,  1.6437e-01],
         [-7.8572e-02,  1.1041e-01,  8.6304e-02,  ..., -3.9145e-02,
          -2.2233e-01,  4.5542e-03],
         [-2.1940e-02,  5.1970e-02,  2.8353e-02,  ...,  2.7094e-02,
          -4.1015e-01,  1.4233e-01]],

        [[-1.7039e-01,  1.5440e-01, -5.6000e-02,  ..., -1.4073e-01,
           1.8495e-02,  3.8234e-01],
         [ 3.0403e-02,  3.8860e-01,  5.9745e-02,  ..., -1.4697e-02,
          -3.3487e-01,  2.1544e-04],
         [ 4.9530e-01,  5.2461e-01,  1.6562e-01,  ..., -1.5266e-01,
          -4.6537e-03, -1.7339e-01],
         ...,
         [ 2.8090e-02,  3

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np

#Encode Road Names
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
inputs = tokenizer(list(speed['RoadName']), padding = True, truncation=True, return_tensors='pt', max_length=512)

#Label Encoding
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(speed['RoadCategory'])
labels = torch.tensor(labels, dtype=torch.long)

# Split data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(inputs['input_ids'], labels, test_size=0.1, random_state=42)

# Convert to tensors
train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

  train_inputs = torch.tensor(train_inputs)
  val_inputs = torch.tensor(val_inputs)
  train_labels = torch.tensor(train_labels)
  val_labels = torch.tensor(val_labels)


In [7]:
class SpeedBandsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}  # Problematic line
        item['labels'] = torch.tensor(self.labels[idx])
        return item


# Prepare dataset
train_dataset = SpeedBandsDataset({'input_ids': train_inputs}, train_labels)
val_dataset = SpeedBandsDataset({'input_ids': val_inputs}, val_labels)

# Prepare data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [8]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Load pre-trained DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=np.unique(labels).shape[0])

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Prepare optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 3
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_loader) * epochs
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def train(model, optimizer, scheduler, train_loader, val_loader, epochs=3):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss}')
        
        # Validation
        model.eval()
        total_val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                total_val_loss += loss.item()

                _, predicted = torch.max(outputs.logits, 1)
                labels = batch['labels']
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        avg_val_loss = total_val_loss / len(val_loader)
        accuracy = correct / total
        print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {avg_val_loss}, Accuracy: {accuracy:.4f}')
    
    
    # Save the model state
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'accuracy': accuracy
    }, 'saved_model.pth')


# Continue training the model
train(model, optimizer, scheduler, train_loader, val_loader, epochs=3)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}  # Problematic line
  item['labels'] = torch.tensor(self.labels[idx])
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1/3, Training Loss: 1.2187856373034025
Epoch 1/3, Validation Loss: 0.6821995292391095, Accuracy: 0.7600
Epoch 2/3, Training Loss: 0.7842186780875189
Epoch 2/3, Validation Loss: 0.40497059907232014, Accuracy: 0.8800
Epoch 3/3, Training Loss: 0.5061690036142081
Epoch 3/3, Validation Loss: 0.32033971164907726, Accuracy: 0.9000


In [12]:
# Load the saved model
checkpoint = torch.load('saved_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
epoch = checkpoint['epoch']
train_loss = checkpoint['train_loss']
val_loss = checkpoint['val_loss']
accuracy = checkpoint['accuracy']

train(model, optimizer, scheduler, train_loader, val_loader, epochs=3)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}  # Problematic line
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1/3, Training Loss: 0.43190269904178485
Epoch 1/3, Validation Loss: 0.32033971164907726, Accuracy: 0.9000
Epoch 2/3, Training Loss: 0.4283063441776393
Epoch 2/3, Validation Loss: 0.32033971164907726, Accuracy: 0.9000
Epoch 3/3, Training Loss: 0.43474215269088745
Epoch 3/3, Validation Loss: 0.32033971164907726, Accuracy: 0.9000


In [15]:
def predict_speed_band(road_name):
    # Tokenize and encode the road name using the same tokenizer as used during training
    encoded_road_name = tokenizer(road_name, padding=True, truncation=True, return_tensors='pt')

    # Perform inference
    with torch.no_grad():
        outputs = model(**encoded_road_name)

    # Obtain the predicted speed band
    predicted_speed_band = torch.argmax(outputs.logits).item()

    return predicted_speed_band

speed_bands_info = {
    1: "Speed range from 0 to 9",
    2: "Speed range from 10 to 19",
    3: "Speed range from 20 to 29",
    4: "Speed range from 30 to 39",
    5: "Speed range from 40 to 49",
    6: "Speed range from 50 to 59",
    7: "Speed range from 60 to 69",
    8: "Speed range from 70 or more"
}
    
# Assume 'road_name' is the road name you want to classify
road_name = "DRAYCOTT DRIVE"
predicted_speed_band = predict_speed_band(road_name)
print(f"The predicted speed band for road name '{road_name}' is '{predicted_speed_band}'.")

The predicted speed band for road name 'DRAYCOTT DRIVE' is '3'.
