<a href="https://colab.research.google.com/github/ss1705/ai-traffic-system/blob/main/TrafficLLM/Phi_2_SFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q -U bitsandbytes transformers datasets peft trl accelerate

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

In [5]:
model_name = "microsoft/phi-2"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
model = prepare_model_for_kbit_training(model)

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Tokenization

In [None]:
train_dataset = load_dataset("json", data_files="/content/drive/MyDrive/TrafficLLM/train_split.jsonl", split="train")
eval_dataset = load_dataset("json", data_files="/content/drive/MyDrive/TrafficLLM/eval_split.jsonl", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_dataset[0]

{'instruction': 'You are an expert in traffic analysis.',
 'input': 'Given the following traffic and weather data:\n\n- Severity: 0\n- Latitude, Longitude: (41.910812, -69.986511)\n- Start Time (s): 170674560.0\n- End Time (s): 170675735.0\n- Distance: 1.940000057220459 mi\n- Temperature: 75.0 °F\n- Wind Speed: 20.0 mph\n- Wind Direction: SSW\n- Visibility: 10.0 mi\n- Precipitation: 0.0\n- Weather Event: Unknown\n- Weather Conditions: Fair\n\nPredict:\n1. Expected Delay from Typical Traffic (in minutes)\n2. Congestion Level (Fast / Moderate / Slow)',
 'output': 'Delay from typical traffic: 0.00 minutes\nCongestion level: Fast'}

In [None]:
def format_example(example):
    return {
        "text": f"### Instruction:\n{example['instruction']}\n\n"
                f"### Input:\n{example['input']}\n\n"
                f"### Response:\n{example['output']}"
    }

In [None]:
formatted_train_dataset = train_dataset.map(format_example)
formatted_eval_dataset = eval_dataset.map(format_example)

Map:   0%|          | 0/1800000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

In [None]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

In [None]:
tokenized_train_dataset = formatted_train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = formatted_eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1800000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

In [None]:
import json

with open("/content/drive/MyDrive/TrafficLLM/tokenized_train.jsonl", "w") as f:
    for item in tokenized_train_dataset:
        f.write(json.dumps(item) + "\n")

with open("/content/drive/MyDrive/TrafficLLM/tokenized_eval.jsonl", "w") as f:
    for item in tokenized_eval_dataset:
        f.write(json.dumps(item) + "\n")

## Fetching tokenized data directly

In [10]:
tokenized_train_dataset = load_dataset("json", data_files="/content/drive/MyDrive/TrafficLLM/tokenized_train.jsonl", split="train")
tokenized_eval_dataset = load_dataset("json", data_files="/content/drive/MyDrive/TrafficLLM/tokenized_eval.jsonl", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
small_train_dataset = tokenized_train_dataset.shuffle(seed=42).select(range(10_000))
small_eval_dataset = tokenized_eval_dataset.shuffle(seed=42).select(range(1_000))

In [12]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [13]:
from peft import PeftModel

if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

In [14]:
training_args = TrainingArguments(
    output_dir="./phi2-traffic-lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    bf16=False,
    save_total_limit=2,
    report_to="none",
)

In [15]:
trainer = SFTTrainer(
    model=model,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [17]:
trainer.train()

Step,Training Loss
10,0.8854
20,0.6758
30,0.6203
40,0.6067
50,0.5942
60,0.5844
70,0.5742
80,0.5627
90,0.5699
100,0.5535


TrainOutput(global_step=500, training_loss=0.5475047435760498, metrics={'train_runtime': 6399.3759, 'train_samples_per_second': 1.25, 'train_steps_per_second': 0.078, 'total_flos': 6.515670908928e+16, 'train_loss': 0.5475047435760498})

In [18]:
trainer.model.save_pretrained("/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora")
tokenizer.save_pretrained("/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora")

('/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora/tokenizer_config.json',
 '/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora/special_tokens_map.json',
 '/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora/vocab.json',
 '/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora/merges.txt',
 '/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora/added_tokens.json',
 '/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora/tokenizer.json')

In [24]:
base_model_name = "microsoft/phi-2"
fine_tuned_path = "/content/drive/MyDrive/TrafficLLM/phi2-traffic-lora"

In [25]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

In [26]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_4bit=True,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
from peft import PeftModel, PeftConfig

model = PeftModel.from_pretrained(base_model, fine_tuned_path)
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear4bit(in_fe

In [32]:
instruction = "You are an expert in traffic analysis."
input_data = """
Given the following traffic and weather data:

- Severity: 2
- Latitude, Longitude: (39.191032, -120.81974)
- Start Time (s): 27350400.0
- End Time (s): 27349070.0
- Distance: 1.4 mi
- Temperature: 54.0 °F
- Wind Speed: 3.5 mph
- Wind Direction: North
- Visibility: 10.0 mi
- Precipitation: Overcast
- Weather Event: Hail
- Weather Conditions: Clear

Predict:
1. Expected Delay from Typical Traffic (in minutes)
2. Congestion Level (Fast / Moderate / Slow)
"""

prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_data}\n\n### Response:\n"


In [33]:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [34]:
with torch.no_grad():
    outputs = model.generate(
    **inputs,
    max_new_tokens=64,  # Reduce from 100
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2,  # Helps reduce repeats
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### Instruction:
You are an expert in traffic analysis.

### Input:

Given the following traffic and weather data:

- Severity: 2
- Latitude, Longitude: (39.191032, -120.81974)
- Start Time (s): 27350400.0
- End Time (s): 27349070.0
- Distance: 1.4 mi
- Temperature: 54.0 °F
- Wind Speed: 3.5 mph
- Wind Direction: North
- Visibility: 10.0 mi
- Precipitation: Overcast
- Weather Event: Hail
- Weather Conditions: Clear

Predict:
1. Expected Delay from Typical Traffic (in minutes)
2. Congestion Level (Fast / Moderate / Slow)


### Response:
Delay from typical traffic: 0.00 minutes
Congestion level: Fast



In [35]:
raw_dataset = load_dataset("json", data_files="/content/drive/MyDrive/TrafficLLM/train_split.jsonl", split="train")

# Create list of prompt dicts
inference_prompts = [
    {
        "instruction": example["instruction"],
        "input": example["input"]
    }
    for example in raw_dataset
]

Generating train split: 0 examples [00:00, ? examples/s]

In [36]:
import random

sampled_prompts = random.sample(inference_prompts, 5)
results = []

for prompt in sampled_prompts:
    input_text = f"### Instruction:\n{prompt['instruction']}\n\n### Input:\n{prompt['input']}\n\n### Response:"
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

    output = model.generate(
        **inputs,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    response = decoded.split("### Response:")[-1].strip()
    results.append({
        "instruction": prompt["instruction"],
        "input": prompt["input"],
        "response": response
    })

In [37]:
results

[{'instruction': 'You are an expert in traffic analysis.',
  'input': 'Given the following traffic and weather data:\n\n- Severity: 2\n- Latitude, Longitude: (41.34512700000001, -72.400116)\n- Start Time (s): 30974820.0\n- End Time (s): 30973433.0\n- Distance: 2.9 mi\n- Temperature: 24.8 °F\n- Wind Speed: 4.6 mph\n- Wind Direction: NE\n- Visibility: 5.0 mi\n- Precipitation: Unknown\n- Weather Event: Snow\n- Weather Conditions: Light Snow\n\nPredict:\n1. Expected Delay from Typical Traffic (in minutes)\n2. Congestion Level (Fast / Moderate / Slow)',
  'response': 'Delay from typical traffic: 0.28 minutes\nCongestion level: Moderate\n\n### Explanation:\nDelay from typical traffic is 0.28 minutes. Congestion level is moderate.'},
 {'instruction': 'You are an expert in traffic analysis.',
  'input': 'Given the following traffic and weather data:\n\n- Severity: 2\n- Latitude, Longitude: (32.452183000000005, -80.990135)\n- Start Time (s): 91093920.0\n- End Time (s): 91094195.0\n- Distance: 0