In [None]:
# Mount Google Drive at /content/drive (Colab-only module) so the project
# files referenced by the path constants in later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Project folder on Drive; edit this if the shortcut lives somewhere else.
BASE = "/content/drive/MyDrive/255-GroupProject"

# Input: the training split in JSONL form.
TRAIN_JSONL = BASE + "/FindVehicle_train.jsonl"
# Output: the token-level CSV generated from the training split.
OUT_CSV = BASE + "/FindVehicle_train.csv"



In [None]:
import json, csv
from pathlib import Path

def jsonl_to_token_csv(in_path, out_path):
    """
    Convert a JSONL NER file into a token-level CSV.

    Writes a CSV with columns: Description, token, tag. There is one row per
    whitespace token, and the full sentence is repeated in Description.
    tag = "0" for non-entity tokens; entity spans become B/I/E-<type>
    (a single-token span is tagged B-<type>).

    Args:
        in_path: path to the input JSONL file (one JSON object per line;
            blank lines are ignored).
        out_path: path of the CSV to write. Missing parent directories are
            created. The file is written as UTF-8 with a BOM ("utf-8-sig").

    Assumes:
      ex["data"] is the sentence
      ex["ner_label"] entries look like:
        [etype, char_start, char_end, surface, tok_start, tok_end, variants]
      where token span is [tok_start, tok_end) in token indices.
      Dict-shaped labels are also accepted, using the keys
      "type"/"etype", "tok_start"/"ts" and "tok_end"/"te".
    Tokenization: text.split() (assumes punctuation in the data is already
    separated by spaces).

    Labels with an unknown shape, missing fields, or an empty/inverted span
    are skipped; token indices outside the sentence are ignored.
    """
    in_path = Path(in_path)
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with in_path.open("r", encoding="utf-8") as fin, \
         out_path.open("w", newline="", encoding="utf-8-sig") as fout:
        writer = csv.DictWriter(fout, fieldnames=["Description", "token", "tag"])
        writer.writeheader()

        for line in fin:
            line = line.strip()
            if not line:
                continue
            ex = json.loads(line)

            text = ex["data"]
            tokens = text.split()
            tags = ["0"] * len(tokens)  # default tag is the string "0"

            # `or []` also covers an explicit null "ner_label" in the JSON,
            # which .get(..., []) would hand back as None.
            for lbl in ex.get("ner_label") or []:
                # Expected list format; includes token start/end
                # [etype, char_start, char_end, surface, tok_start, tok_end, variants]
                if isinstance(lbl, list) and len(lbl) >= 6:
                    etype = lbl[0]
                    ts = lbl[4]
                    te = lbl[5]
                elif isinstance(lbl, dict):
                    etype = lbl.get("type") or lbl.get("etype")
                    # Compare against None explicitly: token index 0 is a
                    # valid start and must not be treated as "missing".
                    ts = lbl.get("tok_start")
                    if ts is None:
                        ts = lbl.get("ts")
                    te = lbl.get("tok_end")
                    if te is None:
                        te = lbl.get("te")
                else:
                    continue

                if etype is None or ts is None or te is None:
                    continue

                span_len = te - ts
                if span_len <= 0:
                    continue

                if span_len == 1:
                    # Single-token entity: only a B- tag is emitted.
                    if 0 <= ts < len(tags):
                        tags[ts] = f"B-{etype}"
                else:
                    if 0 <= ts < len(tags):
                        tags[ts] = f"B-{etype}"
                    for i in range(ts + 1, te - 1):
                        if 0 <= i < len(tags):
                            tags[i] = f"I-{etype}"
                    if 0 <= te - 1 < len(tags):
                        tags[te - 1] = f"E-{etype}"

            # one row per token
            for tok, tag in zip(tokens, tags):
                writer.writerow({
                    "Description": text,
                    "token": tok,
                    "tag": tag
                })

# Convert the training split, then report where the CSV was written.
jsonl_to_token_csv(in_path=TRAIN_JSONL, out_path=OUT_CSV)
print("Wrote:", OUT_CSV)


Wrote: /content/drive/MyDrive/255-GroupProject/FindVehicle_train.csv


In [None]:
# --- 1) Mount Drive ---
# Exposes Google Drive under /content/drive so the paths below resolve.
# When Drive is already mounted, this call only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

# --- 2) Paths (edit BASE if your shortcut path is different) ---
BASE = "/content/drive/MyDrive/255-GroupProject"
# Input: the test split in JSONL form.
IN_TEST_JSONL = BASE + "/FindVehicle_test.jsonl"
# Output: the token-level CSV generated from the test split.
OUT_TEST_CSV = BASE + "/FindVehicle_test.csv"

# --- 3) Converter (reuse for any split) ---
import json, csv
from pathlib import Path

def jsonl_to_token_csv(in_path, out_path):
    """
    Convert a JSONL NER file into a token-level CSV (usable for any split).

    Writes a CSV with columns: Description, token, tag. There is one row per
    whitespace token, and the full sentence is repeated in Description.
    tag = "0" for non-entity tokens; entity spans become B/I/E-<type>.
    A single-token span is tagged B-<type> only, so every split converted
    with this function uses the same label scheme.

    Args:
        in_path: path to the input JSONL file (one JSON object per line;
            blank lines are ignored).
        out_path: path of the CSV to write. Missing parent directories are
            created. The file is written as UTF-8 with a BOM ("utf-8-sig").

    Each example must have ex["data"] (the sentence) and may have
    ex["ner_label"], whose entries are either lists shaped like
      [etype, char_start, char_end, surface, tok_start, tok_end, ...]
    or dicts with keys "type"/"etype", "tok_start"/"ts", "tok_end"/"te".
    The token span is [tok_start, tok_end) over text.split() tokens.

    Labels with an unknown shape, missing fields, or an empty/inverted span
    are skipped; token indices outside the sentence are ignored.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(in_path, "r", encoding="utf-8") as fin, \
         open(out_path, "w", newline="", encoding="utf-8-sig") as fout:
        writer = csv.DictWriter(fout, fieldnames=["Description", "token", "tag"])
        writer.writeheader()

        for line in fin:
            line = line.strip()
            if not line:
                continue
            ex = json.loads(line)
            text = ex["data"]
            tokens = text.split()              # assumes punctuation is already spaced
            tags = ["0"] * len(tokens)         # default tag = "0"
            n_tokens = len(tags)

            # Apply NER spans if present. `or []` also covers an explicit
            # null "ner_label", which .get(..., []) would return as None.
            for lbl in ex.get("ner_label") or []:
                # Expected list: [etype, char_start, char_end, surface, tok_start, tok_end, ...]
                if isinstance(lbl, list) and len(lbl) >= 6:
                    etype, ts, te = lbl[0], lbl[4], lbl[5]
                elif isinstance(lbl, dict):     # fallback for dict labels
                    etype = lbl.get("type") or lbl.get("etype")
                    # Explicit None checks so a start index of 0 is kept.
                    ts = lbl.get("tok_start")
                    if ts is None:
                        ts = lbl.get("ts")
                    te = lbl.get("tok_end")
                    if te is None:
                        te = lbl.get("te")
                else:
                    continue
                if etype is None or ts is None or te is None or te <= ts:
                    continue

                # BIOE tagging. Every index is bounds-checked so a bad span
                # can neither raise IndexError nor wrap around via a
                # negative index.
                if 0 <= ts < n_tokens:
                    tags[ts] = f"B-{etype}"
                last = te - 1
                if last > ts:
                    # Multi-token span: interior tokens get I-, the final
                    # token gets E-. For a single-token span (last == ts)
                    # the B- tag above is left in place.
                    for i in range(ts + 1, last):
                        if 0 <= i < n_tokens:
                            tags[i] = f"I-{etype}"
                    if 0 <= last < n_tokens:
                        tags[last] = f"E-{etype}"

            for tok, tag in zip(tokens, tags):
                writer.writerow({"Description": text, "token": tok, "tag": tag})

# --- 4) Run for TEST ---
# Convert the test split, then report where the CSV was written.
jsonl_to_token_csv(in_path=IN_TEST_JSONL, out_path=OUT_TEST_CSV)
print("✅ Wrote:", OUT_TEST_CSV)

# (Optional) peek: display the first 25 rows of the freshly written CSV
import pandas as pd
pd.read_csv(OUT_TEST_CSV).head(n=25)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Wrote: /content/drive/MyDrive/255-GroupProject/FindVehicle_test.csv


Unnamed: 0,Description,token,tag
0,Let the wise man assist me to find out the Sil...,Let,0
1,Let the wise man assist me to find out the Sil...,the,0
2,Let the wise man assist me to find out the Sil...,wise,0
3,Let the wise man assist me to find out the Sil...,man,0
4,Let the wise man assist me to find out the Sil...,assist,0
5,Let the wise man assist me to find out the Sil...,me,0
6,Let the wise man assist me to find out the Sil...,to,0
7,Let the wise man assist me to find out the Sil...,find,0
8,Let the wise man assist me to find out the Sil...,out,0
9,Let the wise man assist me to find out the Sil...,the,0
