In [1]:
# Fetch the dataset repository; the cells below read images from /content/CarInBikeLane.
# The captured output shows the download is roughly 846 MiB.
!git clone https://github.com/riya21parikh/CarInBikeLane.git

Cloning into 'CarInBikeLane'...
remote: Enumerating objects: 4184, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 4184 (delta 0), reused 0 (delta 0), pack-reused 4181 (from 2)[K
Receiving objects: 100% (4184/4184), 846.04 MiB | 31.17 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Updating files: 100% (4382/4382), done.


In [2]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
import tensorflow as tf
import keras
from keras import layers, models
from keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt
import matplotlib
import platform
import time
import PIL
from tqdm import tqdm
import pandas as pd
import re

In [3]:
# Target size, in pixels, that every image is resized to when it is loaded
# into an array (passed as `target_size` to `load_img`).
IMG_HEIGHT = 224
IMG_WIDTH = 224

# Data pre-processing

## Loading and concatenating + removing duplicates/corrupted images

In [4]:
# Three image sources inside the cloned repository.
root = "/content/CarInBikeLane/Method1/data/data"
root2 = "/content/CarInBikeLane/Method1/data/data2"
root3 = "/content/CarInBikeLane/Method2/Bikelanes"

# One folder per (source, class) pair.
blocked2 = os.path.join(root, "blocked2")
blocked3 = os.path.join(root2, "blocked3")
blocked = os.path.join(root3, "blocked")

notblocked2 = os.path.join(root, "notblocked2")
notblocked3 = os.path.join(root2, "notblocked3")
notblocked = os.path.join(root3, "notblocked")

# Group the folders by class, reusing the paths defined above instead of
# rebuilding them a second time.
blocked_folders = [blocked2, blocked3, blocked]
notblocked_folders = [notblocked2, notblocked3, notblocked]

# os.listdir returns entries in arbitrary order, so each listing is sorted to
# make the resulting lists (and everything derived from them) reproducible.
blocked_files = [
    os.path.join(folder, name)
    for folder in blocked_folders
    for name in sorted(os.listdir(folder))
]

notblocked_files = [
    os.path.join(folder, name)
    for folder in notblocked_folders
    for name in sorted(os.listdir(folder))
]

print("Total blocked images: ", len(blocked_files))
print("Total unblocked images: ", len(notblocked_files))

Total blocked images:  2351
Total unblocked images:  1831


In [5]:
def remove_duplicates(file_list):
    """Keep only the first path seen for each distinct file name.

    Two paths count as duplicates when their base names (the part after the
    last directory separator) are equal, regardless of the folder they live
    in. The relative order of the surviving paths is preserved and the input
    list is not modified.
    """
    first_path_by_name = {}
    for path in file_list:
        # setdefault only stores the path when the name has not been seen yet
        first_path_by_name.setdefault(os.path.basename(path), path)
    # dicts preserve insertion order, i.e. the order of first occurrences
    return list(first_path_by_name.values())

def corrupted(path):
    """Report whether the image at ``path`` fails Pillow's integrity check.

    Args:
        path: path of the image file to check.

    Returns:
        False when the file can be opened and ``verify()`` succeeds, True when
        opening or verifying raises (this includes a missing or unreadable
        file).
    """
    # Imported here so the check does not depend on some other import having
    # already loaded the PIL.Image submodule. Kept outside the try block so a
    # missing Pillow installation surfaces as an error instead of flagging
    # every file as corrupted.
    from PIL import Image

    try:
        # The context manager closes the underlying file handle even when
        # verify() raises.
        with Image.open(path) as img:
            img.verify()
        return False
    except Exception:
        # Best-effort check: any failure counts as corrupted, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return True

In [6]:
# Drop paths whose file name already appeared earlier in the same class.
blocked_files = remove_duplicates(blocked_files)
notblocked_files = remove_duplicates(notblocked_files)

print("Total blocked images (no duplicates): ", len(blocked_files))
print("Total unblocked images (no duplicates): ", len(notblocked_files))

# Collect every file that cannot be opened/verified as an image.
bad = [f for f in tqdm(blocked_files + notblocked_files) if corrupted(f)]

# Actually remove the corrupted files so later cells never try to copy or
# load them (previously they were only counted).
bad_lookup = set(bad)
blocked_files = [f for f in blocked_files if f not in bad_lookup]
notblocked_files = [f for f in notblocked_files if f not in bad_lookup]

print("\n")
print("corrupted files:", len(bad))

Total blocked images (no duplicates):  2231
Total unblocked images (no duplicates):  1721


100%|██████████| 3952/3952 [00:01<00:00, 2282.39it/s]



corrupted files: 0





## Data splitting (into folders to be used by models)

### Randomly 70/15/15

In [8]:
import os, random, shutil

# Root of the randomly split dataset, laid out as <BASE>/<split>/<class>/.
BASE = "/content/split_random"

classes = ["blocked", "notblocked"]

# Start from an empty tree so files from a previous run cannot linger.
if os.path.exists(BASE):
    shutil.rmtree(BASE)

for split in ("train", "val", "test"):
    for cls in classes:
        split_dir = os.path.join(BASE, split, cls)
        os.makedirs(split_dir, exist_ok=True)

def random_split(files, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Shuffle ``files`` and cut the result into train / val / test lists.

    The shuffle is done on a copy, so the caller's list keeps its order, and
    it uses a private generator so the global ``random`` state is untouched.

    Args:
        files: sequence of items (here: image paths) to split.
        train_ratio: fraction of the items assigned to the training list; the
            count is truncated with ``int()``.
        val_ratio: fraction of the items assigned to the validation list; the
            count is truncated with ``int()``.
        seed: seed for the shuffle, making the split reproducible. Pass
            ``None`` to get a different shuffle on every call.

    Returns:
        A ``(train, val, test)`` tuple of lists; ``test`` receives whatever is
        left after the first two cuts.
    """
    shuffled = list(files)
    random.Random(seed).shuffle(shuffled)

    n = len(shuffled)
    n_train = int(train_ratio * n)
    n_val = int(val_ratio * n)

    train = shuffled[:n_train]
    val = shuffled[n_train:n_train + n_val]
    test = shuffled[n_train + n_val:]

    return train, val, test


train_blocked, val_blocked, test_blocked = random_split(blocked_files)
train_notblocked, val_notblocked, test_notblocked = random_split(notblocked_files)

# (sub-folder under BASE, files to copy there), in the order the copies run.
copy_plan = (
    ("train/blocked", train_blocked),
    ("train/notblocked", train_notblocked),
    ("val/blocked", val_blocked),
    ("val/notblocked", val_notblocked),
    ("test/blocked", test_blocked),
    ("test/notblocked", test_notblocked),
)

for subdir, group in copy_plan:
    target_dir = os.path.join(BASE, subdir)
    for src in group:
        shutil.copy(src, target_dir)

print("DONE!")
print("Dataset ready at:", BASE)

DONE!
Dataset ready at: /content/split_random


### Split by camera (test only on camera 68 and 70/30 split for train/val)

In [11]:
import os, random, shutil

# Root of the camera-based split, laid out as <BASE>/<split>/<class>/.
BASE = "/content/split_camera"

classes = ["blocked", "notblocked"]

# Start from an empty tree so files from a previous run cannot linger.
if os.path.exists(BASE):
    shutil.rmtree(BASE)

for split in ("train", "val", "test"):
    for cls in classes:
        split_dir = os.path.join(BASE, split, cls)
        os.makedirs(split_dir, exist_ok=True)


def is_cam68(path):
    """Return True when the file name of ``path`` contains the text "cam68".

    Only the last path component is inspected; directory names are ignored.
    """
    _, filename = os.path.split(path)
    return filename.find("cam68") >= 0


# Everything whose file name is tagged cam68 is held out as the test set.
test_blocked      = [f for f in blocked_files if is_cam68(f)]
test_notblocked   = [f for f in notblocked_files if is_cam68(f)]

# The remaining files are the exact complement of the test set. Re-applying
# the predicate selects the same files as a membership scan of the test lists
# would, without scanning a list for every element.
blocked_remaining     = [f for f in blocked_files if not is_cam68(f)]
notblocked_remaining  = [f for f in notblocked_files if not is_cam68(f)]


def split_70_30(files, train_ratio=0.7, seed=42):
    """Shuffle ``files`` and cut the result into train and validation lists.

    The shuffle is done on a copy, so the caller's list keeps its order, and
    it uses a private generator so the global ``random`` state is untouched.

    Args:
        files: sequence of items (here: image paths) to split.
        train_ratio: fraction of the items assigned to the training list; the
            count is truncated with ``int()``. Defaults to the 70/30 split the
            function is named after.
        seed: seed for the shuffle, making the split reproducible. Pass
            ``None`` to get a different shuffle on every call.

    Returns:
        A ``(train, val)`` tuple of lists; ``val`` receives everything that is
        not in ``train``.
    """
    shuffled = list(files)
    random.Random(seed).shuffle(shuffled)

    n_train = int(train_ratio * len(shuffled))
    train = shuffled[:n_train]
    val = shuffled[n_train:]
    return train, val


train_blocked, val_blocked = split_70_30(blocked_remaining)
train_notblocked, val_notblocked = split_70_30(notblocked_remaining)

# (sub-folder under BASE, files to copy there), in the order the copies run.
copy_plan = (
    ("train/blocked", train_blocked),
    ("train/notblocked", train_notblocked),
    ("val/blocked", val_blocked),
    ("val/notblocked", val_notblocked),
    ("test/blocked", test_blocked),
    ("test/notblocked", test_notblocked),
)

for subdir, group in copy_plan:
    target_dir = os.path.join(BASE, subdir)
    for src in group:
        shutil.copy(src, target_dir)

# Summary of how many images landed in each split, per class.
print("DONE!")
print("Dataset created at:", BASE)
print()
print("blocked: train", len(train_blocked), "val", len(val_blocked), "test", len(test_blocked))
print("notblocked: train", len(train_notblocked), "val", len(val_notblocked), "test", len(test_notblocked))

DONE!
Dataset created at: /content/split_camera

blocked: train 1554 val 666 test 11
notblocked: train 1185 val 509 test 27


## Images as Arrays of RGB (if needed)

In [7]:
images = []
labels = []

# One pass per class: (progress message, paths to load, integer label).
# blocked -> 1, notblocked -> 0.
for banner, paths, label in (
    ("Loading blocked images", blocked_files, 1),
    ("Loading notblocked images", notblocked_files, 0),
):
    print(banner)
    for img_path in paths:
        # Resize on load so every array has the same shape.
        img = load_img(img_path, target_size=(IMG_HEIGHT, IMG_WIDTH))
        images.append(img_to_array(img))
        labels.append(label)

X = np.array(images)
y = np.array(labels)

# Scale pixel values to [0, 1] in place instead of allocating a second
# full-size copy of the array.
# NOTE(review): relies on img_to_array yielding floating-point arrays (Keras'
# default floatx) — confirm if the dtype setting was changed.
X /= 255.0

# 70% train, then the remaining 30% is halved into validation and test.
# Both cuts are stratified so each subset keeps the class proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

print("Train:", X_train.shape)
print("Val:",   X_val.shape)
print("Test:",  X_test.shape)

Loading blocked images
Loading notblocked images
Train: (2766, 224, 224, 3)
Val: (593, 224, 224, 3)
Test: (593, 224, 224, 3)


**Final data**:

- *blocked_files* and *notblocked_files* are lists of the full path to the images
- *split_random* is a folder holding a random train/val/test split of the previous lists of images, with one subfolder per class inside each split
- *split_camera* is a folder that splits the previous lists of images into train/val/test by separating images from Cam68 into test and all other cameras into train/val
- *X_train, X_val, X_test, y_train, y_val* and *y_test* are a stratified 70/15/15 split of the data after conversion into 224x224 RGB arrays (not used in baseline)