In [1]:
import os
from pathlib import Path
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedGroupKFold
from fastai.vision.all import *

import wandb
import warnings
warnings.filterwarnings("ignore")

In [2]:
class Config:
    """Static settings shared by the cells of this notebook."""
    WANDB_PROJECT = "mlops-course-001"
    # Set this to the team name when working in a team.
    ENTITY = None
    # Class index -> class name, numbered in the order listed here.
    BDD_CLASSES = dict(enumerate((
        'background', 'road', 'traffic light', 'traffic sign',
        'person', 'vehicle', 'bicycle',
    )))
    # Artifact names: the raw input dataset and the split dataset produced here.
    RAW_DATA_AT = 'bdd_simple_1k'
    PROCESSED_DATA_AT = 'bdd_simple_1k_split'


PARAMS = Config()

In [3]:
# Start a W&B run for this notebook in the configured project.
# PARAMS.ENTITY stays None unless a team name was set in Config;
# job_type labels this run as the data-splitting step.
run = wandb.init(
    project=PARAMS.WANDB_PROJECT,
    entity=PARAMS.ENTITY,
    job_type='data_split'
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msamu2505[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Fetch the ':latest' version of the raw dataset artifact through the run and
# download it; `path` is the local directory returned by download().
raw_data_at = run.use_artifact(f"{PARAMS.RAW_DATA_AT}:latest")
path = Path(raw_data_at.download())

[34m[1mwandb[0m: Downloading large artifact bdd_simple_1k:latest, 846.59MB. 4007 files... Done. 0:0:0.7


In [5]:
# Retrieve the "eda_table" entry stored in the raw artifact; it supplies the
# stratification labels and is later joined with the split information.
orig_eda_table = raw_data_at.get("eda_table")

[34m[1mwandb[0m: Downloading large artifact bdd_simple_1k:latest, 846.59MB. 4007 files... Done. 0:0:0.7


In [6]:
# Show the contents of the downloaded artifact directory.
# NOTE(review): `.ls()` is not a pathlib method; presumably it is patched onto
# Path by the fastai star import -- confirm.
path.ls()

(#5) [Path('artifacts/bdd_simple_1k:v2/images'),Path('artifacts/bdd_simple_1k:v2/LICENSE.txt'),Path('artifacts/bdd_simple_1k:v2/eda_table.table.json'),Path('artifacts/bdd_simple_1k:v2/labels'),Path('artifacts/bdd_simple_1k:v2/media')]

In [7]:
# Take the file names from the EDA table instead of os.listdir():
# listdir() returns entries in arbitrary order, so its result is not
# guaranteed to line up row-for-row with the label column read from that same
# table, which would silently pair images with the wrong stratification label.
# NOTE(review): assumes the EDA table keeps the names in a 'File_Name' column
# (the column name this notebook uses for its own split table) -- confirm.
fnames = orig_eda_table.get_column('File_Name')

# Fail loudly if the table references an image that is not in the download.
missing_files = set(fnames) - set(os.listdir(path/'images'))
assert not missing_files, (
    f"{len(missing_files)} file(s) listed in the EDA table are missing from {path/'images'}"
)

# Group key = the part of the file name before the first '-'; frames sharing
# that prefix are kept together in one fold to avoid leakage.
groups = [s.split('-')[0] for s in tqdm(fnames)]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# Stratification target: the 'bicycle' column of the EDA table.
# NOTE(review): the values must be in the same row order as `fnames` for the
# stratified split to be valid -- confirm.
y = orig_eda_table.get_column('bicycle')

Now, we'll split the data into train (80%), validation (10%) and test (10%) sets. As we do that, we need to be careful to:

   <li>avoid leakage: for that reason we're grouping data according to video identifier (we want to make sure our model can generalize to new cars or video frames)</li>
   
   <li>handle the label imbalance: for that reason we stratify data with our target column</li>
   
We will use sklearn's ```StratifiedGroupKFold``` to split the data into 10 folds and assign 1 fold for test, 1 for validation and the rest for training

In [9]:
# One row per image; fold == -1 means "no fold assigned yet".
df = pd.DataFrame({'File_Name': fnames, 'fold': -1})

In [10]:
# Split into 10 folds, stratified on `y` and grouped by `groups`.
cv = StratifiedGroupKFold(n_splits=10)
fold_splits = tqdm(cv.split(fnames, y, groups), desc='Creating folds', total=10)
for fold_id, (train_idx, holdout_idx) in enumerate(fold_splits):
    # Only the held-out indices matter here: tag those rows with the fold number.
    df.loc[holdout_idx, 'fold'] = fold_id

Creating folds:   0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# Map folds to stages: fold 0 -> test, fold 1 -> valid, every other fold -> train.
# The guard keeps this cell re-runnable: 'fold' is dropped at the end, so a
# second execution would otherwise fail when reading the missing column.
if 'fold' in df.columns:
    df['stage'] = df['fold'].map({0: 'test', 1: 'valid'}).fillna('train')
    # The fold number is only an intermediate; drop it so it is not exported.
    df = df.drop(columns='fold')

df.stage.value_counts()

train    800
valid    100
test     100
Name: stage, dtype: int64

In [12]:
# Write the split assignment (every current column of df, without the index)
# to a local CSV file.
df.to_csv('data_split.csv', index=False)

We'll now create a new artifact and add our data there

In [13]:
# New artifact that bundles the split CSV with the raw dataset files.
processed_data_at = wandb.Artifact(PARAMS.PROCESSED_DATA_AT, type='split_data')

processed_data_at.add_file('data_split.csv')  # the split assignment CSV
processed_data_at.add_dir(path)  # contents of the downloaded raw artifact directory

[34m[1mwandb[0m: Adding directory to artifact (./artifacts/bdd_simple_1k:v2)... Done. 2.2s


Finally, the split information may be relevant for our analyses - rather than uploading images again, we will save the split information to a new table and join it with EDA table we created previously.

In [15]:
# W&B table holding just the file name and its assigned stage.
data_split_table = wandb.Table(dataframe=df[['File_Name', 'stage']])

In [16]:
# Join the original EDA table with the split table on the file-name column.
# The key has to name a column present in both tables; the split table's
# column is 'File_Name', so the previous key "FileName" could never match.
# NOTE(review): assumes the EDA table's column is also 'File_Name' -- confirm.
join_table = wandb.JoinedTable(orig_eda_table, data_split_table, "File_Name")

In [17]:
# Store the joined table in the artifact under the name "eda_table_split".
processed_data_at.add(join_table, "eda_table_split")

<ManifestEntry digest: 0n/ZULfus5x9ayq8LDiIIA==>

In [18]:
# Log the finished artifact to the run, then close the run.
run.log_artifact(processed_data_at)
run.finish()

VBox(children=(Label(value='846.067 MB of 846.067 MB uploaded (846.005 MB deduped)\r'), FloatProgress(value=1.…