# Experiment all data to dichotomous classificationabs

## Data

- use augmented data:
  - rotated: all
  - shear: all
  - flip: all
- classify:
  - leaves
  - non-leaves

First group the original data (not augmented) into

- Training data
- Validation data
- Testing data

Then augmented data from...

- training data is used for training.
- validation data is used for training.
- test data is not used.

| data | augmented? | usage |
| :-- | :-: | :-: |
| train original              | no  | training |
| train original -> augmented | yes | training |
| val original                | no  | validation |
| val original -> augmented   | yes | training |
| test original               | no  | testing |
| test original -> augmented  | yes | not used |



In [20]:
import os
from pathlib import Path
import random
from pprint import pprint
import json
import shutil

import pandas as pd
import torch
from tqdm.notebook import tqdm

from exp_utils import PointnetPath, PointCloudTable
from rot_shr_flip_dichot import main as run_pn

path_obj = PointnetPath("rot-shr-flip-dichot")

_ = [
    print(att,": ", getattr(path_obj, att))
    for att in dir(path_obj)
    if not att.startswith("__")
    and att != "where_am_i"
]

DATA_ROOT :  /home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data
HERE :  /home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/pointnet-pytorch-jupyter/experiments
PN_REPO :  /home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/pointnet-pytorch-jupyter
QW_REPO :  /home/kuwaharah436/Documents/paprika-paper-2024/qwa-work
data_dir :  /home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/pointnet-pytorch-jupyter/data/rot-shr-flip-dichot
log_dir :  rot-shr-flip-dichot
log_root :  /home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/pointnet-log


## Data Prep

- `qwa-work/`
  - `data/`
    - `split-data/`: from here
  - `pointnet-pytorch-jupyter/`
    - `data/`
      - `template/`: to here
        - `paprika/`
        - `train_test_split/`
        - `synsetoffset2category.txt`

In [2]:
RAW_PATH = path_obj.DATA_ROOT / "split-data"
assert RAW_PATH.exists()
RAW_PATH#, list(RAW_PATH.iterdir())

PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data')

In [3]:
AUG_PATH = path_obj.DATA_ROOT / "augmented"
assert AUG_PATH.exists()
AUG_PATH#, list(AUG_PATH.iterdir())

PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/augmented')

### txt file clumns

| 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|
|`x`|`y`|`z`|`nx`|`ny`|`nz`|`label`: 0\|1|

label

- `0`: non-leaf
- `1`: leaf

In [4]:
# Only columns below are left
# Order sensitive; be carefull
export_cols = ["x", "y", "z", "nx", "ny", "nz", "label"]

In [5]:
with open(
    path_obj.DATA_ROOT / "complete-data" / "labels.json", 'r', encoding="utf-8"
) as js_f:
    labels = json.load(js_f)
label_map = {
    labels["marker"]: 0,
    labels["rod"]   : 0,
    labels["stem"]  : 0,
    labels["fruit"] : 0,
    labels["leaves"]: 1,
}
pprint(labels)
pprint(label_map)

{'fruit': 4, 'leaves': 2, 'leaves-fruit': 24, 'marker': 0, 'rod': 1, 'stem': 3}
{0: 0, 1: 0, 2: 1, 3: 0, 4: 0}


### Read and Write Data

### labels

In [6]:
with open(
    path_obj.DATA_ROOT / "complete-data" / "labels.json", "r", encoding="utf-8"
) as js_f:
    labels = json.load(js_f)
pprint(labels)

{'fruit': 4, 'leaves': 2, 'leaves-fruit': 24, 'marker': 0, 'rod': 1, 'stem': 3}


RAW_PATH / paprika id

In [7]:
raw_pprkid = list(RAW_PATH.iterdir())
aug_pprkid = list(AUG_PATH.iterdir())
pprint(raw_pprkid)
pprint(aug_pprkid)

[PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Nagano_0209_2'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Nagano_0316_2'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Artega_0309_1'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Nagano_0309_2'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Nesditt_0316_2'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Nesditt_0209_2'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Trirosso_0209_2'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Nesditt_0316_1'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/split-data/Trirosso_0309_1'),
 PosixPath('/home/kuwaharah436/Documents/paprika-paper-2024/qwa-work/data/s

In [8]:
raw_file_paths = [] # qwa-work/data/split-data/<paprika id>/_*.txt
for pprk_dir in RAW_PATH.iterdir():
    raw_file_paths = raw_file_paths + list(pprk_dir.iterdir())
#raw_file_paths[:10]
len(raw_file_paths)

6418

In [9]:
aug_file_paths = [] # qwa-work/data/augmented/<paprika id>/*.txt
for pprk_dir in AUG_PATH.iterdir():
    aug_file_paths = aug_file_paths + list(pprk_dir.iterdir())
# aug_file_paths[:10]
len(aug_file_paths)

7387118

original data to augmented data ratio

In [10]:
len(aug_file_paths) / len(raw_file_paths)

1151.0

### Move data

Convert data from `/data/split-data` and write to `/pointnet-pytorch-jupyter/data/paprika` (path_obj.data_dir / "paprika").

In [11]:
for f_path in raw_file_paths:
    # skip json
    if f_path.suffix != ".txt":
        continue

    # read and prep data
    raw_df = pd.read_table(f_path, header=0, sep=" ")
    cleaned_df = PointCloudTable.prep(
        raw_df, export_cols, label_map
    )
    if cleaned_df is None:
        continue

    # write
    out_path = path_obj.data_dir / "paprika" / (
        f_path.parent.name + f_path.name
    )
    out_path.parent.mkdir(exist_ok=True, parents=True)
    cleaned_df.to_csv(
        out_path, sep=" ", header=False, index=False
    )

### Train Test Validation split

In [11]:
train_ratio = 0.7
test_ratio  = 0.2
val_ratio   = 0.1

ratio_sum = (train_ratio + test_ratio + val_ratio)
train_ratio = train_ratio / ratio_sum
test_ratio = test_ratio / ratio_sum
val_ratio = val_ratio / ratio_sum
train_ratio, test_ratio, val_ratio

(0.7000000000000001, 0.20000000000000004, 0.10000000000000002)

Data written to `path_obj.data_dir / "paprika"` doesn't include invalid data, i.e. data with `leaves-fruit` label.

Choose train, test, and validation data and copy in the corresponding augmented training/validation data.

In [12]:
train_data = []
test_data = []
val_data = []

txt_ls = os.listdir(path_obj.data_dir / "paprika")
txt_ls = [
    "shape_data/paprika/" + fname.strip(".tx") for fname in txt_ls
    # if "rot" not in fname and "flip" not in fname and "shear" not in fname
]
random.shuffle(txt_ls)
ls_len = len(txt_ls)
train_test_idx = round(ls_len * train_ratio)
test_val_idx = train_test_idx + round(ls_len * test_ratio)
train_data = train_data + txt_ls[:train_test_idx]
test_data = test_data + txt_ls[train_test_idx:test_val_idx]
val_data = val_data + txt_ls[test_val_idx:]
len(train_data), len(test_data), len(val_data)

(3159354, 902673, 451336)

In [13]:
txt_ls[:3], train_data[:3], test_data[:3], val_data[:3]

(['shape_data/paprika/Nagano_0316_2_flip-xyz_shear20_136',
  'shape_data/paprika/Artega_0209_2_rot60_flip-xyz_shear35_312',
  'shape_data/paprika/Artega_0209_2_rot195_flip-xyz_shear35_290'],
 ['shape_data/paprika/Nagano_0316_2_flip-xyz_shear20_136',
  'shape_data/paprika/Artega_0209_2_rot60_flip-xyz_shear35_312',
  'shape_data/paprika/Artega_0209_2_rot195_flip-xyz_shear35_290'],
 ['shape_data/paprika/Artega_0209_2_rot285_shear45_182',
  'shape_data/paprika/Trirosso_0309_1_rot135_shear50_446',
  'shape_data/paprika/Nagano_0316_1_rot330_flip-x_shear10_324'],
 ['shape_data/paprika/Nesditt_0316_1_rot45_shear10_88',
  'shape_data/paprika/Artega_0209_2_rot90_flip-xyz_shear35_194',
  'shape_data/paprika/Nesditt_0316_1_rot345_flip-xyz_shear10_398'])

35 hrs to run.

In [25]:
for f_path in tqdm(aug_file_paths):
    pprk_id = f_path.parent.name
    split_id = f_path.stem.split("_")[-1]

    # # if failed and rerunning this cell
    # if (
    #     path_obj.data_dir / "paprika" / (pprk_id + "_" + f_path.name)
    # ).exists():
    #     continue

    # don't use augmented data for/from testing
    if f"shape_data/paprika/{pprk_id}_{split_id}" in test_data:
        continue

    # convert data
    aug_df = pd.read_table(f_path, header=0, sep=" ")
    cleaned_df = PointCloudTable.prep(
        aug_df, export_cols, label_map
    )
    if cleaned_df is None: # skip invalid label
        continue

    # data should be in train_data or val_data
    fname = pprk_id + "_" + f_path.name
    if not f"shape_data/paprika/{pprk_id}_{split_id}" in val_data:
        if not f"shape_data/paprika/{pprk_id}_{split_id}" in train_data:
            raise ValueError(
                f"Data absent from train, test, and val:{f_path}"
            )

    # write
    cleaned_df.to_csv(
        path_obj.data_dir / "paprika" / fname,
        sep=" ",
        header=False,
        index=False
    )
    train_data.append(f"shape_data/paprika/{Path(fname).stem}")

  0%|          | 0/7387118 [00:00<?, ?it/s]

In [26]:
train_date = list(set(train_data))
random.shuffle(train_data)
len(train_data), len(test_data), len(val_data)

(4074399, 902673, 451336)

In [27]:
train_data[:3], test_data[:3], val_data[:3]

(['shape_data/paprika/Artega_0209_2_rot75_shear5_73',
  'shape_data/paprika/Nesditt_0309_3_rot150_shear10_312',
  'shape_data/paprika/Nesditt_0316_1_rot330_flip-x_shear40_355'],
 ['shape_data/paprika/Artega_0209_2_rot285_shear45_182',
  'shape_data/paprika/Trirosso_0309_1_rot135_shear50_446',
  'shape_data/paprika/Nagano_0316_1_rot330_flip-x_shear10_324'],
 ['shape_data/paprika/Nesditt_0316_1_rot45_shear10_88',
  'shape_data/paprika/Artega_0209_2_rot90_flip-xyz_shear35_194',
  'shape_data/paprika/Nesditt_0316_1_rot345_flip-xyz_shear10_398'])

In [28]:
split_dir = path_obj.data_dir / "train_test_split"
split_dir.mkdir(exist_ok=True, parents=False)
with open(split_dir / "shuffled_train_file_list.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=4)
with open(split_dir / "shuffled_test_file_list.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f, indent=4)
with open(split_dir / "shuffled_val_file_list.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, indent=4)

In [29]:
with open(path_obj.data_dir / "synsetoffset2category.txt", "w", encoding="utf-8") as f:
    f.write("paprika    paprika")

## Training/Testing

```bash
python3 rot_shr_flip_dichot.py -g <gpu id>
```

| epoch | Training acc | test acc | mIoU    | leaves IoU | non-leaves IoU |
| :---: | :----------: | :------: | :-----: | :--------: | :------------: |
| 10    | 0.88205      | 0.88663  | 0.73971 | 0.9799301  | 0.72474459     |
| 20    | 0.89357      | 0.90619  | 0.75366 | 0.953014   | 0.8249411      |
| 30    | 0.89529      | 0.897658 | 0.74596 | 0.9782382  | 0.7585358258   |


In [19]:
chk_path = path_obj.log_root / "part_seg" / path_obj.log_dir / "checkpoints"
for pt in chk_path.iterdir():
    chk = torch.load(pt)
    print(f"epoch: {chk['epoch']}")
    print(f"  train acc: {chk['train_acc']}")
    print(f"  test acc : {chk['test_acc']}")
    print(f"  mIoU     : {chk['instance_avg_iou']}")

epoch: 0
  train acc: 0.8952892342765766
  test acc : 0.8976525369608859
  mIoU     : 0.7453994983106933
epoch: 0
  train acc: 0.8935708145885936
  test acc : 0.9057473522388506
  mIoU     : 0.7529464878076988
epoch: 0
  train acc: 0.8820471640919566
  test acc : 0.8867728330798501
  mIoU     : 0.7396048878621354
epoch: 0
  train acc: 0.8952892342765766
  test acc : 0.8976525369608859
  mIoU     : 0.7453994983106933


In [None]:
# check gpu usage to set gpu_idx
!nvidia-smi

In [None]:
# gpu_idx = "6" # str

# seg_classes = {
#     "paprika": [0, 1],
#     # 0: non-leaves
#     # 1: leaves

#     # padding for 2:49
#     'Earphone'  : [16, 17, 18],
#     'Motorbike' : [30, 31, 32, 33, 34, 35],
#     'Rocket'    : [41, 42, 43],
#     'Car'       : [8, 9, 10, 11],
#     'Laptop'    : [28, 29],
#     'Cap'       : [6, 7],
#     'Skateboard': [44, 45, 46],
#     'Mug'       : [36, 37],
#     'Guitar'    : [19, 20, 21],
#     'Bag'       : [2, 3, 4, 5],
#     'Lamp'      : [24, 25, 26, 27],
#     'Table'     : [47, 48, 49],
#     'Pistol'    : [38, 39, 40],
#     'Chair'     : [12, 13, 14, 15],
#     'Knife'     : [22, 23]
# }

# train_args = {
#     # model params
#     "model"     : "pointnet2_part_seg_msg",
#     # "model"    : "pointnet2_part_seg_ssg",

#     # data params
#     "normal"    : True,
#     "log_root"  : path_obj.log_root,
#     "log_dir"   : path_obj.log_dir,
#     "data_dir"  : path_obj.data_dir,

#     # training params
#     "gpu"       : gpu_idx,
#     # "npoint"    : 2048,
#     # "batch_size": 16,
#     # "decay_rate": 1e-4,
#     # "step_size" : 20,
#     # "lr_decay"  : 0.5,
#     #"epoch"      : 500,
#     "epoch": 10
#     # "optimizer" : "Adam",
# }
# test_args = {
#     # data params
#     "normal"    : True,
#     "log_root"  : path_obj.log_root,
#     "log_dir"   : path_obj.log_dir,
#     "data_dir"  : path_obj.data_dir,

#     # testing params
#     "gpu"       : gpu_idx,
#     # "num_points": 2048,
#     # "batch_size": 24,
#     # "num_votes" : 3,
# }

In [None]:
# Might be a good idea to run the python script from the shell
!python3 rot_shr_flip_dichot.py -g 7

# met, iou, acc = run_pn(
#     gpu_idx,
#     path_obj,
#     seg_classes=seg_classes,
#     train_args=train_args,
#     test_args=test_args
# )
# pprint(f"test metrics: {met}")
# print(f"iou: {iou}")
# pprint(f"acc: {acc}")