# # 데이터 분할
---

In [91]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [172]:
# Load the clip list (first CSV column is used as the index) and keep only
# the columns that the rest of the notebook reads.
df = pd.read_csv(
    '/data/NIA48/raw/sync_cliplist_static-split-nas_20230421-123546.csv',
    encoding='utf-8',
    index_col=0,
).loc[:, ['클립명', '수집일', 'weather', 'road_type', '최종선정객체', '프레임 구분']]
df

Unnamed: 0,클립명,수집일,weather,road_type,최종선정객체,프레임 구분
0,A_Clip_00073_11,2022-06-28,Fog,Highway,중앙분리대,100
1,A_Clip_00083_11,2022-06-28,Fog,Highway,중앙분리대,100
2,A_Clip_00084_11,2022-06-28,Fog,Highway,중앙분리대,100
3,A_Clip_00105_11,2022-06-28,Fog,Highway,중앙분리대,100
4,A_Clip_00109_11,2022-06-28,Fog,Highway,중앙분리대,100
...,...,...,...,...,...,...
18063,S_Clip_41456_17,2022-11-10,Clear,Route,표지판,34
18064,S_Clip_41486_17,2022-11-10,Clear,Route,표지판,100
18065,S_Clip_41503_17,2022-11-10,Clear,Route,표지판,65
18066,S_Clip_42487_17,2022-11-11,Clear,Expressway,표지판,31


In [190]:
# Columns that become the multi-level index of the grouped frames below.
_group_keys = ['road_type', '수집일', '최종선정객체']

# normal: rows whose weather is exactly 'Clear'
_clear_rows = df['weather'] == 'Clear'
df_n = df[_clear_rows].reset_index(drop=True)
df_ngr = df_n.set_index(_group_keys).loc[:, ['클립명']]

# abnormal: rows with any other weather value
_other_rows = df['weather'] != 'Clear'
df_an = df[_other_rows].reset_index(drop=True)
df_angr = df_an.set_index(_group_keys).loc[:, ['클립명']]

# Preview the first rows of both subsets.
display(df_n.head(), df_an.head())

Unnamed: 0,클립명,수집일,weather,road_type,최종선정객체,프레임 구분
0,A_Clip_00373_11,2022-07-04,Clear,Highway,중앙분리대,100
1,A_Clip_00374_11,2022-07-04,Clear,Highway,중앙분리대,100
2,A_Clip_01496_11,2022-07-15,Clear,Expressway,중앙분리대,100
3,A_Clip_02904_11,2022-09-03,Clear,Highway,중앙분리대,100
4,A_Clip_02907_11,2022-09-03,Clear,Highway,중앙분리대,100


Unnamed: 0,클립명,수집일,weather,road_type,최종선정객체,프레임 구분
0,A_Clip_00073_11,2022-06-28,Fog,Highway,중앙분리대,100
1,A_Clip_00083_11,2022-06-28,Fog,Highway,중앙분리대,100
2,A_Clip_00084_11,2022-06-28,Fog,Highway,중앙분리대,100
3,A_Clip_00105_11,2022-06-28,Fog,Highway,중앙분리대,100
4,A_Clip_00109_11,2022-06-28,Fog,Highway,중앙분리대,100


In [201]:
# train, val, test split (normal)
#
# Clips are split independently inside every group that shares the same
# df_ngr index key (road_type, 수집일, 최종선정객체). Groups with more than
# 7 clips are divided into train/val/test; smaller groups go entirely to
# train. The three clip lists are written, sorted, to text files.

train_ngr = []
val_ngr = []
test_ngr = []

# Group once instead of calling df_ngr.loc[key] twice per key: each .loc on
# the unsorted, non-unique MultiIndex scans the whole frame, and it returns a
# Series rather than a DataFrame when the index happens to be unique.
# sort=False / dropna=False keep every row; groupby preserves the original
# row order inside each group, so the seeded splits below stay reproducible.
clips_by_group = df_ngr.reset_index().groupby(
    list(df_ngr.index.names), sort=False, dropna=False
)['클립명']

for _, group_clips in clips_by_group:
    scenes = group_clips.to_numpy()

    if len(scenes) > 7:
        # 1/10 of the group becomes val, then 1/9 of the remainder becomes
        # test, i.e. roughly an 8:1:1 train/val/test ratio.
        train_test, val = train_test_split(scenes, test_size=1/10, random_state=44)
        train, test = train_test_split(train_test, test_size=1/9, random_state=44)

        train_ngr.extend(train)
        val_ngr.extend(val)
        test_ngr.extend(test)

    else:
        # Too few clips to split: keep the whole group in train.
        train_ngr.extend(scenes)


# One clip name per line, sorted; explicit encoding so the output does not
# depend on the platform default.
for out_path, clip_list in (
    ('train_normal.txt', train_ngr),
    ('val_normal.txt', val_ngr),
    ('test_normal.txt', test_ngr),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(clip_list)))

len(train_ngr), len(val_ngr), len(test_ngr)

(6918, 874, 849)

In [202]:
# train, val, test split (abnormal)
#
# Clips are split independently inside every group that shares the same
# df_angr index key (road_type, 수집일, 최종선정객체). Groups with more than
# 7 clips are divided into train/val/test; smaller groups go entirely to
# train. The three clip lists are written, sorted, to text files.

train_angr = []
val_angr = []
test_angr = []

# Group once instead of calling df_angr.loc[key] twice per key: each .loc on
# the unsorted, non-unique MultiIndex scans the whole frame, and it returns a
# Series rather than a DataFrame when the index happens to be unique.
# sort=False / dropna=False keep every row; groupby preserves the original
# row order inside each group, so the seeded splits below stay reproducible.
clips_by_group = df_angr.reset_index().groupby(
    list(df_angr.index.names), sort=False, dropna=False
)['클립명']

for _, group_clips in clips_by_group:
    scenes = group_clips.to_numpy()

    if len(scenes) > 7:
        # 1/10 of the group becomes val, then 1/9 of the remainder becomes
        # test, i.e. roughly an 8:1:1 train/val/test ratio.
        train_test, val = train_test_split(scenes, test_size=1/10, random_state=44)
        train, test = train_test_split(train_test, test_size=1/9, random_state=44)

        train_angr.extend(train)
        val_angr.extend(val)
        test_angr.extend(test)

    else:
        # Too few clips to split: keep the whole group in train.
        train_angr.extend(scenes)

# One clip name per line, sorted; explicit encoding so the output does not
# depend on the platform default.
for out_path, clip_list in (
    ('train_abnormal.txt', train_angr),
    ('val_abnormal.txt', val_angr),
    ('test_abnormal.txt', test_angr),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(clip_list)))

len(train_angr), len(val_angr), len(test_angr)

(7533, 955, 939)

In [194]:
# Check the number of frames in each abnormal split.
#
# Each total is the sum of the '프레임 구분' column over the rows of df whose
# '클립명' appears in the corresponding clip list. A single isin() mask per
# split replaces the per-clip lookup loop, which rescanned the whole column
# for every clip and accumulated NumPy arrays (yielding 1-element arrays, or
# an empty array as soon as one clip was missing from df).
# Every matching row of df is counted exactly once, and the totals are plain
# Python ints.

clip_names = df['클립명']
frame_counts = df['프레임 구분']

train_num = int(frame_counts[clip_names.isin(train_angr)].sum())
val_num = int(frame_counts[clip_names.isin(val_angr)].sum())
test_num = int(frame_counts[clip_names.isin(test_angr)].sum())

train_num, val_num, test_num