# CCTV 교통 영상 데이터 분석 및 필요 파일

In [22]:
# OS : Windows-10-10.0.22621-SP0
# Python : 3.9.18
# Numpy : 1.26.0
# Pandas : 2.1.1
# Matplotlib : 3.7.1
# Seaborn : 0.12.2
# Created: NOV. 17. 2023
# Author: D.W. SHIN
# 교통문제 해결을 위한 CCTV 교통 영상(고속도로)의 데이터 분석

# !pip install koreanize_matplotlib





In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
import lxml
import os
import glob

from lxml import etree

In [5]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

## 1. Validation 데이터 분석

In [6]:
val_base_dir = 'C:/ftp_base/datasets/Validation/바운딩박스'
os.chdir(val_base_dir)
os.getcwd()

'C:\\ftp_base\\datasets\\Validation\\바운딩박스'

In [7]:
# Validation의 하위폴더에서 모든 파일을 리스트로 만들기
val_file_list = glob.glob('./**', recursive=True)
val_file_name = [os.path.basename(x) for x  in val_file_list]

In [8]:
val_file_list = []
val_xml_list = []
for filename in val_file_name:
    if os.path.splitext(filename)[1] == '.png':
        val_file_list.append(filename)
    elif os.path.splitext(filename)[1] == '.xml':
        val_xml_list.append(filename)

In [9]:
print("png file cnt : ", len(val_file_list))
print("xml file cnt : ", len(val_xml_list))

png file cnt :  16224
xml file cnt :  51


### 1.1 원천 데이터 (png) 파일 분석

In [10]:
# 리스트로 받은 파일을 데이터프레임으로 변환
val_highway_raw_data = pd.DataFrame(val_file_list)

In [11]:
# 컬럼명 추가
val_highway_raw_data.columns = ['highway_raw_list']

In [12]:
val_highway_raw_data.head()

Unnamed: 0,highway_raw_list
0,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...
1,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...
2,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...
3,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...
4,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...


In [13]:
# 파일의 제목을 컬럼으로 변환
for i in range(val_highway_raw_data.shape[0]):
    val_highway_raw_data.loc[i, '지역'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[0]
    val_highway_raw_data.loc[i, '지점'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[1]
    val_highway_raw_data.loc[i, '날짜'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[2]
    val_highway_raw_data.loc[i, '시간'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[3]
    val_highway_raw_data.loc[i, '요일'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[4]
    val_highway_raw_data.loc[i, '카메라설치높이'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[5]
    val_highway_raw_data.loc[i, '정체여부'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[6]
    val_highway_raw_data.loc[i, '도로종류'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[7]
    val_highway_raw_data.loc[i, '차선정보'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[8]
    val_highway_raw_data.loc[i, '날씨'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[9]
    val_highway_raw_data.loc[i, '해상도'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[10]
    val_highway_raw_data.loc[i, '번호'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[11].split('.')[0]
    val_highway_raw_data.loc[i, 'cnt'] = 1

  val_highway_raw_data.loc[i, '지역'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[0]
  val_highway_raw_data.loc[i, '지점'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[1]
  val_highway_raw_data.loc[i, '날짜'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[2]
  val_highway_raw_data.loc[i, '시간'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[3]
  val_highway_raw_data.loc[i, '요일'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[4]
  val_highway_raw_data.loc[i, '카메라설치높이'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[5]
  val_highway_raw_data.loc[i, '정체여부'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[6]
  val_highway_raw_data.loc[i, '도로종류'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[7]
  val_highway_raw_data.loc[i, '차선정보'] = str(val_highway_raw_data.loc[i,'highway_raw_list']).split('_')[8]
  val_highway_raw_data.loc[i, '날씨'] = str(val_highway

In [14]:
val_highway_raw_data.head()

Unnamed: 0,highway_raw_list,지역,지점,날짜,시간,요일,카메라설치높이,정체여부,도로종류,차선정보,날씨,해상도,번호,cnt
0,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,1,1.0
1,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,2,1.0
2,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,3,1.0
3,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,4,1.0
4,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,5,1.0


In [15]:
# validation 데이터에서 unique를 확인함
print("지역 ", val_highway_raw_data['지역'].unique())
print("지점 ", val_highway_raw_data['지점'].unique())
print("날짜 ", val_highway_raw_data['날짜'].unique())
print("시간 ", val_highway_raw_data['시간'].unique())
print("요일 ", val_highway_raw_data['요일'].unique())
print("카메라설치높이 ", val_highway_raw_data['카메라설치높이'].unique())
print("정체여부 ", val_highway_raw_data['정체여부'].unique())
print("도로종류 ", val_highway_raw_data['도로종류'].unique())
print("차선정보 ", val_highway_raw_data['차선정보'].unique())
print("날씨 ", val_highway_raw_data['날씨'].unique())
print("해상도 ", val_highway_raw_data['해상도'].unique())

지역  ['Suwon']
지점  ['CH01' 'CH02' 'CH03' 'CH04' 'CH05' 'CH06' 'CH07' 'CH08' 'CH09' 'CH10']
날짜  ['20200720' '20200722' '20201012' '20201213' '20200721' '20201011'
 '20201010']
시간  ['1830' '1430' '1930' '1723' '1200' '2130' '2030' '1730' '1806' '0933'
 '2000' '1700' '1742' '1933' '1600' '1818' '1838' '1807' '1400' '1900'
 '1839' '1800' '1530' '1805' '1853' '1500' '1740' '1858' '1745' '1903'
 '0752']
요일  ['MON' 'WED' 'SUN' 'TUE' 'SAT']
카메라설치높이  ['9m']
정체여부  ['RH' 'NH']
도로종류  ['highway']
차선정보  ['TW5' 'OW5']
날씨  ['sunny' 'rainy' 'snow']
해상도  ['FHD']


In [16]:
print("총 파일수 : ", len(val_highway_raw_data))
print("\n")
print("각 지점별 파일수 \n", val_highway_raw_data.groupby('지점')['cnt'].sum())

총 파일수 :  16224


각 지점별 파일수 
 지점
CH01    1260.0
CH02    1358.0
CH03    1298.0
CH04    2450.0
CH05    1300.0
CH06    1156.0
CH07    2450.0
CH08    2442.0
CH09    1300.0
CH10    1210.0
Name: cnt, dtype: float64


In [17]:
# 지점 01 데이터를 확인함
val_highway_raw_data[val_highway_raw_data['지점'] == 'CH01']

Unnamed: 0,highway_raw_list,지역,지점,날짜,시간,요일,카메라설치높이,정체여부,도로종류,차선정보,날씨,해상도,번호,cnt
0,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,001,1.0
1,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,002,1.0
2,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,003,1.0
3,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,004,1.0
4,Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5...,Suwon,CH01,20200720,1830,MON,9m,RH,highway,TW5,sunny,FHD,005,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8737,Suwon_CH01_20201213_1200_SUN_9m_NH_highway_TW5...,Suwon,CH01,20201213,1200,SUN,9m,NH,highway,TW5,snow,FHD,295,1.0
8738,Suwon_CH01_20201213_1200_SUN_9m_NH_highway_TW5...,Suwon,CH01,20201213,1200,SUN,9m,NH,highway,TW5,snow,FHD,296,1.0
8739,Suwon_CH01_20201213_1200_SUN_9m_NH_highway_TW5...,Suwon,CH01,20201213,1200,SUN,9m,NH,highway,TW5,snow,FHD,297,1.0
8740,Suwon_CH01_20201213_1200_SUN_9m_NH_highway_TW5...,Suwon,CH01,20201213,1200,SUN,9m,NH,highway,TW5,snow,FHD,298,1.0


In [18]:
# 각 지점별 정체여부와 차선정보에 따른 날씨 데이터 갯수
highway_table = pd.pivot_table(
    data = val_highway_raw_data,
    index= '지점',
    columns=['정체여부', '차선정보', '날씨'],
    values='cnt',
    aggfunc=np.sum
)
highway_table.fillna(0, inplace=True)
highway_table.head(10)

  highway_table = pd.pivot_table(


정체여부,NH,NH,NH,NH,NH,NH,RH,RH
차선정보,OW5,OW5,OW5,TW5,TW5,TW5,OW5,TW5
날씨,rainy,snow,sunny,rainy,snow,sunny,sunny,sunny
지점,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
CH01,0.0,0.0,0.0,100.0,460.0,100.0,0.0,600.0
CH02,0.0,0.0,0.0,0.0,460.0,598.0,0.0,300.0
CH03,0.0,0.0,700.0,0.0,0.0,0.0,598.0,0.0
CH04,1250.0,0.0,300.0,0.0,0.0,0.0,900.0,0.0
CH05,100.0,0.0,900.0,0.0,0.0,0.0,300.0,0.0
CH06,100.0,0.0,756.0,0.0,0.0,0.0,300.0,0.0
CH07,1250.0,0.0,600.0,0.0,0.0,0.0,600.0,0.0
CH08,1250.0,0.0,600.0,0.0,0.0,0.0,592.0,0.0
CH09,100.0,0.0,600.0,0.0,0.0,0.0,600.0,0.0
CH10,0.0,448.0,0.0,100.0,0.0,662.0,0.0,0.0


### 1.2 결과
- 지점 01 ~ 04 사이에는 정체가 보통(NH)일때 눈이 오는 날씨가 없으므로 눈이 오는 데이터는 필요 없음
- 러시아워(RH) 일때는 날씨는 항상 좋음

### 1.3 라벨 데이터 (xml) 파일 분석

In [19]:
CLASSES = ["car", "bus", "truck"]

def to_yolov8(y):
  """
  # change to yolo v8 format
  # [x_top_left, y_top_left, x_bottom_right, y_bottom_right] to
  # [x_center, y_center, width, height]
  """
  width = y[2] - y[0]
  height = y[3] - y[1]

  if width < 0 or height < 0:
      print("ERROR: negative width or height ", width, height, y)
      raise AssertionError("Negative width or height")
  return (y[0] + (width/2)), (y[1] + (height/2)), width, height

In [20]:
# xml 파일을 읽어서 Yolo 타입으로 변환
def load_xml_annotations(f):
  tree = etree.parse(f)
  anns = []
  for dim in tree.xpath("image"):
    image_filename = dim.attrib["name"]
    width = int(dim.attrib["width"])
    height = int(dim.attrib["height"])
    # print(image_filename)
    # print(len(dim.xpath("box")))
    boxes = []
    for box in dim.xpath("box"):
      label = CLASSES.index(box.attrib["label"])
      xtl, ytl = box.attrib["xtl"], box.attrib["ytl"]
      xbr, ybr = box.attrib["xbr"], box.attrib["ybr"]

      xc, yc, w, h = to_yolov8([float(xtl), float(ytl), float(xbr), float(ybr)])
      boxes.append([label, round(xc/width, 5), round(yc/height, 5), round(w/width, 5), round(h/height, 5)])

    anns.append([image_filename, width, height, boxes])

  return np.array(anns, dtype="object")

In [21]:
# get car type
# 자동차 타입을 확인한다.
# 예) car bus truck 등

def get_car_type(f):
  tree = etree.parse(f)
  car_type = []
  for meta_tag in tree.xpath("meta"):
    for task_tag in meta_tag.xpath("task"):
      for lables_tag in task_tag.xpath("labels"):
        for lable_tag in lables_tag.xpath("label"):
          for name_tag in lable_tag.xpath("name"):            
            car_type.append(name_tag.text)
  result = []
  truck_cnt = 0
  bus_cnt = 0
  car_cnt = 0
  for dim in tree.xpath("image"):
    for box in dim.xpath("box"):
      cars = box.attrib["label"]
      if cars == car_type[0]:
        truck_cnt = truck_cnt + 1
      elif cars == car_type[1]:
        bus_cnt = bus_cnt + 1
      elif cars == car_type[2]:
        car_cnt = car_cnt + 1
   
#  print("truck_cnt : ", truck_cnt)
#  print("bus_cnt : ", bus_cnt)
#  print("car_cnt : ", car_cnt)

  return np.array([[car_type[0] , truck_cnt] , [car_type[1] , bus_cnt] , [car_type[2] , car_cnt]])

In [22]:
# xml 파일을 실제로 txt 파일로 쓰기
def write_yolov8_txt(folder, annotation):
  #print(annotation[0][:-3])
  out_filename = os.path.join(folder,str(annotation[0][:-3]))
  out_filename = os.path.splitext(out_filename)[0]
  out_filename = out_filename+'.txt'

  f = open(out_filename,"w+")
  for box in annotation[3]:
    f.write("{} {} {} {} {}\n".format(box[0], box[1], box[2], box[3], box[4]))

In [23]:
# input : 베이스 파일명, 파일리스트
# output: 파일명 리스트 , 파일 경로 리스트, XML명 리스트, XML 리스트 반환

def get_file_n_xml_list(base_dir, file_list):
  file_name_list = []
  xml_name_list = []
  file_path_list = []
  xml_path_list = []

  for files in file_list:
    file_name = os.path.basename(files)    
    if os.path.splitext(file_name)[1] == '.png':
      file_name_list.append(file_name)
      under_file_path = files
      under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
      path_list = base_dir + under_file_path
      file_path_list.append(path_list)
    elif os.path.splitext(file_name)[1] == '.xml':
      xml_name_list.append(file_name)
      under_file_path = files
      under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
      path_list = base_dir + under_file_path
      xml_path_list.append(path_list)
  

  return np.array([file_name_list, file_path_list, xml_name_list, xml_path_list], dtype="object")

In [24]:
# 특정 지점의 파일명, 파일리스트, XML명, XML 리스트 반환
# input : 베이스 파일명, 파일리스트, 지점, 정체여부, 차선정보, 날씨
# output: 파일명 리스트 , 파일 경로 리스트, XML명 리스트, XML 리스트 반환

def get_specific_file_n_xml_list(base_dir, file_list, bra, cong, lane, wea):
  file_name_list = []
  xml_name_list = []
  file_path_list = []
  xml_path_list = []

  for files in file_list:
    file_name = os.path.basename(files)
    if file_name.find('_') != -1 and len(file_name.split('_')) > 9:
        branch = file_name.split('_')[1]
        congestion = file_name.split('_')[6]
        lane_info = file_name.split('_')[8]
        weather = file_name.split('_')[9]
        ## 필요한 지점, 정체여부, 차선정보, 날씨로 xml 파일리스트를 확인한다.
        if (bra == branch) and (cong == congestion) and (lane == lane_info) and (wea == weather):
            if os.path.splitext(file_name)[1] == '.png':
                file_name_list.append(file_name)
                under_file_path = files
                under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
                path_list = base_dir + under_file_path
                file_path_list.append(path_list)
            elif os.path.splitext(file_name)[1] == '.xml':
                xml_name_list.append(file_name)
                under_file_path = files
                under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
                path_list = base_dir + under_file_path
                xml_path_list.append(path_list)
  
  return np.array([file_name_list, file_path_list, xml_name_list, xml_path_list], dtype="object")

In [50]:
# 특정 지점으로만 아래의 내용을 반환한다.
# 특정 지점의 파일명, 파일리스트, XML명, XML 리스트 반환
# input : 베이스 파일명, 파일리스트, 지점
# output: 파일명 리스트 , 파일 경로 리스트, XML명 리스트, XML 리스트 반환

def get_specific_file_n_xml_list_v2(base_dir, file_list, bra):
  file_name_list = []
  xml_name_list = []
  file_path_list = []
  xml_path_list = []

  for files in file_list:
    file_name = os.path.basename(files)
    if file_name.find('_') != -1 and len(file_name.split('_')) > 9:
        branch = file_name.split('_')[1]
        ## 필요한 지점, 정체여부, 차선정보, 날씨로 xml 파일리스트를 확인한다.
        if bra == branch:
            if os.path.splitext(file_name)[1] == '.png':
                file_name_list.append(file_name)
                under_file_path = files
                under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
                path_list = base_dir + under_file_path
                file_path_list.append(path_list)
            elif os.path.splitext(file_name)[1] == '.xml':
                xml_name_list.append(file_name)
                under_file_path = files
                under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
                path_list = base_dir + under_file_path
                xml_path_list.append(path_list)
  
  return np.array([file_name_list, file_path_list, xml_name_list, xml_path_list], dtype="object")

### 1.3.1 생성된 함수를 실행

In [25]:
## 위쪽에서 셀을 실행 안했을 경우를 대비 다시 한번 불러 온다.
val_base_dir = 'C:/ftp_base/datasets/Validation/바운딩박스'
os.chdir(val_base_dir)
os.getcwd()
# Validation의 하위폴더에서 모든 파일을 리스트로 만들기
val_file_list = glob.glob('./**', recursive=True)

In [26]:
## xml 파일 명과 리스트를 가져온다.
## 특정 지점의 파일 리스트를 가져 온다.
## 아래의 함수와 동일함
## %%%%%%%%%%%%%%% get_file_n_xml_list %%%%%%%%%%%%%%%%%%%%%%
## local 테스트를 위해 작성된 내용임

val_file_name_list = []
val_xml_name_list = []
val_file_path_list = []
val_xml_path_list = []
val_xml_path_list2 = []

for files in val_file_list:
  file_name = os.path.basename(files)
  # print("file_name", file_name)
  if file_name.find('_') != -1 and len(file_name.split('_')) > 9:
    # print("len : ", len(file_name.split('_')))
    branch = file_name.split('_')[1]
    congestion = file_name.split('_')[6]
    lane_info = file_name.split('_')[8]
    weather = file_name.split('_')[9]
    # print("branch", branch)
    # print("congestion", congestion)
    # print("lane_info", lane_info)
    # print("weather", weather)
    ## 필요한 지점, 정체여부, 차선정보, 날씨로 xml 파일리스트를 확인한다.
    if ("CH10" == branch) and ("NH" == congestion) and ("TW5" == lane_info) and ("rainy" == weather):
      if os.path.splitext(file_name)[1] == '.xml':
        under_file_path = files
        under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
        path_list = val_base_dir + under_file_path
        val_xml_path_list2.append(path_list)
    
  if os.path.splitext(file_name)[1] == '.png':
    val_file_name_list.append(file_name)
    under_file_path = files
    under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
    path_list = val_base_dir + under_file_path
    val_file_path_list.append(path_list)
  elif os.path.splitext(file_name)[1] == '.xml':
    val_xml_name_list.append(file_name)
    under_file_path = files
    under_file_path = under_file_path.replace(".\\", "/").replace("\\", "/").replace("./", "/")
    path_list = val_base_dir + under_file_path
    val_xml_path_list.append(path_list)

In [27]:
val_xml_path_list2

['C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH10_20200722_1500_WED_9m_NH_highway_TW5_rainy_FHD.xml']

In [28]:
# xml 경로 리스트를 for문에 넣어서 차량의 숫자를 확인한다.
# 테스트 시 val_xml_path_list, val_xml_path_list2로 번갈아서 확인 할것 !
# 각 폴더의 차, 트럭, 버스의 수를 반환
total_truck = 0
total_bus = 0
total_car = 0
for label_file in val_xml_path_list:
  carType1, carType2, carType3 = get_car_type(label_file)
  total_truck = total_truck + int(carType1[1])
  total_bus = total_bus + int(carType2[1])
  total_car = total_car + int(carType3[1])

print("Validation 안의 차량 수")
print("\n")
print("total_truck :", total_truck)
print("total_bus :", total_bus)
print("total_car :", total_car)

Validation 안의 차량 수


total_truck : 30241
total_bus : 49060
total_car : 99640


In [29]:
# 함수를 사용해서 xml 리스트 가져오기
# 함수명 : get_file_n_xml_list
file_name_list, file_path_list, xml_name_list, xml_path_list = get_file_n_xml_list(val_base_dir, val_file_list)

In [139]:
xml_path_list

['C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200721_1700_TUE_9m_RH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200722_1430_WED_9m_NH_highway_TW5_rainy_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200722_1930_WED_9m_NH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20201012_1723_MON_9m_RH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20201213_1200_SUN_9m_NH_highway_TW5_snow_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH02_20200720_2130_MON_9m_NH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH02_20200721_2030_TUE_9m_NH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH02_20200722_

In [53]:
# 특정 지점, 정체여부, 차선정보, 날씨로 xml 파일리스트를 확인한다
BRANCH = "CH10"
CONGESTION = "NH"
LANE_INFO = "TW5"
WEATHER = "rainy"

file_name_list2, file_path_list2, xml_name_list2, xml_path_list2 = get_specific_file_n_xml_list(val_base_dir, val_file_list, BRANCH, CONGESTION, LANE_INFO, WEATHER)

In [51]:
# 특정 지점, 정체여부, 차선정보, 날씨로 xml 파일리스트를 확인한다
BRANCH = "CH10"

file_name_list3, file_path_list3, xml_name_list3, xml_path_list3 = get_specific_file_n_xml_list_v2(val_base_dir, val_file_list, BRANCH)

In [54]:
# 테스트 시 xml_path_list, xml_path_list2로 번갈아서 확인 할것 !
# 각 폴더의 차, 트럭, 버스의 수를 반환
total_truck = 0
total_bus = 0
total_car = 0
for label_file in xml_path_list2:
  carType1, carType2, carType3 = get_car_type(label_file)
  total_truck = total_truck + int(carType1[1])
  total_bus = total_bus + int(carType2[1])
  total_car = total_car + int(carType3[1])

print("Validation 안의 차량 수")
print("\n")
print("total_truck :", total_truck)
print("total_bus :", total_bus)
print("total_car :", total_car)

Validation 안의 차량 수


total_truck : 2422
total_bus : 1914
total_car : 4879


In [46]:
# XML -> TXT로 변환
# time.sleep(1) 한 이유: 폴더 생성이 되기도 전에 파일이 써지는 현상이 있음
import time

for label_file in val_xml_path_list:
    anns = load_xml_annotations(label_file)
    # print(anns)
    label_files = os.path.split(label_file)
    folderName=os.path.splitext(label_files[1])[0]
    os.makedirs(os.path.join(label_files[0],folderName), exist_ok=True)
    time.sleep(1)
    for ann in anns:
        write_yolov8_txt(os.path.join(label_files[0],folderName), ann)

### 1.4 xml 결과
- 아래의 폴더에 TXT 파일이 생성됨
- C:\ftp_base\datasets\Validation\바운딩박스\[라벨]1.수도권영동선

### 1.5 커스텀 함수들
- get_car_type
- get_file_n_xml_list
- get_specific_file_n_xml_list
- get_specific_file_n_xml_list_v2
- load_xml_annotations_without_car


In [32]:
# load_xml_annotations 할때 버스 혹은 트럭만 가져오기

def load_xml_annotations_without_car(f):
  tree = etree.parse(f)
  anns = []
  
  truck_cnt = 0
  bus_cnt = 0
  car_cnt = 0
  
  
  for dim in tree.xpath("image"):
    image_filename = dim.attrib["name"]
    width = int(dim.attrib["width"])
    height = int(dim.attrib["height"])
    # print(image_filename)
    # print(len(dim.xpath("box")))
    boxes = []
    
    for box in dim.xpath("box"):
      cars = box.attrib["label"]
      
      if cars == 'car':
        car_cnt = car_cnt + 1
      
      if cars != 'car':
        label = CLASSES.index(box.attrib["label"])
        xtl, ytl = box.attrib["xtl"], box.attrib["ytl"]
        xbr, ybr = box.attrib["xbr"], box.attrib["ybr"]

        xc, yc, w, h = to_yolov8([float(xtl), float(ytl), float(xbr), float(ybr)])
        boxes.append([label, round(xc/width, 5), round(yc/height, 5), round(w/width, 5), round(h/height, 5)])
        
        if cars == 'truck':
          truck_cnt = truck_cnt + 1
        elif cars == 'bus':
          bus_cnt = bus_cnt + 1

    anns.append([image_filename, width, height, boxes])
  
  # print("truck_cnt : ", truck_cnt)
  # print("bus_cnt : ", bus_cnt)
  # print("car_cnt : ", car_cnt)
  
  # return np.array([["truck_cnt" , truck_cnt] , ["bus_cnt" , bus_cnt] , ["car_cnt" , car_cnt]])
  
  return np.array(anns, dtype="object")



In [47]:
val_xml_path_list

['C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200721_1700_TUE_9m_RH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200722_1430_WED_9m_NH_highway_TW5_rainy_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20200722_1930_WED_9m_NH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20201012_1723_MON_9m_RH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH01_20201213_1200_SUN_9m_NH_highway_TW5_snow_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH02_20200720_2130_MON_9m_NH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH02_20200721_2030_TUE_9m_NH_highway_TW5_sunny_FHD.xml',
 'C:/ftp_base/datasets/Validation/바운딩박스/[라벨]1.수도권영동선/Suwon_CH02_20200722_

In [54]:
# # load_xml_annotations_without_car 테스트
# total_truck = 0
# total_bus = 0
# total_car = 0
# for label_file in val_xml_path_list:
#   carType1, carType2, carType3 = load_xml_annotations_without_car(label_file)
#   total_truck = total_truck + int(carType1[1])
#   total_bus = total_bus + int(carType2[1])
#   total_car = total_car + int(carType3[1])

# print("Validation 안의 차량 수")
# print("\n")
# print("total_truck :", total_truck)
# print("total_bus :", total_bus)
# print("total_car :", total_car)

truck_cnt :  1083
bus_cnt :  320
car_cnt :  3044
truck_cnt :  1567
bus_cnt :  138
car_cnt :  2740
truck_cnt :  574
bus_cnt :  24
car_cnt :  712
truck_cnt :  267
bus_cnt :  64
car_cnt :  742
truck_cnt :  729
bus_cnt :  31
car_cnt :  2148
truck_cnt :  290
bus_cnt :  56
car_cnt :  2396
truck_cnt :  383
bus_cnt :  52
car_cnt :  1381
truck_cnt :  189
bus_cnt :  34
car_cnt :  1047
truck_cnt :  232
bus_cnt :  26
car_cnt :  788
truck_cnt :  126
bus_cnt :  54
car_cnt :  1986
truck_cnt :  276
bus_cnt :  82
car_cnt :  2014
truck_cnt :  234
bus_cnt :  25
car_cnt :  757
truck_cnt :  219
bus_cnt :  52
car_cnt :  739
truck_cnt :  354
bus_cnt :  38
car_cnt :  1360
truck_cnt :  113
bus_cnt :  56
car_cnt :  1851
truck_cnt :  616
bus_cnt :  160
car_cnt :  2026
truck_cnt :  241
bus_cnt :  51
car_cnt :  940
truck_cnt :  480
bus_cnt :  256
car_cnt :  3033
truck_cnt :  1901
bus_cnt :  160
car_cnt :  4648
truck_cnt :  50
bus_cnt :  50
car_cnt :  1776
truck_cnt :  466
bus_cnt :  140
car_cnt :  2634
truck_cnt :

In [58]:
# 커스텀 함수로 버스 트럭만 바운딩 박스를 만들때 사용하는 내용임
# truck, bus 만 txt 파일로 변환
import time

for label_file in val_xml_path_list:
    anns = load_xml_annotations_without_car(label_file)
    # print(anns)
    label_files = os.path.split(label_file)
    folderName=os.path.splitext(label_files[1])[0]
    os.makedirs(os.path.join(label_files[0],folderName), exist_ok=True)
    time.sleep(1)
    for ann in anns:
        print("folderName", folderName)
        write_yolov8_txt(os.path.join(label_files[0],folderName), ann)

folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_highway_TW5_sunny_FHD
folderName Suwon_CH01_20200720_1830_MON_9m_RH_hi

## 2. Training 데이터 분석

In [33]:
train_base_dir = 'C:/ftp_base/datasets/Training/바운딩박스'
os.chdir(train_base_dir)
os.getcwd()

'C:\\ftp_base\\datasets\\Training\\바운딩박스'

In [34]:
# Training의 하위폴더에서 모든 파일을 리스트로 만들기
train_file_list = glob.glob('./**', recursive=True)
train_file_name = [os.path.basename(x) for x  in train_file_list]

In [35]:
train_file_list = []
for filename in train_file_name:
    if os.path.splitext(filename)[1] == '.png':
        train_file_list.append(filename)

In [36]:
train_highway_raw_data = pd.DataFrame(train_file_list)
train_highway_raw_data.columns = ['highway_raw_list']

### 2.1 원천 데이터 (png) 파일 분석

In [37]:
train_highway_raw_data.head()

Unnamed: 0,highway_raw_list
0,Suwon_CH01_20200720_1700_MON_9m_NH_highway_TW5...
1,Suwon_CH01_20200720_1700_MON_9m_NH_highway_TW5...
2,Suwon_CH01_20200720_1700_MON_9m_NH_highway_TW5...
3,Suwon_CH01_20200720_1700_MON_9m_NH_highway_TW5...
4,Suwon_CH01_20200720_1700_MON_9m_NH_highway_TW5...


In [38]:
# 파일의 제목을 컬럼으로 변환
for i in range(train_highway_raw_data.shape[0]):
    train_highway_raw_data.loc[i, '지역'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[0]
    train_highway_raw_data.loc[i, '지점'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[1]
    train_highway_raw_data.loc[i, '날짜'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[2]
    train_highway_raw_data.loc[i, '시간'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[3]
    train_highway_raw_data.loc[i, '요일'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[4]
    train_highway_raw_data.loc[i, '카메라설치높이'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[5]
    train_highway_raw_data.loc[i, '정체여부'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[6]
    train_highway_raw_data.loc[i, '도로종류'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[7]
    train_highway_raw_data.loc[i, '차선정보'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[8]
    train_highway_raw_data.loc[i, '날씨'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[9]
    train_highway_raw_data.loc[i, '해상도'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[10]
    train_highway_raw_data.loc[i, '번호'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[11].split('.')[0]
    train_highway_raw_data.loc[i, 'cnt'] = 1

  train_highway_raw_data.loc[i, '지역'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[0]
  train_highway_raw_data.loc[i, '지점'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[1]
  train_highway_raw_data.loc[i, '날짜'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[2]
  train_highway_raw_data.loc[i, '시간'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[3]
  train_highway_raw_data.loc[i, '요일'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[4]
  train_highway_raw_data.loc[i, '카메라설치높이'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[5]
  train_highway_raw_data.loc[i, '정체여부'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[6]
  train_highway_raw_data.loc[i, '도로종류'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[7]
  train_highway_raw_data.loc[i, '차선정보'] = str(train_highway_raw_data.loc[i,'highway_raw_list']).split('_')[8]
  train_highway_r

In [39]:
# validation 데이터에서 unique를 확인함
print("지역 ", train_highway_raw_data['지역'].unique())
print("지점 ", train_highway_raw_data['지점'].unique())
print("날짜 ", train_highway_raw_data['날짜'].unique())
print("시간 ", train_highway_raw_data['시간'].unique())
print("요일 ", train_highway_raw_data['요일'].unique())
print("카메라설치높이 ", train_highway_raw_data['카메라설치높이'].unique())
print("정체여부 ", train_highway_raw_data['정체여부'].unique())
print("도로종류 ", train_highway_raw_data['도로종류'].unique())
print("차선정보 ", train_highway_raw_data['차선정보'].unique())
print("날씨 ", train_highway_raw_data['날씨'].unique())
print("해상도 ", train_highway_raw_data['해상도'].unique())

지역  ['Suwon']
지점  ['CH01' 'CH02' 'CH03' 'CH04' 'CH05' 'CH06' 'CH07' 'CH08' 'CH09' 'CH10']
날짜  ['20200720' '20200721' '20200722' '20201010' '20201011' '20201012'
 '20201213']
시간  ['1700' '1730' '1800' '1900' '1930' '2000' '2030' '1500' '1530' '1600'
 '1630' '2100' '1330' '1400' '1830' '1653' '1753' '1823' '1853' '1923'
 '0700' '0730' '0930' '1030' '2130' '1430' '1807' '1837' '1908' '1736'
 '1836' '1906' '1657' '1728' '1828' '1858' '1928' '0732' '1033' '1204'
 '1742' '1812' '1842' '1913' '1843' '1943' '1732' '1802' '1833' '1903'
 '0704' '0835' '0936' '1137' '1717' '1748' '1849' '1818' '1919' '1706'
 '1737' '1939' '0737' '0838' '1142' '1824' '1855' '1723' '1838' '1909'
 '0709' '0943' '1044' '1216' '1759' '1901' '1808' '1910' '1941' '0742'
 '0946' '1734' '1907' '1805' '1720' '1751' '1822' '1924' '0714' '0745'
 '0949' '1224' '1811' '1740' '1724' '1755' '1827' '1929' '0747' '0952'
 '1054' '1714' '1817' '1848' '1745' '1920' '1729' '1832' '1934' '0718'
 '0853' '0955' '1232' '1854' '1926' '0855

In [40]:
print("총 파일수 : ", len(train_highway_raw_data))
print("\n")
print("각 지점별 파일수 \n", train_highway_raw_data.groupby('지점')['cnt'].sum())

총 파일수 :  56663


각 지점별 파일수 
 지점
CH01    6528.0
CH02    5275.0
CH03    6568.0
CH04    5580.0
CH05    5879.0
CH06    5857.0
CH07    5766.0
CH08    5414.0
CH09    5333.0
CH10    4463.0
Name: cnt, dtype: float64


In [41]:
# 각 지점별 정체여부와 차선정보에 따른 날씨 데이터 갯수
highway_table = pd.pivot_table(
    data = train_highway_raw_data,
    index= '지점',
    columns=['정체여부', '차선정보', '날씨'],
    values='cnt',
    aggfunc=np.sum
)
highway_table.fillna(0, inplace=True)
highway_table.head(10)

  highway_table = pd.pivot_table(


정체여부,NH,NH,NH,NH,NH,NH,RH,RH,RH
차선정보,OW5,OW5,OW5,TW5,TW5,TW5,OW5,TW5,TW5
날씨,rainy,snow,sunny,rainy,snow,sunny,sunny,rainy,sunny
지점,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
CH01,0.0,0.0,0.0,1350.0,881.0,1649.0,0.0,100.0,2548.0
CH02,0.0,0.0,0.0,150.0,676.0,2099.0,0.0,200.0,2150.0
CH03,1300.0,823.0,2447.0,0.0,0.0,0.0,1998.0,0.0,0.0
CH04,874.0,607.0,2300.0,0.0,0.0,0.0,1799.0,0.0,0.0
CH05,775.0,859.0,2747.0,0.0,0.0,0.0,1498.0,0.0,0.0
CH06,823.0,387.0,3165.0,0.0,0.0,0.0,1482.0,0.0,0.0
CH07,924.0,865.0,2778.0,0.0,0.0,0.0,1199.0,0.0,0.0
CH08,875.0,645.0,2847.0,0.0,0.0,0.0,1047.0,0.0,0.0
CH09,150.0,886.0,2950.0,0.0,0.0,0.0,1347.0,0.0,0.0
CH10,0.0,446.0,600.0,323.0,0.0,2496.0,598.0,0.0,0.0


### 2.2 결과
- train 데이터와 val 데이터의 분포가 비슷한다.
- 다만 NH일 경우에 차선정보가 OW5에서는 snow 정보가 있는것이 다르다!

### 2.3 라벨 데이터 (xml) 파일 분석

### 2.3.1 생성된 함수를 실행
- 함수는 위에서 정의했음

In [42]:
## 위쪽에서 셀을 실행 안했을 경우를 대비 다시 한번 불러 온다.
train_base_dir = 'C:/ftp_base/datasets/Training/바운딩박스'
os.chdir(train_base_dir)
os.getcwd()
# Training의 하위폴더에서 모든 파일을 리스트로 만들기
train_file_list = glob.glob('./**', recursive=True)

In [43]:
# xml 파일 가져오기
file_name_list, file_path_list, xml_name_list, xml_path_list = get_file_n_xml_list(train_base_dir, train_file_list)

In [48]:
# 특정 지점, 정체여부, 차선정보, 날씨로 xml 파일리스트를 확인한다
BRANCH = "CH10"
CONGESTION = "NH"
LANE_INFO = "TW5"
WEATHER = "sunny"

file_name_list2, file_path_list2, xml_name_list2, xml_path_list2 = get_specific_file_n_xml_list(train_base_dir, train_file_list, BRANCH, CONGESTION, LANE_INFO, WEATHER)

In [49]:
# 각 폴더의 차, 트럭, 버스의 수를 반환
total_truck = 0
total_bus = 0
total_car = 0
for label_file in xml_path_list2:
  carType1, carType2, carType3 = get_car_type(label_file)
  total_truck = total_truck + int(carType1[1])
  total_bus = total_bus + int(carType2[1])
  total_car = total_car + int(carType3[1])

print("Validation 안의 차량 수")
print("\n")
print("total_truck :", total_truck)
print("total_bus :", total_bus)
print("total_car :", total_car)

Validation 안의 차량 수


total_truck : 8787
total_bus : 17072
total_car : 21942
