# 좌표계 변환코드

In [2]:
import os
import re

import chardet
import numpy as np
import pandas as pd
from pyproj import CRS, Transformer

In [14]:
# Let chardet guess the text encoding from the raw bytes of one sample file.
with open('./gps_data/entrc_sejong.txt', mode='rb') as f:
    result = chardet.detect(f.read())

# chardet reports its best guess under the 'encoding' key.
encoding = result['encoding']
print(f"The file encoding is: {encoding}")

The file encoding is: EUC-KR


In [15]:
# Column labels for the header-less, pipe-delimited entrance files, in file
# order. The labels are used verbatim as DataFrame keys further down.
col_name = [
    # administrative codes and names
    'SIGUNGU_CD', 'ENT_NO', 'LAWDONG_CD',
    'SIDO_NM', 'SIGUNGU_NM', 'EUPMYUNDONG_NM',
    # road name and building number
    'DORO_CD', 'DORO_NM', 'UNDER_YN',
    'BUILDING_NO', 'BUILDING_SUB_NO', 'BUILDING_NM',
    # postal code and building attributes
    'POST_NO', 'BUILDING_KIND', 'BUILDING_GROUP_YN', 'HAENGJUNGDONG_NM',
    # source coordinates (fed to the coordinate transformer later on)
    'X_LONG', 'Y_LAN',
]

In [211]:
# Load the Sejong sample file. Passing `names` without `header` makes pandas
# treat the first line as data, so the labels come from col_name.
df = pd.read_csv(
    './gps_data/entrc_sejong.txt',
    sep='|',
    names=col_name,
    encoding='EUC-KR',
    low_memory=False,
)
# Print the column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27522 entries, 0 to 27521
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SIGUNGU_CD         27522 non-null  int64  
 1   ENT_NO             27522 non-null  int64  
 2   LAWDONG_CD         27522 non-null  int64  
 3   SIDO_NM            27522 non-null  object 
 4   SIGUNGU_NM         0 non-null      float64
 5   EUPMYUNDONG_NM     27522 non-null  object 
 6   DORO_CD            27522 non-null  int64  
 7   DORO_NM            27522 non-null  object 
 8   UNDER_YN           27522 non-null  int64  
 9   BUILDING_NO        27522 non-null  int64  
 10  BUILDING_SUB_NO    27522 non-null  int64  
 11  BUILDING_NM        1918 non-null   object 
 12  POST_NO            27522 non-null  int64  
 13  BUILDING_KIND      27514 non-null  object 
 14  BUILDING_GROUP_YN  27522 non-null  int64  
 15  HAENGJUNGDONG_NM   27522 non-null  object 
 16  X_LONG             274

# 건물용도에 따른 샘플링

In [27]:
# Distinct BUILDING_KIND values, with missing entries removed.
array = df['BUILDING_KIND'].unique()
array = array[~pd.isnull(array)]

# A value may list several uses separated by commas; split each value into
# its parts (one inner list per distinct value).
word_list = [str.split(item, ',') for item in array]

# Flatten into a single list, trimming whitespace around every part.
words = list(map(str.strip, (part for parts in word_list for part in parts)))
set(words)

{'공공용시설',
 '공장/창고시설',
 '교육및복지시설',
 '근린생활시설',
 '농축수산시설',
 '문화/관광/레저시설',
 '보안/위험시설',
 '숙박시설',
 '업무시설',
 '여객(화물)운송시설',
 '유통시설',
 '유흥/위락시설',
 '의료시설',
 '자동차관련시설',
 '장묘시설',
 '종교시설',
 '주택',
 '환경정화시설'}

In [254]:
# Building-use categories to keep when filtering on BUILDING_KIND.
df_filter_con = ['주택']  # only these in BUILDING_KIND

# Join the keywords into one regex alternation. Each keyword is escaped so
# that categories containing regex metacharacters -- e.g. the parentheses in
# '여객(화물)운송시설' -- are matched literally instead of being parsed as groups.
pattern = '|'.join(re.escape(keyword) for keyword in df_filter_con)
type(df_filter_con), df_filter_con, pattern

(list, ['주택'], '주택')

In [255]:
# Keep only the rows of df whose building use is one of the wanted categories.
def df_filter(df, kinds=None):
    """Return the rows whose BUILDING_KIND exactly equals one of ``kinds``.

    Matching is exact: a row whose BUILDING_KIND lists several uses separated
    by commas is not selected, and rows with a missing BUILDING_KIND are
    dropped.

    Parameters:
        df (pd.DataFrame): input frame containing a 'BUILDING_KIND' column.
        kinds (str | list | None): category or categories to keep. Defaults
            to the module-level ``df_filter_con`` list.

    Returns:
        pd.DataFrame: an independent copy of the matching rows, so callers
        can add columns without touching ``df`` or triggering pandas'
        SettingWithCopyWarning.
    """
    if kinds is None:
        kinds = df_filter_con
    elif isinstance(kinds, str):
        # isin() rejects a bare string; treat it as a single category.
        kinds = [kinds]

    keep = df['BUILDING_KIND'].isin(kinds)
    return df.loc[keep].copy()

In [None]:
# Preview the rows df_filter keeps for the currently loaded frame; the result
# is only displayed, not stored.
df_filter(df)

# gps좌표계 변환

In [106]:
# pyproj setup: source CRS, target CRS, and the transformer between them.
# Per the original note: EPSG:5179 is the GRS80 UTM-K system and EPSG:4326 is
# longitude/latitude (reference: inquiry board, tech&tips #2).
grs_fr = CRS.from_epsg(5179)
wgs_to = CRS.from_epsg(4326)
# always_xy=True fixes the axis order to (x, y) / (lon, lat) for both input
# and output, regardless of the order each CRS declares.
transformer = Transformer.from_crs(grs_fr, wgs_to, always_xy=True)

In [143]:
# Coordinate conversion for a single row.
def convert_coordinates(row):
    """Transform one row's source coordinates and add random noise.

    Reads row['X_LONG'] and row['Y_LAN'], passes them through the module-level
    ``transformer``, then offsets each output by an independent uniform random
    draw from [-0.1, 0.1) (degrees, per the original comment).

    Returns:
        np.ndarray: ``[noisy_lon, noisy_lat]``.
    """
    x_src, y_src = row['X_LONG'], row['Y_LAN']
    lon1, lat1 = transformer.transform(x_src, y_src)

    # Draw the longitude offset first, then the latitude offset.
    noise_lon = np.random.uniform(-0.1, 0.1)
    noise_lat = np.random.uniform(-0.1, 0.1)

    return np.array([lon1 + noise_lon, lat1 + noise_lat])

In [None]:
# Run the conversion once per row; each result is a [lon, lat] array.
coordinates = df_filtered.apply(convert_coordinates, axis=1)

# Stack the per-row results into a single (n_rows, 2) array.
coordinates_array = np.vstack(coordinates.to_numpy())
# Store the two array columns as new frame columns.
df_filtered.loc[:, ['GPS_LON_X', 'GPS_LAT_Y']] = coordinates_array
# Show the first ten rows, transposed so each record reads top to bottom.
df_filtered.head(10).transpose()

In [210]:
def add_transformed_coordinates(df):
    """Return a copy of ``df`` with converted coordinate columns appended.

    convert_coordinates is applied to every row and its two outputs are stored
    in the new 'GPS_LON_X' and 'GPS_LAT_Y' columns. The input frame is never
    modified.

    An empty frame is handled explicitly: it is returned as a copy with two
    empty float columns. Without this guard np.vstack raises ValueError
    because there is nothing to stack.

    Parameters:
        df (pd.DataFrame): input frame containing 'X_LONG' and 'Y_LAN'.

    Returns:
        pd.DataFrame: copy of ``df`` with 'GPS_LON_X' and 'GPS_LAT_Y' added.
    """
    result = df.copy()

    if df.empty:
        # Nothing to convert; still expose the output columns so downstream
        # code sees a consistent schema.
        result['GPS_LON_X'] = np.array([], dtype=float)
        result['GPS_LAT_Y'] = np.array([], dtype=float)
        return result

    # One [lon, lat] array per row.
    coordinates = df.apply(convert_coordinates, axis=1)

    # Stack into an (n_rows, 2) array: column 0 is lon, column 1 is lat.
    coordinates_array = np.vstack(coordinates.values)

    result['GPS_LON_X'] = coordinates_array[:, 0]
    result['GPS_LAT_Y'] = coordinates_array[:, 1]

    return result

In [220]:
# Append the GPS_LON_X / GPS_LAT_Y columns to the loaded frame (no building-kind
# filter is applied here) and keep the result as df_filtered.
df_filtered = add_transformed_coordinates(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27522 entries, 0 to 27521
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SIGUNGU_CD         27522 non-null  int64  
 1   ENT_NO             27522 non-null  int64  
 2   LAWDONG_CD         27522 non-null  int64  
 3   SIDO_NM            27522 non-null  object 
 4   SIGUNGU_NM         0 non-null      float64
 5   EUPMYUNDONG_NM     27522 non-null  object 
 6   DORO_CD            27522 non-null  int64  
 7   DORO_NM            27522 non-null  object 
 8   UNDER_YN           27522 non-null  int64  
 9   BUILDING_NO        27522 non-null  int64  
 10  BUILDING_SUB_NO    27522 non-null  int64  
 11  BUILDING_NM        1918 non-null   object 
 12  POST_NO            27522 non-null  int64  
 13  BUILDING_KIND      27514 non-null  object 
 14  BUILDING_GROUP_YN  27522 non-null  int64  
 15  HAENGJUNGDONG_NM   27522 non-null  object 
 16  X_LONG             274

In [225]:
# Collect the entry names found in the raw-data folder.
folder_path = './gps_data'
file_list = os.listdir(path=folder_path)
# Number of entries found.
len(file_list)

17

In [256]:
# Convert every raw file: load -> filter by building kind -> add GPS columns ->
# write one UTF-8 CSV per region, named after the region's SIDO_NM value.
src_dir = './gps_data'
dst_dir = './gps_data_add'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(dst_dir, exist_ok=True)

for file in file_list:
    src_path = os.path.join(src_dir, file)

    # os.listdir also returns sub-directories (and file_list may be stale);
    # only regular files inside src_dir can be parsed.
    if not os.path.isfile(src_path):
        print(f'skip (not a file): {file}')
        continue

    print(f'start: {file}')

    # encoding_errors='ignore' is a deliberate best effort: undecodable bytes
    # are dropped instead of aborting the whole batch.
    df = pd.read_csv(src_path,
                     encoding='EUC-KR', names=col_name, low_memory=False, sep='|',
                     encoding_errors='ignore')

    df_filtered = df_filter(df)

    # With no matching rows there is nothing to convert and no SIDO_NM value
    # to name the output file after.
    if df_filtered.empty:
        print(f'skip (no matching rows): {file}')
        continue

    df_filtered = add_transformed_coordinates(df_filtered)

    # The first row's SIDO_NM value names the output file.
    file_name = df_filtered.SIDO_NM.iloc[0]

    df_filtered.to_csv(os.path.join(dst_dir, f'{file_name}.csv'),
                       encoding='utf-8', index=False)
    print(f'done! : {file}')

start: entrc_busan.txt
done! : entrc_busan.txt
start: entrc_chungbuk.txt
done! : entrc_chungbuk.txt
start: entrc_chungnam.txt
done! : entrc_chungnam.txt
start: entrc_daegu.txt
done! : entrc_daegu.txt
start: entrc_daejeon.txt
done! : entrc_daejeon.txt
start: entrc_gangwon.txt
done! : entrc_gangwon.txt
start: entrc_gwangju.txt
done! : entrc_gwangju.txt
start: entrc_gyeongbuk.txt
done! : entrc_gyeongbuk.txt
start: entrc_gyeongnam.txt
done! : entrc_gyeongnam.txt
start: entrc_gyunggi.txt
done! : entrc_gyunggi.txt
start: entrc_incheon.txt
done! : entrc_incheon.txt
start: entrc_jeju.txt
done! : entrc_jeju.txt
start: entrc_jeonbuk.txt
done! : entrc_jeonbuk.txt
start: entrc_jeonnam.txt
done! : entrc_jeonnam.txt
start: entrc_sejong.txt
done! : entrc_sejong.txt
start: entrc_seoul.txt
done! : entrc_seoul.txt
start: entrc_ulsan.txt
done! : entrc_ulsan.txt


In [3]:
# Collect the converted CSV files to merge.
# os.listdir returns entries in arbitrary order and may include non-CSV
# entries (e.g. notebook checkpoint folders). Keep only *.csv and sort, so the
# merge below is reproducible and does not try to parse unrelated entries.
folder_path = './gps_data_add'
file_list = sorted(
    name for name in os.listdir(folder_path)
    if name.lower().endswith('.csv')
)
len(file_list)

17

In [5]:
# Accumulator for the per-region frames.
df_list = []

# Read each converted CSV in file_list order.
for file in file_list:
    df = pd.read_csv(f'./gps_data_add/{file}')
    df_list += [df]

# Stack all regions into one frame with a fresh 0..n-1 index.
merged_df = pd.concat(df_list, ignore_index=True)

# Persist the combined table without the index column.
merged_df.to_csv('merged_output.csv', index=False)

print("모든 CSV 파일이 하나의 파일로 병합되었습니다.")

모든 CSV 파일이 하나의 파일로 병합되었습니다.


In [8]:
# Count the non-null values of every column within each SIGUNGU_CD group.
merged_df.groupby('SIGUNGU_CD').count()

Unnamed: 0_level_0,ENT_NO,LAWDONG_CD,SIDO_NM,SIGUNGU_NM,EUPMYUNDONG_NM,DORO_CD,DORO_NM,UNDER_YN,BUILDING_NO,BUILDING_SUB_NO,BUILDING_NM,POST_NO,BUILDING_KIND,BUILDING_GROUP_YN,HAENGJUNGDONG_NM,X_LONG,Y_LAN,GPS_LON_X,GPS_LAT_Y
SIGUNGU_CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
11110,14767,14767,14767,14767,14767,14767,14767,14767,14767,14767,1643,14767,14767,14767,14767,14766,14766,14766,14766
11140,6570,6570,6570,6570,6570,6570,6570,6570,6570,6570,786,6570,6570,6570,6570,6570,6570,6570,6570
11170,18660,18660,18660,18660,18660,18660,18660,18660,18660,18660,1252,18660,18660,18660,18660,18660,18660,18660,18660
11200,9513,9513,9513,9513,9513,9513,9513,9513,9513,9513,894,9513,9513,9513,9513,9513,9513,9513,9513
11215,19833,19833,19833,19833,19833,19833,19833,19833,19833,19833,3489,19833,19833,19833,19833,19833,19833,19833,19833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52740,7238,7238,7238,7238,7238,7238,7238,7238,7238,7238,60,7238,7238,7238,7238,7238,7238,7238,7238
52750,12098,12098,12098,12098,12098,12098,12098,12098,12098,12098,57,12098,12098,12098,12098,12098,12098,12098,12098
52770,3380,3380,3380,3380,3380,3380,3380,3380,3380,3380,63,3380,3380,3380,3380,3380,3380,3380,3380
52790,22198,22198,22198,22198,22198,22198,22198,22198,22198,22198,202,22198,22198,22198,22198,22198,22198,22198,22198
