In [128]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [129]:
# Load the raw 2023 attraction POI dataset and list the columns it provides.
raw_csv_path = './data/KC_495_LLR_ATRCTN_2023.csv'
df = pd.read_csv(raw_csv_path)
print(df.columns)

Index(['ID', 'LCLAS_NM', 'MLSFC_NM', 'POI_ID', 'POI_NM', 'BHF_NM', 'ASSTN_NM',
       'CL_CD', 'CL_NM', 'PNU', 'CTPRVN_NM', 'SIGNGU_NM', 'LEGALDONG_NM',
       'LI_NM', 'LNBR_NO', 'LEGALDONG_CD', 'ADSTRD_CD', 'RDNMADR_CD',
       'RDNMADR_NM', 'BULD_NO', 'LC_LO', 'LC_LA', 'GID_CD', 'LAST_CHG_DE',
       'ORIGIN_NM', 'FILE_NM', 'BASE_DE'],
      dtype='object')


컬럼영문명	컬럼한글명  
ID	ID  
LCLAS_NM	대분류명  
MLSFC_NM	중분류명  
POI_ID	POI_ID  
POI_NM	POI명  
BHF_NM	지점명  
ASSTN_NM	보조명  
CL_CD	분류코드  
CL_NM	분류명  
PNU	지번고유코드  
CTPRVN_NM	시도명  
SIGNGU_NM	시군구명  
LEGALDONG_NM	법정동명  
LI_NM	리명  
LNBR_NO	번지번호  
LEGALDONG_CD	법정동코드  
ADSTRD_CD	행정동코드  
RDNMADR_CD	도로명주소코드  
RDNMADR_NM	도로명주소명  
BULD_NO	건물번호  
LC_LO	위치경도  
LC_LA	위치위도  
GID_CD	격자코드  
LAST_CHG_DE	최종변경일자  
ORIGIN_NM	출처명  
FILE_NM	파일명  
BASE_DE	기준일자

ID: Identifier for each record  
LCLAS_NM: Large classification name  
MLSFC_NM: Medium classification name  
POI_ID: Point of Interest ID  
POI_NM: Point of Interest name  
BHF_NM: Branch name  
ASSTN_NM: Auxiliary (supplementary) name  
CL_CD: Classification code  
CL_NM: Classification name  
PNU: Parcel Numbering Unit  
CTPRVN_NM: Province / metropolitan city (si-do) name  
SIGNGU_NM: City / county / district (si-gun-gu) name  
LEGALDONG_NM: Legal dong (neighborhood) name  
LI_NM: Ri (village) name  
LNBR_NO: Lot number  
LEGALDONG_CD: Legal dong (neighborhood) code  
ADSTRD_CD: Administrative dong code  
RDNMADR_CD: Road name address code  
RDNMADR_NM: Road name address  
BULD_NO: Building number  
LC_LO: Longitude  
LC_LA: Latitude  
GID_CD: Grid code  
LAST_CHG_DE: Last change date  
ORIGIN_NM: Source (data provider) name  
FILE_NM: File name  
BASE_DE: Base date  

We can calculate the distance between POIs using the columns LC_LO (longitude) and LC_LA (latitude).

In [130]:
# Preview the first three rows of the raw frame.
df.head(n=3)

Unnamed: 0,ID,LCLAS_NM,MLSFC_NM,POI_ID,POI_NM,BHF_NM,ASSTN_NM,CL_CD,CL_NM,PNU,...,RDNMADR_CD,RDNMADR_NM,BULD_NO,LC_LO,LC_LA,GID_CD,LAST_CHG_DE,ORIGIN_NM,FILE_NM,BASE_DE
0,KCLANPO23N000000001,장소,관광지,76838,간절곶등대,,,60513,일반관광지,3171031025100280001,...,317104300000.0,간절곶1길,39-2,129.360675,35.359029,마마690086,20240108,KT,KC_495_LLR_ATRCTN_2023,20231220
1,KCLANPO23N000000002,장소,관광지,141502,우성항공여행사,,,60403,항공사/여행사,4579025021102250000,...,457903300000.0,중앙로,232,126.700823,35.43479,다마274157,20240108,KT,KC_495_LLR_ATRCTN_2023,20231220
2,KCLANPO23N000000003,장소,관광지,142394,우정여행사,,,60403,항공사/여행사,1114010300100310023,...,111404100000.0,세종대로20길,15,126.978339,37.567237,다사539521,20240108,KT,KC_495_LLR_ATRCTN_2023,20231220


In [131]:
# Columns not needed downstream: record ID, large/medium classification names,
# branch and auxiliary names, grid code, bookkeeping metadata (last-change date,
# source, file name), ri name and the road-name address fields.
columns_to_drop = [
    'ID', 'LCLAS_NM', 'MLSFC_NM', 'BHF_NM', 'ASSTN_NM',
    'GID_CD', 'LAST_CHG_DE', 'ORIGIN_NM', 'FILE_NM',
    'LI_NM', 'RDNMADR_CD', 'RDNMADR_NM', 'BULD_NO',
]

# Reassign instead of using inplace=True, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError after the first run.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head(3)

Unnamed: 0,POI_ID,POI_NM,CL_CD,CL_NM,PNU,CTPRVN_NM,SIGNGU_NM,LEGALDONG_NM,LNBR_NO,LEGALDONG_CD,ADSTRD_CD,LC_LO,LC_LA,BASE_DE
0,76838,간절곶등대,60513,일반관광지,3171031025100280001,울산광역시,울주군,서생면,28-1,3171031025,3171031000,129.360675,35.359029,20231220
1,141502,우성항공여행사,60403,항공사/여행사,4579025021102250000,전라북도,고창군,고창읍,225,4579025021,4579025000,126.700823,35.43479,20231220
2,142394,우정여행사,60403,항공사/여행사,1114010300100310023,서울특별시,중구,태평로1가,31-23,1114010300,1114055000,126.978339,37.567237,20231220


In [132]:
# List the distinct province names and classification names, separated by a
# blank line, to decide which values to filter on.
province_names = df['CTPRVN_NM'].unique()
class_names = df['CL_NM'].unique()
print(province_names, class_names, sep='\n\n')


['울산광역시' '전라북도' '서울특별시' '부산광역시' '강원특별자치도' '전라남도' '경상남도' '제주특별자치도' '대전광역시'
 '광주광역시' '경기도' '인천광역시' '충청남도' '충청북도' '대구광역시' '경상북도' '세종특별자치시']

['일반관광지' '항공사/여행사' '천연기념물' '관광안내소/매표소' '유명사적/유적지' '영어마을' '먹거리/패션거리' '동물원'
 '테마공원/대형놀이공원' 'N' '비/탑/문/각' '야영장' '식물원' '관광농원/허브마을' '팜스테이' '유명관광지'
 '드라마/영화촬영지' '일반유원지/일반놀이공원' '국보' '캠핑장' '폭포/계곡' '서원/향교/서당' '정보화마을'
 '휴양림/수목원' '지역축제' '고택/생가/민속마을' '해수욕장' '보물' '왕릉/고분' '아쿠아리움/대형수족관' '온천지역'
 '궁궐/종묘' '성/성터' '캠핑홀리데이(캠핑)' '글램핑코리아(캠핑)' '잼핑홀리데이(캠핑)']


In [133]:

# Keep only POIs whose classification name is one of: food/fashion street,
# famous tourist attraction, or beach.
target_categories = ['먹거리/패션거리', '유명관광지', '해수욕장']

# Keep only POIs located in Seoul, Gyeonggi-do or Busan.
target_provinces = ['서울특별시', '경기도', '부산광역시']

# Apply both conditions in one mask; this selects the same rows, in the same
# order, as filtering by category first and then by province.
in_scope = df['CL_NM'].isin(target_categories) & df['CTPRVN_NM'].isin(target_provinces)
df = df[in_scope]

# Columns carried forward: POI id and name, province, city/county/district,
# longitude and latitude.
selected_columns = ['POI_ID', 'POI_NM', 'CTPRVN_NM', 'SIGNGU_NM', 'LC_LO', 'LC_LA']
df_selected = df[selected_columns]


In [134]:
# Hand-picked POI_ID values used to build a small sample set.
# NOTE(review): 11 IDs are listed, but the recorded output of the next cell shows
# only 9 rows (10592447 and 10593745 are absent) — presumably they were removed
# by the earlier category/province filters; confirm.
desired_values = [
    16758706, 21085686, 21085658, 7232777, 21085883, 21086097,
    7231419, 21085594, 20868088, 10592447, 10593745,
]
is_sample_poi = df_selected['POI_ID'].isin(desired_values)
df_small = df_selected.loc[is_sample_poi]

In [135]:
# Display the hand-picked sample subset built in the previous cell.
df_small

Unnamed: 0,POI_ID,POI_NM,CTPRVN_NM,SIGNGU_NM,LC_LO,LC_LA
922,7231419,임랑해수욕장,부산광역시,기장군,129.264261,35.318694
1005,7232777,오이도해양단지,경기도,시흥시,126.690293,37.346403
15957,16758706,구로시장떡볶이골목,서울특별시,구로구,126.885543,37.488818
34604,20868088,덕천동젊음의거리,부산광역시,북구,129.00709,35.209896
37393,21085594,광안리카페거리,부산광역시,수영구,129.116387,35.150916
37453,21085658,동대문패션타운,서울특별시,중구,127.008662,37.568664
37481,21085686,을지로타일상가,서울특별시,중구,126.98898,37.566375
37664,21085883,사강회센터,경기도,화성시,126.736237,37.213046
37874,21086097,까치울음식테마마을,경기도,부천시,126.813806,37.51514


In [136]:
# Hold out 30% of the filtered POIs as a test set; the fixed random_state keeps
# the split reproducible across runs.
df_train, df_test = train_test_split(
    df_selected,
    test_size=0.3,
    random_state=42,
)

## Save the filtered data to new CSV files

In [137]:
# Write the small sample set as comma-separated text, without the index column.
small_set_path = "./data/small_set.csv"
df_small.to_csv(small_set_path, sep=',', index=False)

In [138]:
# Write the train and test splits as comma-separated text, without the index column.
for split_frame, split_path in (
    (df_train, "./data/train_set.csv"),
    (df_test, "./data/test_set.csv"),
):
    split_frame.to_csv(split_path, sep=',', index=False)