In [1]:
import os
import re
import numpy as np
import pandas as pd

In [10]:
# 원본 엑셀 파일 리스트 읽어오기, raw-files 폴더
RAW_FILES_PATH = 'raw-files'

raw_files = os.listdir(RAW_FILES_PATH)
raw_files = [ f'{RAW_FILES_PATH}/{r}' for r in raw_files ]
raw_files[0], len(raw_files)

('raw-files/~$아파트(매매)_실거래가_201208.xlsx', 97)

In [3]:
# 원본 엑셀 데이터 컬럼명
RAW_FILE_COLUMNS = ['region', 'street_num', 'street_num1', 'street_num2',
                    'apt_name', 'apt_size', 'contract_yymm', 'contract_dd',
                    'contract_price', 'apt_floor','completion_year','street_name']

In [4]:
# data-files 폴더 만들기
DATA_FILES_PATH = 'data-files'

if not(os.path.isdir(DATA_FILES_PATH)):
    os.makedirs(os.path.join(DATA_FILES_PATH))

In [11]:
# 기록할 파일 리스트 만들기
data_files = []
for rf in raw_files:
    p = re.compile('[0-9]+')
    date_name = p.findall(rf)
    data_files.append(f'{DATA_FILES_PATH}/{date_name[0]}.csv')

# files_zip = zip(raw_files, data_files)
data_files[0], len(data_files)

('data-files/201208.csv', 97)

In [6]:
# 수정 CSV 데이터 컬럼명
DATA_FILE_COLUMNS = ['region', 'region_sub', 'apt_name', 'apt_size',
                     'apt_floor', 'contract_date', 'contract_price',
                     'completion_year', 'street_num', 'street_name']

# region = 도, 광역시, 특별시
# region_sub = 시군구
# apt_name = 단지명
# apt_size = 전용면적
# apt_floor = 층
# contract_date = 계약년월일
# contract_price = 거래금액
# completion_year = 건축년도
# street_num = 번지
# street_name = 도로명


In [7]:
# 엑셀 파일 로드 테스트
fileD = pd.read_excel(raw_files[0], index_col=None, header=None, skiprows=17)
fileD.columns = RAW_FILE_COLUMNS

fileD.info()
fileD.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26054 entries, 0 to 26053
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   region           26054 non-null  object 
 1   street_num       26054 non-null  object 
 2   street_num1      26054 non-null  int64  
 3   street_num2      26054 non-null  int64  
 4   apt_name         26054 non-null  object 
 5   apt_size         26054 non-null  float64
 6   contract_yymm    26054 non-null  int64  
 7   contract_dd      26054 non-null  int64  
 8   contract_price   26054 non-null  object 
 9   apt_floor        26054 non-null  int64  
 10  completion_year  26054 non-null  int64  
 11  street_name      26054 non-null  object 
dtypes: float64(1), int64(6), object(5)
memory usage: 2.4+ MB


Unnamed: 0,region,street_num,street_num1,street_num2,apt_name,apt_size,contract_yymm,contract_dd,contract_price,apt_floor,completion_year,street_name
0,강원도 강릉시 견소동,202,202,0,송정한신,84.945,201208,24,13000,5,1997,경강로2539번길 8
1,강원도 강릉시 견소동,202,202,0,송정한신,59.8,201208,27,8500,15,1997,경강로2539번길 8
2,강원도 강릉시 견소동,202,202,0,송정한신,59.8,201208,31,6000,1,1997,경강로2539번길 8
3,강원도 강릉시 견소동,289,289,0,송정해변신도브래뉴아파트,84.99,201208,8,15000,7,2005,경강로2539번길 22
4,강원도 강릉시 견소동,289,289,0,송정해변신도브래뉴아파트,84.99,201208,9,16500,9,2005,경강로2539번길 22


In [7]:
# 컬럼 추가 및 데이터 수정 함수
def buildRawD(D=None):
    if D is None:
        return 0

    region_str = D['region'].split(" ", 1)
    
    D['region'] = region_str[0]
    D['region_sub'] = region_str[1]

    D['apt_size'] = f'{D["apt_size"]:#.02f}'

    # yyyy-mm-dd 형식 변환
    yymm = str(D['contract_yymm'])
    D['contract_date'] = f'{yymm[:4]}-{yymm[4:]}-{D["contract_dd"]:#02d}'
    
    # ',' 제거, (만원) -> 0000 추가
    D['contract_price'] = D['contract_price'].replace(',', '') + '0000'

    return D


In [9]:
# 원본 데이터 수정
rawD = fileD.apply(buildRawD, axis=1)

rawD.info()
rawD.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26054 entries, 0 to 26053
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   region           26054 non-null  object
 1   street_num       26054 non-null  object
 2   street_num1      26054 non-null  int64 
 3   street_num2      26054 non-null  int64 
 4   apt_name         26054 non-null  object
 5   apt_size         26054 non-null  object
 6   contract_yymm    26054 non-null  int64 
 7   contract_dd      26054 non-null  int64 
 8   contract_price   26054 non-null  object
 9   apt_floor        26054 non-null  int64 
 10  completion_year  26054 non-null  int64 
 11  street_name      26054 non-null  object
 12  region_sub       26054 non-null  object
 13  contract_date    26054 non-null  object
dtypes: int64(6), object(8)
memory usage: 2.8+ MB


Unnamed: 0,region,street_num,street_num1,street_num2,apt_name,apt_size,contract_yymm,contract_dd,contract_price,apt_floor,completion_year,street_name,region_sub,contract_date
0,강원도,202,202,0,송정한신,84.94,201208,24,130000000,5,1997,경강로2539번길 8,강릉시 견소동,2012-08-24
1,강원도,202,202,0,송정한신,59.8,201208,27,85000000,15,1997,경강로2539번길 8,강릉시 견소동,2012-08-27
2,강원도,202,202,0,송정한신,59.8,201208,31,60000000,1,1997,경강로2539번길 8,강릉시 견소동,2012-08-31
3,강원도,289,289,0,송정해변신도브래뉴아파트,84.99,201208,8,150000000,7,2005,경강로2539번길 22,강릉시 견소동,2012-08-08
4,강원도,289,289,0,송정해변신도브래뉴아파트,84.99,201208,9,165000000,9,2005,경강로2539번길 22,강릉시 견소동,2012-08-09


In [10]:

# 필요 데이터 선택 및 컬럼 재배치, 데이터 변환하기
dataD = rawD.loc[:, DATA_FILE_COLUMNS]

dataD.info()
dataD.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26054 entries, 0 to 26053
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   region           26054 non-null  object
 1   region_sub       26054 non-null  object
 2   apt_name         26054 non-null  object
 3   apt_size         26054 non-null  object
 4   apt_floor        26054 non-null  int64 
 5   contract_date    26054 non-null  object
 6   contract_price   26054 non-null  object
 7   completion_year  26054 non-null  int64 
 8   street_num       26054 non-null  object
 9   street_name      26054 non-null  object
dtypes: int64(2), object(8)
memory usage: 2.0+ MB


Unnamed: 0,region,region_sub,apt_name,apt_size,apt_floor,contract_date,contract_price,completion_year,street_num,street_name
0,강원도,강릉시 견소동,송정한신,84.94,5,2012-08-24,130000000,1997,202,경강로2539번길 8
1,강원도,강릉시 견소동,송정한신,59.8,15,2012-08-27,85000000,1997,202,경강로2539번길 8
2,강원도,강릉시 견소동,송정한신,59.8,1,2012-08-31,60000000,1997,202,경강로2539번길 8
3,강원도,강릉시 견소동,송정해변신도브래뉴아파트,84.99,7,2012-08-08,150000000,2005,289,경강로2539번길 22
4,강원도,강릉시 견소동,송정해변신도브래뉴아파트,84.99,9,2012-08-09,165000000,2005,289,경강로2539번길 22


In [11]:
# data-files 폴더에 수정한 데이터 csv 파일 저장
dataD.to_csv(data_files[0], mode='w', header=True, index=False)

In [8]:
# 파일 일괄 수정 및 저장 함수
def make_new_files(files_zip):
    
    total_count = 0
    for raw_path, data_path in files_zip:
        fileD = pd.read_excel(raw_path, index_col=None, header=None, skiprows=17)
        fileD.columns = RAW_FILE_COLUMNS
        
        rawD = fileD.apply(buildRawD, axis=1)
        dataD = rawD.loc[:, DATA_FILE_COLUMNS]

        dataD.to_csv(data_path, mode='w', header=True, index=False)

        data_count = len(dataD)
        total_count += data_count
        print(f'complete {data_path}, data count: {data_count}')
    
    print(f'total data count: {total_count}')

In [9]:
# 일괄 파일 만들기
# data-files 폴더 지우고 실행
# 소요 시간 3시간 15분!!!
# total data count: 4800528

# make_new_files(zip(raw_files, data_files))

complete data-files/201208.csv, data count: 26054
complete data-files/201209.csv, data count: 34032
complete data-files/201210.csv, data count: 45854
complete data-files/201211.csv, data count: 42309
complete data-files/201212.csv, data count: 37772
complete data-files/201301.csv, data count: 31650
complete data-files/201302.csv, data count: 38174
complete data-files/201303.csv, data count: 53797
complete data-files/201304.csv, data count: 56387
complete data-files/201305.csv, data count: 53053
complete data-files/201306.csv, data count: 49828
complete data-files/201307.csv, data count: 31438
complete data-files/201308.csv, data count: 41045
complete data-files/201309.csv, data count: 51796
complete data-files/201310.csv, data count: 61687
complete data-files/201311.csv, data count: 45881
complete data-files/201312.csv, data count: 48839
complete data-files/201401.csv, data count: 48870
complete data-files/201402.csv, data count: 58396
complete data-files/201403.csv, data count: 62068


In [6]:
# 생성 파일 로드 테스트
testD = pd.read_csv(data_files[0])
testD.info()
testD.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26054 entries, 0 to 26053
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   region           26054 non-null  object 
 1   region_sub       26054 non-null  object 
 2   apt_name         26054 non-null  object 
 3   apt_size         26054 non-null  float64
 4   apt_floor        26054 non-null  int64  
 5   contract_date    26054 non-null  object 
 6   contract_price   26054 non-null  int64  
 7   completion_year  26054 non-null  int64  
 8   street_num       26054 non-null  object 
 9   street_name      26054 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 2.0+ MB


Unnamed: 0,region,region_sub,apt_name,apt_size,apt_floor,contract_date,contract_price,completion_year,street_num,street_name
0,강원도,강릉시 견소동,송정한신,84.94,5,2012-08-24,130000000,1997,202,경강로2539번길 8
1,강원도,강릉시 견소동,송정한신,59.8,15,2012-08-27,85000000,1997,202,경강로2539번길 8
2,강원도,강릉시 견소동,송정한신,59.8,1,2012-08-31,60000000,1997,202,경강로2539번길 8
3,강원도,강릉시 견소동,송정해변신도브래뉴아파트,84.99,7,2012-08-08,150000000,2005,289,경강로2539번길 22
4,강원도,강릉시 견소동,송정해변신도브래뉴아파트,84.99,9,2012-08-09,165000000,2005,289,경강로2539번길 22


In [7]:
testD['region'].unique()

array(['강원도', '경기도', '경상남도', '경상북도', '광주광역시', '대구광역시', '대전광역시', '부산광역시',
       '서울특별시', '세종특별자치시', '울산광역시', '인천광역시', '전라남도', '전라북도', '제주특별자치도',
       '충청남도', '충청북도'], dtype=object)