In [66]:
# encoding: utf8
from __future__ import print_function
import multiprocessing
from joblib import Parallel, delayed

import argparse
import tqdm
import os
import glob
import pandas as pd

# trade
# area_code,transaction_amount,year_of_construction,transaction_year,legal_dong,apt_name,transaction_month,transaction_day,dedicated_area,jibun,floor,해제사유발생일,해제여부,si,gu,sigungu,area,dedicated_area_level,amount_per_area,transaction_date,description
# rent
# area_code,year_of_construction,transaction_year,legal_dong,deposit,apt_name,transaction_month,monthly_rent,transaction_day,dedicated_area,jibun,floor,si,gu,sigungu,sale_type,transaction_amount,area,dedicated_area_level,amount_per_area,transaction_date,description
COLS = {}
COLS['apt-trade'] = ['si', 'gu', 'sigungu', 'legal_dong', 'apt_name', 'transaction_amount', 'transaction_date', 'description',
                     'transaction_year', 'transaction_month', 'floor', 'dedicated_area', 'year_of_construction']
COLS['apt-rent'] = ['si', 'gu', 'sigungu', 'legal_dong', 'apt_name', 'transaction_amount', 'transaction_date',
                    'transaction_year', 'transaction_month', 'floor', 'dedicated_area', 'monthly_rent', 'deposit']


def preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    # preprocessing
    df['transaction_amount'] = df['transaction_amount'].astype(float)
    # df['transaction_amount'] = df['transaction_amount'].astype(
    # float).apply(lambda x: round(x / 10000, 2))
    # 2016-05-26 0:00:00
    df['transaction_date'] = pd.to_datetime(
        df['transaction_date'], format="%Y-%m-%d %H:%M:%S").dt.date
    df['transaction_year'] = df['transaction_year'].astype(object)
    df['transaction_month'] = df['transaction_month'].astype(object)
    df['year_of_construction'] = df['year_of_construction'].astype(int)
    df['floor'] = df['floor'].astype(int)
    df['dedicated_area'] = df['dedicated_area'].astype(float)

    if 'monthly_rent' in df.columns:
        df['monthly_rent'] = df['monthly_rent'].astype(int)
    return df
 

def tempFunc(df: pd.DataFrame) -> pd.DataFrame:
    df['mean_transaction_amount'] = df['transaction_amount'].mean()
    return df


def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(
        delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)


def transaction_amount_year(df: pd.DataFrame) -> pd.DataFrame:
    df = df.set_index('apt_name')
    print(len(df))
    frames = []
    print(f' apt counts : {len(df.index)}')
    apt_names = df.index[:10]
    for apt_name in tqdm.tqdm(apt_names):
        data = df.loc[[apt_name]]
        # data = applyParallel(
        # data[['transaction_amount']].groupby(data.index), tempFunc)
        data = data[['transaction_year', 'transaction_amount']
                    ].groupby(data.index).mean()
        data = data.reset_index()
        frames.append(data)
    result_df = pd.concat(frames, axis=0)
    return result_df

In [25]:
area_code_dirs = list(filter(lambda x: 'tar' not in x, glob.glob(os.path.join('../data_in','*'))))

In [28]:
area_code_dirs

['../data_in/41135']

In [67]:
def transaction_amount_year(df: pd.DataFrame) -> pd.DataFrame:
    df = df.set_index('apt_name')
    print(len(df))
    frames = []
    print(f' apt counts : {len(df.index)}')
    apt_names = df.index[:10]
    # apt_names = df.index
    for apt_name in tqdm.tqdm(apt_names):
        data = df.loc[[apt_name]]
        # data = applyParallel(
        # data[['transaction_amount']].groupby(data.index), tempFunc)
        data = data[['transaction_year', 'transaction_amount']
                    ].groupby(data.index).mean()
        print(data)
        data = data.reset_index()
        frames.append(data)
        # print(data)
    result_df = pd.concat(frames, axis=0)
    return result_df

In [None]:
trade_type = 'apt-trade'
for area_code_dir in area_code_dirs: #tqdm.tqdm(area_code_dirs): 
    area_code = area_code_dir.split('/')[-1]
    print(area_code_dir)
    filelist = glob.glob(os.path.join(area_code_dir, '*.csv'))
    frames = []
    print(filelist[:1])
    for filepath in tqdm.tqdm(filelist):
        frame = pd.read_csv(filepath, usecols=COLS[trade_type])
        frames.append(frame)
    df = pd.concat(frames, axis=0)
    df = preprocessing(df)
    df = price_dedicatedarea_floor(df)
    df.to_csv(f'../data_out/price_dedicatedarea_floor/41135.csv', index=False)


# 연도별 평균 거래 금액

In [113]:
def transaction_amount_year(df: pd.DataFrame) -> pd.DataFrame:
    df = df.set_index('apt_name')
    print(len(df))
    frames = []
    print(f'apt counts : {len(df.index)}')
    apt_names = df.index
    # apt_names = df.index
    for apt_name in tqdm.tqdm(apt_names):
        data = df.loc[[apt_name]].reset_index()
        # data = applyParallel(
        # data[['transaction_amount']].groupby(data.index), tempFunc)
        data = data[['apt_name','transaction_year', 'transaction_amount']
                    ].groupby(['apt_name','transaction_year']).mean()
        data = data.reset_index()
        data['transaction_amount'] = data['transaction_amount'].apply(lambda x: round(x / 10000,1))
        frames.append(data)
        # print(data)
    result_df = pd.concat(frames, axis=0)
    return result_df

In [114]:
df = pd.read_csv('../data_in/41135/201604.csv', usecols=COLS[trade_type])
df = preprocessing(df)
transaction_amount_year(df)

3%|▎         | 20/609 [00:00<00:02, 197.16it/s]609
apt counts : 609
100%|██████████| 609/609 [00:02<00:00, 214.46it/s]


Unnamed: 0,apt_name,transaction_year,transaction_amount
0,샛별마을(라이프),2016,6.0
0,장안타운(건영),2016,4.7
0,장안타운(건영),2016,4.7
0,샛별마을(동성),2016,5.0
0,샛별마을(삼부),2016,4.1
...,...,...,...
0,산운마을14단지(경남아너스빌),2016,8.0
0,산운마을5단지(한성필하우스),2016,6.4
0,산운마을6단지(주공휴먼시아),2016,7.7
0,산운마을6단지(주공휴먼시아),2016,7.7


In [115]:
trade_type = 'apt-trade'
for area_code_dir in area_code_dirs: #tqdm.tqdm(area_code_dirs): 
    area_code = area_code_dir.split('/')[-1]
    print(area_code_dir)
    filelist = glob.glob(os.path.join(area_code_dir, '*.csv'))
    frames = []
    print(filelist[:1])
    for filepath in tqdm.tqdm(filelist):
        frame = pd.read_csv(filepath, usecols=COLS[trade_type])
        frames.append(frame)
    df = pd.concat(frames, axis=0)
    df = preprocessing(df)
    # df = price_dedicatedarea_floor(df)
    df = transaction_amount_year(df)
    # print(df.head(3))
    # print(len(df))
    df.to_csv(f'../data_out/apt_amount_per_year/41135.csv', index=False)
    # print('done')

10%|█         | 19/182 [00:00<00:00, 183.12it/s]../data_in/41135
['../data_in/41135/201604.csv']
100%|██████████| 182/182 [00:00<00:00, 201.70it/s]
  0%|          | 24/91030 [00:00<12:32, 120.87it/s]91030
apt counts : 91030
 65%|██████▌   | 59602/91030 [07:22<04:05, 128.19it/s]