In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium 
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from glob import glob
import os
# Create a dask ProgressBar and register it so dask computations report their progress.
pbar = ProgressBar()
pbar.register()

from tqdm import tqdm_notebook as tqdm
import geopandas as gpd

### 좌표계 변환 (EPSG:5186 → EPSG:5179)

In [None]:
from pyproj import Proj, transform
# Source (EPSG:5186) and target (EPSG:5179) projections used by transform_axis.
# NOTE(review): the Proj(init=...) form and pyproj.transform are deprecated in pyproj 2.x;
# confirm the installed pyproj version before upgrading the environment.
proj_EPSG_5186 = Proj(init = 'epsg:5186')
# proj_WGS84 = Proj(init = 'epsg:4326')
proj_EPSG_5179 = Proj(init = 'epsg:5179')

def transform_axis(df):
    """Convert one coordinate pair from EPSG:5186 to EPSG:5179.

    Parameters
    ----------
    df : mapping-like
        Anything indexable by ``'X_AXIS'`` and ``'Y_AXIS'``; in this notebook it is a
        single DataFrame row, because the function is applied with ``axis=1``.

    Returns
    -------
    pandas.Series
        The converted pair, indexed ``['X', 'Y']``.
    """
    x_src = df['X_AXIS']
    y_src = df['Y_AXIS']
    converted = transform(proj_EPSG_5186, proj_EPSG_5179, x_src, y_src)
    return pd.Series(converted, index=['X', 'Y'])


In [None]:
def change_axis(path):
    '''
    Drop rows with missing counts, convert the coordinates and save the result.

    Reads the CSV at ``path``, removes every row whose WEEKDAY or WEEKEND value is the
    literal backslash-N marker, converts each distinct (X_AXIS, Y_AXIS) pair with
    ``transform_axis`` and writes the frame, with ``new_X`` / ``new_y`` replacing the
    original coordinate columns, to ``./data/KT 데이터/resi(new)/5179_<file name>``.

    Parameters
    ----------
    path : str
        Path of the source CSV file.

    Returns
    -------
    None
        The result is only written to disk.
    '''
    # basename copes with whichever separator glob produced on this platform. The earlier
    # split on a backslash returned the file name only for Windows-style paths containing
    # exactly one backslash and raised IndexError for forward-slash paths.
    file_name = os.path.basename(path)

    sample = pd.read_csv(path)
    sample = sample[(sample.WEEKDAY != '\\N')&(sample.WEEKEND != '\\N')]

    # Convert every distinct coordinate pair once, then join the result back onto the rows.
    only_axis = sample[['X_AXIS','Y_AXIS']].drop_duplicates()
    only_axis[['new_X','new_y']] = only_axis.apply(transform_axis,axis=1)

    sample_new = pd.merge(sample,only_axis,on = ['X_AXIS','Y_AXIS'], how = 'left')
    sample_new = sample_new.drop(['X_AXIS','Y_AXIS'],axis = 1)

    # Save; create the output folder first so a fresh checkout does not fail here.
    out_dir = './data/KT 데이터/resi(new)'
    os.makedirs(out_dir, exist_ok=True)
    sample_new.to_csv(out_dir+"/5179_"+file_name,index= False)

In [None]:
# Raw CSV files that are fed to change_axis one by one.
paths = glob('./data/KT 데이터/resi/*.csv')

In [None]:
# Convert every raw file; each call writes a "5179_"-prefixed copy under resi(new).
for path in tqdm(paths):
    change_axis(path)

In [None]:
# Converted files of every processed year. The earlier version of this cell interpolated
# `year` before any cell had defined it, which raises NameError on a fresh kernel.
paths = [
    converted
    for yr in (2016, 2017)
    for converted in glob(f'./data/KT 데이터/resi(new)/{yr}/*.csv')
]

In [None]:
# Build one combined CSV per year.
for year in tqdm([2016,2017]):
    paths = glob(f'./data/KT 데이터/resi(new)/{year}/*.csv')
    # Collect the frames and concatenate once: calling pd.concat inside the loop re-copies
    # everything accumulated so far on every iteration.
    frames = []
    for path in tqdm(paths):
        sample = pd.read_csv(path)
        frames.append(sample)
    # pd.concat raises on an empty list, so keep the empty-frame result when nothing matched.
    total_df = pd.concat(frames) if frames else pd.DataFrame()
    # The index is written on purpose: the cells that re-read this file drop its first column.
    total_df.to_csv(f'./data/KT 데이터/{year}_resi통합.csv')

In [None]:
%%time
# Load the per-year combined files. They were saved by to_csv without index=False,
# so the first column of each frame is the saved index.
resi_2016 = pd.read_csv('./data/KT 데이터/2016_resi통합.csv')
resi_2017 = pd.read_csv('./data/KT 데이터/2017_resi통합.csv')

In [None]:
def near_park(df, x_range=(962000, 964100), y_range=(1949300, 1951100),
              exclude_corner=(962250, 1949600)):
    '''Keep only the rows whose point lies in the area around the park.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``new_X`` and ``new_y`` columns.
    x_range, y_range : tuple of (min, max), optional
        Inclusive bounds of the bounding box, in the same units as ``new_X`` / ``new_y``.
        The defaults are the bounds around the park used throughout this notebook.
    exclude_corner : tuple of (x, y) or None, optional
        Rows with ``new_X <= x`` and ``new_y <= y`` are removed; the default cuts out the
        Konkuk University commercial district. Pass ``None`` to keep the whole box.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, with their original index.
    '''
    xmin, xmax = x_range
    ymin, ymax = y_range

    # Series.between is inclusive on both ends, matching the original >= / <= comparisons.
    keep = df['new_X'].between(xmin, xmax) & df['new_y'].between(ymin, ymax)

    if exclude_corner is not None:
        corner_x, corner_y = exclude_corner
        in_corner = (df['new_X'] <= corner_x) & (df['new_y'] <= corner_y)
        keep = keep & ~in_corner

    return df[keep]

In [None]:
%%time
# .iloc[:, 1:] drops the first column of each loaded frame before the spatial filter.
resi_2016_np = near_park(resi_2016.iloc[:,1:])
resi_2017_np = near_park(resi_2017.iloc[:,1:])

In [None]:
# Compare the frame's shape after and before the spatial filter.
resi_2016_np.shape,resi_2016.shape

In [None]:
# Result of clustering the area into 11 zones.
# NOTE(review): absolute, machine-specific path; it will not resolve on another machine.
label_line = gpd.read_file('E:/python/data/신한카드데이터/라벨별_명칭_geo값.shp')

In [None]:
# Outline the label polygons and overlay the distinct points of the 2016 park-area data.
ax = label_line.plot(color= 'w',edgecolor = 'black')
# Named park_points rather than `dd`: that name is the dask.dataframe alias from the import
# cell, and rebinding it here would break any later dask call.
park_points = resi_2016_np[['new_X','new_y']].drop_duplicates()
# Draw on the axes returned by the polygon plot instead of relying on the implicit current axes.
ax.scatter(x = "new_X",y = 'new_y',data= park_points,alpha = 0.3);

In [None]:
def add_label(df, label_line=None):
    '''
    Attach the name of the zone polygon that contains each (new_X, new_y) point.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``new_X`` and ``new_y`` columns.
    label_line : geopandas.GeoDataFrame, optional
        Zone polygons with a ``names`` column and a ``geometry`` column. When omitted,
        the shapefile is read from its hard-coded location, as this function always did.

    Returns
    -------
    pandas.DataFrame
        ``df`` with an added ``label`` column. Rows whose point lies in no polygon are
        removed, and so is every row that has a missing value in any column.
    '''
    from shapely.geometry import Point

    if label_line is None:
        # NOTE(review): absolute, machine-specific path; pass label_line to avoid it.
        label_line = gpd.read_file('E:/python/data/신한카드데이터/라벨별_명칭_geo값.shp')

    # Lookup table: zone name -> polygon.
    label_dict = label_line.set_index('names')['geometry'].to_dict()

    # Label each distinct point once instead of once per row.
    origin_axis = df[['new_X','new_y']].drop_duplicates()

    label_names = []
    for x, y in zip(origin_axis['new_X'], origin_axis['new_y']):
        # Build the point once per coordinate pair, not once per polygon.
        check_point = Point(x, y)
        label = np.nan
        # No break: if polygons overlap, the last matching one wins, as before.
        for key, polygon in label_dict.items():
            if polygon.contains(check_point):
                label = key
        label_names.append(label)
    origin_axis = origin_axis.assign(label=label_names)

    # Exclude points that matched no polygon (outside the zones).
    origin_axis = origin_axis[origin_axis['label'].notna()]

    label_df = pd.merge(df, origin_axis, how = 'left', on = ['new_X','new_y'])
    # NOTE(review): dropna() without subset also removes rows that have a label but a
    # missing value in another column; confirm whether subset=['label'] was intended.
    label_df = label_df.dropna()
    return label_df

In [None]:
%%time
# Attach zone labels to the park-area rows of each year.
resi_2016_l = add_label(resi_2016_np)
resi_2017_l = add_label(resi_2017_np)

In [None]:
def comefrom_rate_b(df):
    '''
    Write, per label, the percentage share of each origin district to CSV files.

    WEEKDAY and WEEKEND are summed per (label, FROM_ID); FROM_ID is turned into an
    8-digit code and joined to ``n_h_dong`` on ``dong_code``. For every label and for
    each of the two columns, the share of each ``address`` (in percent, rounded to two
    decimals, sorted descending) is written to ``./output/KT/유입지비율`` in cp949.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``YYYYMM``, ``label``, ``FROM_ID``, ``WEEKDAY`` and ``WEEKEND``.
        The year in the file names is taken from the first row's ``YYYYMM``, so an empty
        frame raises IndexError.

    Returns
    -------
    None
        The result is only written to disk.

    Notes
    -----
    NOTE(review): relies on a module-level ``n_h_dong`` table providing ``dong_code`` and
    ``address`` columns; it is not defined in the visible cells, so make sure it is loaded
    before calling this function.
    '''
    year = str(df['YYYYMM'].iloc[0])[:4]
    # Output location.
    path = './output/KT/유입지비율'
    os.makedirs(path,exist_ok = True)
    # Totals over the whole period. The value columns are selected with a list: tuple-style
    # selection on a groupby was deprecated in pandas 1.0 and raises in pandas 2.0.
    full_df = df.groupby(['label','FROM_ID'])[['WEEKDAY','WEEKEND']].sum().reset_index()
    # Decode the administrative district: drop any decimal part, right-pad with zeros to 8 digits.
    full_df['FROM_ID'] = full_df['FROM_ID'].apply(lambda x: int(str(x).split('.')[0].ljust(8,'0')))
    full_df_m = pd.merge(full_df,n_h_dong, how = 'left', left_on = 'FROM_ID', right_on = 'dong_code')

    for label in full_df_m.label.unique():
        sample = full_df_m[full_df_m.label == label]
        for c in ['WEEKDAY','WEEKEND']:
            sample_part = sample.set_index('address')[c]
            sample_part = ((sample_part/sample_part.sum())*100).round(2).sort_values(ascending= False)
            sample_part.to_csv(f'{path}/{year}_{label}_{c}에 따른 유입지비율.csv',encoding = 'cp949')

In [None]:
# Write the origin-share CSV files for each year's labelled data.
comefrom_rate_b(resi_2016_l)
comefrom_rate_b(resi_2017_l)