1. '임대보증금', '임대료'의 '-'와 0을 결측값(NaN)으로 변환
2. '지역 == 서울특별시' 데이터 제거
3. '공급유형 == 공공임대(5년) or 공공임대(10년) or 장기전세' 데이터 제거
4. '상가수' 변수 생성
5. '단지내주차면수', '등록차량수'에서 '상가수'만큼 제외
6. '임대건물구분 == 상가' 데이터 제거
7. '총세대수 = SUM(전용면적별세대수)' 변수 재생성
8. '단지명' 변수 생성
9. '자격유형', '공급유형' 변수 재생성 (Major Voting)
10. '등록차량수/총세대수' 기준 Outlier 제거
11. '임대료' Imputation

# Load Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl ; mpl.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt ; plt.rcParams['font.family'] = 'AppleGothic'
import seaborn as sns

In [2]:
# Read the raw training table from 'train.csv' (path relative to the working directory).
train = pd.read_csv('train.csv')

# Preprocessing

In [3]:
# Normalise '임대보증금' and '임대료': the '-' placeholder, missing values and
# zeros all end up as NaN, and every remaining value is numeric.
for col in ('임대보증금', '임대료'):
    # Route placeholders through 0 first so the column can be cast to int64;
    # the cast still raises if any other non-numeric text is present.
    as_int = train[col].replace([np.nan, '-'], 0).astype('int64')
    # Assign the result back explicitly. Calling replace(..., inplace=True) on
    # a selected column is not guaranteed to write through to `train`
    # (it does not under pandas Copy-on-Write).
    train[col] = as_int.replace(0, np.nan)

In [4]:
# Drop every row whose '지역' is '서울특별시', then renumber the index from 0.
is_seoul = train['지역'] == '서울특별시'
train = train.loc[~is_seoul].reset_index(drop=True)

In [5]:
# Drop rows whose '공급유형' is '공공임대(5년)', '공공임대(10년)' or '장기전세',
# then renumber the index from 0.
excluded_supply = ['공공임대(5년)', '공공임대(10년)', '장기전세']
keep_supply = ~train['공급유형'].isin(excluded_supply)
train = train.loc[keep_supply].reset_index(drop=True)

In [6]:
# Create '상가수': the number of '상가' rows recorded for each '단지코드'.
shop_rows = train.loc[train['임대건물구분'] == '상가']
store = (
    shop_rows.groupby('단지코드')['임대건물구분']
    .count()
    .rename('상가수')
    .reset_index()
)
# Left join so complexes without any '상가' row are kept; they get a count of 0.
train = train.merge(store, how='left', on='단지코드')
train['상가수'] = train['상가수'].fillna(0)

In [7]:
# Subtract '상가수' from both '단지내주차면수' and '등록차량수', row by row.
for parking_col in ('단지내주차면수', '등록차량수'):
    train[parking_col] = train[parking_col].sub(train['상가수'])

In [8]:
# Drop the '상가' rows themselves and renumber the index from 0.
is_shop = train['임대건물구분'] == '상가'
train = train.loc[~is_shop].reset_index(drop=True)

In [9]:
# Rebuild '총세대수' as the per-'단지코드' sum of '전용면적별세대수'.
noh_by_area = (
    train.groupby('단지코드')['전용면적별세대수']
    .sum()
    .rename('총세대수')
    .reset_index()
)
# Remove the old column before joining so no '_x'/'_y' suffixes are produced;
# the recomputed '총세대수' becomes the last column.
train = train.drop(columns=['총세대수']).merge(noh_by_area, how='left', on='단지코드')

In [10]:
# Create '단지명': look each '단지코드' up in the external complex list
# ('임대주택_단지_조회.csv'), matching on region and total household count.
data = pd.read_csv('임대주택_단지_조회.csv')
data = data.drop(columns=['순번', '전체건수', '최초입주년월'])
data['전용면적'] = np.round(data['전용면적'], 2)
# Keep only the first whitespace-separated token of '지역', spelling
# '세종시' out as '세종특별자치시'.
data['지역'] = [
    '세종특별자치시' if tokens[0] == '세종시' else tokens[0]
    for tokens in data['지역'].str.split()
]

# Recompute '총세대수' in the lookup table as the per-'단지명' sum of
# '전용면적별세대수', mirroring what was done for `train`.
noh_by_area = data.groupby('단지명', as_index=False).agg({'전용면적별세대수': 'sum'})
noh_by_area.columns = ['단지명', '총세대수']
data = data.drop(columns=['총세대수']).merge(noh_by_area, how='left', on='단지명')

# One row per complex in `train`.
train_wo_dup = (
    train.drop_duplicates(subset=['단지코드'])[['단지코드', '지역', '총세대수']]
    .reset_index(drop=True)
)

# Pass 1: accept a name only when exactly one distinct '단지명' in the lookup
# table shares the complex's region and total household count.
train_wo_dup['단지명'] = None
for k, (region, total) in enumerate(zip(train_wo_dup['지역'], train_wo_dup['총세대수'])):
    candidates = data.loc[(data['지역'] == region) & (data['총세대수'] == total), '단지명']
    if candidates.nunique() == 1:
        train_wo_dup.at[k, '단지명'] = candidates.unique()[0]

# Pass 2: for complexes still unnamed, compare the set of '전용면적' values.
null = train_wo_dup[train_wo_dup['단지명'].isnull()].reset_index(drop=True)

# One row per '단지명' (sorted by name) holding its region, total household
# count and the list of its '전용면적' values.
area_lists = data.groupby('단지명')['전용면적'].apply(list).reset_index()
first_rows = data[['단지명', '지역', '총세대수']].drop_duplicates(['단지명'])
data_hhset = area_lists.merge(first_rows, on='단지명')
data_hhset = data_hhset[['단지명', '지역', '총세대수', '전용면적']].reset_index(drop=True)

fallback = {}  # '단지코드' -> best-matching '단지명'
for code, region, total in zip(null['단지코드'], null['지역'], null['총세대수']):
    areaset_train = set(train.loc[train['단지코드'] == code, '전용면적'])
    same = data_hhset.loc[
        (data_hhset['지역'] == region) & (data_hhset['총세대수'] == total)
    ].reset_index(drop=True)
    if same.empty:
        continue
    # Score each candidate by how many of the complex's areas it lacks;
    # np.argmin keeps the first candidate on ties.
    diff = [len(areaset_train - set(areas)) for areas in same['전용면적']]
    fallback[code] = same['단지명'].iloc[int(np.argmin(diff))]

# Write the pass-2 names back as scalars, keyed by '단지코드'. Complexes with
# no candidate stay missing.
missing = train_wo_dup['단지명'].isnull()
train_wo_dup.loc[missing, '단지명'] = train_wo_dup.loc[missing, '단지코드'].map(fallback)

# '단지코드' is unique in train_wo_dup, so this join gives every row of a
# complex the same '단지명'.
train = pd.merge(train, train_wo_dup[['단지코드', '단지명']], how='left', on='단지코드')
train.reset_index(drop=True, inplace=True)

# Percentage of complexes that still have no '단지명'.
print(round(train.loc[train['단지명'].isnull(), '단지코드'].nunique()/train['단지코드'].nunique(), 2)*100)

5.0


In [11]:
# Rebuild '자격유형' and '공급유형' per complex by majority vote, weighting each
# row by its '전용면적별세대수'.

# Rows holding the largest '전용면적별세대수' of each complex.
# NOTE(review): not used by the vote below; kept in case a later cell reads it.
max_code = train[['단지코드', '전용면적별세대수']].groupby(['단지코드']).max()
max_code = pd.merge(max_code, train, on=['단지코드', '전용면적별세대수'])

# Notebook-level names the original cell defined; kept for later cells.
code_name = train['단지코드'].unique()
qualify = train['자격유형'].unique()
supply = train['공급유형'].unique()


def _weighted_majority(frame, column):
    """Return one row per '단지코드' with the dominant value of *column*.

    The dominant value is the one whose rows sum to the most
    '전용면적별세대수' within the complex. Ties go to the value that appears
    first in ``frame[column].unique()``. The tally is sized from the data,
    so any number of distinct values is supported.
    """
    categories = frame[column].unique()
    winners = []
    for code in frame['단지코드'].unique():
        group = frame[frame['단지코드'] == code]
        totals = [
            group.loc[group[column] == category, '전용면적별세대수'].sum()
            for category in categories
        ]
        # np.argmax returns the first maximum, matching list.index(max(...)).
        winners.append((code, categories[int(np.argmax(totals))]))
    return pd.DataFrame(winners, columns=['단지코드', column])


result_qualify = _weighted_majority(train, '자격유형')
result_supply = _weighted_majority(train, '공급유형')

# Swap the row-level columns for the per-complex winners; '공급유형' and
# '자격유형' end up as the last two columns, in that order.
result = pd.merge(result_supply, result_qualify, on='단지코드')
train = train.drop(columns=['공급유형', '자격유형']).merge(result, on='단지코드')

In [12]:
# Remove outlier complexes based on '등록차량수' / '총세대수'.
train['세대별차량'] = train['등록차량수'] / train['총세대수']


def det_out(lower, upper):
    """Return the set of '단지코드' with any row outside [lower, upper].

    Reads the notebook-level `train` and its '세대별차량' column. Rows whose
    ratio is NaN compare False on both sides and are therefore not flagged.
    """
    ratio = train['세대별차량']
    return set(train.loc[(ratio < lower) | (ratio > upper), '단지코드'])


# Bounds assumed by the original analysis: lower=0.1, upper=3.
outlier_codes = det_out(0.1, 3)
# Build a new frame instead of dropping in place on a filtered slice, and
# renumber the index like every other filtering step so that later
# position-based code lines up with the rows.
train = (
    train.loc[~train['단지코드'].isin(outlier_codes)]
    .drop(columns=['세대별차량'])
    .reset_index(drop=True)
)

In [13]:
# Impute missing '임대료' by running impyute's `mice` over the non-object columns.
from impyute.imputation.cs import mice

train['임대료'] = train['임대료'].astype('float64')
train_num = train.loc[:, train.dtypes != object]

# `mice` is given a bare array, so the row labels must be restored explicitly.
# Without index=train_num.index the imputed frame gets a fresh 0..n-1 index and
# the assignment below would align by label against whatever index `train`
# currently has, putting values on the wrong rows or leaving NaN.
train_imp = pd.DataFrame(
    mice(train_num.values),
    columns=train_num.columns,
    index=train_num.index,
)
train['임대료'] = train_imp['임대료']