# Train Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl ; mpl.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt ; plt.rcParams['font.family'] = 'AppleGothic'
import seaborn as sns

In [2]:
train = pd.read_csv('train.csv')

In [3]:
# '임대보증금', '임대료'의 '-'를 0으로 변환
train['임대보증금'].replace([np.nan, '-'], 0, inplace=True)
train['임대보증금'] = train['임대보증금'].astype('int64')
train['임대보증금'].replace(0, np.nan, inplace=True)
train['임대료'].replace([np.nan, '-'], 0, inplace=True)
train['임대료'] = train['임대료'].astype('int64')
train['임대료'].replace(0, np.nan, inplace=True)

In [4]:
# '지역 == 서울특별시' 데이터 제거
train = train[train['지역'] != '서울특별시'].reset_index(drop=True)

In [5]:
# '공급유형 == 공공임대(5년) or 공공임대(10년)' 데이터 제거
train = train[(train['공급유형'] != '공공임대(5년)') & (train['공급유형'] != '공공임대(10년)') & (train['공급유형'] != '장기전세')].reset_index(drop=True)

In [6]:
# '상가수' 변수 생성
store = train[train['임대건물구분'] == '상가'].groupby(['단지코드'], as_index=False).agg({'임대건물구분':'count'})
store.columns = ['단지코드', '상가수']
train = pd.merge(train, store, how='left', on='단지코드')
train['상가수'] = train['상가수'].fillna(0)

In [7]:
# '단지내주차면수', '등록차량수'에서 '상가수'만큼 제외
train['단지내주차면수'] = train['단지내주차면수'] - train['상가수']
train['등록차량수'] = train['등록차량수'] - train['상가수']

In [8]:
# '임대건물구분 == 상가' 데이터 제거
train = train[train['임대건물구분'] != '상가'].reset_index(drop=True)

In [9]:
# '총세대수 = SUM(전용면적별세대수)' 변수 재생성
noh_by_area = train.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'})
noh_by_area.columns = ['단지코드', '총세대수']
train = pd.merge(left=train, right=noh_by_area, how='left', on='단지코드')
train.drop(['총세대수_x'], axis=1, inplace=True)
train.rename(columns={'총세대수_y':'총세대수'}, inplace=True)

In [10]:
# '단지명' 변수 생성
data = pd.read_csv('임대주택_단지_조회.csv')
data.drop(['순번', '전체건수', '최초입주년월'], axis=1, inplace=True)
data['전용면적'] = np.round(data['전용면적'], 2)
data['지역'] = ['세종특별자치시' if i[0] == '세종시' else i[0] for i in data['지역'].str.split()]

noh_by_area = data.groupby('단지명', as_index=False).agg({'전용면적별세대수':'sum'})
noh_by_area.columns = ['단지명', '총세대수']
data = pd.merge(left=data, right=noh_by_area, how='left', on='단지명')
data.drop(['총세대수_x'], axis=1, inplace=True)
data.rename(columns={'총세대수_y':'총세대수'}, inplace=True)

train_wo_dup = train.drop_duplicates(subset=['단지코드'])[['단지코드', '지역', '총세대수']].reset_index(drop=True)
idx = np.column_stack([train.drop_duplicates(subset=['단지코드'])[['지역', '총세대수']].reset_index(drop=True), range(len(train_wo_dup))])

train_wo_dup['단지명'] = None
for i, j, k in idx:
    if data.loc[(data['지역'] == i) & (data['총세대수'] == j), '단지명'].nunique() == 1:
        train_wo_dup.iloc[k, train_wo_dup.columns.get_loc('단지명')] = data.loc[(data['지역'] == i) & (data['총세대수'] == j), '단지명'].unique()[0]

null = train_wo_dup[train_wo_dup['단지명'].isnull()].reset_index(drop=True)
data_hhset = data[['단지명', '전용면적']].groupby(['단지명'])['전용면적'].apply(list)
data_hhset = pd.DataFrame(data_hhset)
data_hhset = pd.merge(data_hhset, data[['단지명', '지역', '총세대수']], on='단지명')
data_hhset = data_hhset[['단지명', '지역', '총세대수', '전용면적']]
data_hhset = data_hhset.drop_duplicates(['단지명']).reset_index(drop=True)

idx = null['단지코드'].unique()
for i in range(len(idx)):
    areaset_train = set(train.loc[train['단지코드'] == idx[i], '전용면적'])
    region = null.loc[i, '지역']
    total_hh2 = null.loc[i,'총세대수']
    same = data_hhset.loc[(data_hhset['지역'] == region) & (data_hhset['총세대수'] == total_hh2)].reset_index(drop=True)
    diff = []
    if len(same.index) != 0:
        for j in range(len(same)):
            areaset_data = set(same.loc[j, '전용면적'])
            diff.append(len(areaset_train - areaset_data))
        null.loc[i, '단지명'] = same.loc[np.argmin(diff), '단지명']


for i in range(len(train_wo_dup)):
    if train_wo_dup.iloc[i, train_wo_dup.columns.get_loc('단지명')] == None:
        train_wo_dup.iloc[i, train_wo_dup.columns.get_loc('단지명')] = null.loc[null['단지코드'] == train_wo_dup.iloc[i, train_wo_dup.columns.get_loc('단지코드')], '단지명']
    else:
        pass

train = pd.merge(train, train_wo_dup[['단지코드', '단지명']], how='left', on='단지코드')

code_name = train['단지코드'].unique()
for i in range(len(code_name)):
    dat = train.loc[train['단지코드'] == code_name[i],:]
    if (dat.shape[0] != dat.단지명.isnull().sum()) & (dat.shape[0] != dat.단지명.notnull().sum()):
        train.loc[train['단지코드'] == code_name[i],'단지명'] = dat.loc[dat['단지명'].notnull(),'단지명']
    else:
        pass

train.reset_index(drop=True, inplace=True)

print(round(train.loc[train['단지명'].isnull(), '단지코드'].nunique()/train['단지코드'].nunique(), 2)*100)

5.0


In [11]:
# '자격유형', '공급유형' 변수 재생성 (Major Voting)
max_code = train[['단지코드', '전용면적별세대수']].groupby(['단지코드']).max()
max_code = pd.merge(max_code, train, on=['단지코드', '전용면적별세대수'])

code_name = train['단지코드'].unique()
qualify = train['자격유형'].unique()
result_qualify = pd.DataFrame(columns=['단지코드', '자격유형'])
result_qualify['단지코드'] = train['단지코드'].unique()

for i in range(len(code_name)):
    dat = train[train['단지코드'] == code_name[i]]
    num_qualified = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    
    for j in range(len(qualify)):
        num_qualified[j] = sum(dat.loc[dat['자격유형'] == qualify[j],'전용면적별세대수'])
    
    result_qualify.loc[result_qualify['단지코드'] == code_name[i],'자격유형'] = qualify[num_qualified.index(max(num_qualified))]

code_name = train.단지코드.unique()
supply = train.공급유형.unique()
result_supply = pd.DataFrame(columns=['단지코드', '공급유형'])
result_supply['단지코드'] = train.단지코드.unique()

for i in range(len(code_name)):
    dat= train[train['단지코드'] == code_name[i]]
    num_supplied=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    
    for j in range(len(supply)):
        num_supplied[j] = sum(dat.loc[dat['공급유형'] == supply[j],'전용면적별세대수'])

    result_supply.loc[result_supply['단지코드'] == code_name[i],'공급유형'] = supply[num_supplied.index(max(num_supplied))]

result = pd.merge(result_supply, result_qualify, on='단지코드')
train = pd.merge(train, result, on='단지코드')
train.drop(['공급유형_x', '자격유형_x'], axis=1, inplace=True)
train.rename(columns={'공급유형_y':'공급유형', '자격유형_y':'자격유형'}, inplace=True)

In [12]:
# '등록차량수/총세대수' 기준 Outlier 제거
train['세대별차량'] = train['등록차량수'] / train['총세대수']

def det_out(lower, upper):
    return(set(train['단지코드'][train['세대별차량'] < lower]).union(set(train['단지코드'][train['세대별차량'] > upper])))

train = train[~train['단지코드'].isin(det_out(0.1,3))] # lower=0.1, upper=3 가정
train.drop(['세대별차량'], axis=1, inplace=True)

In [13]:
train.reset_index(inplace=True, drop=True)

In [14]:
# '임대료' Imputation
from impyute.imputation.cs import mice
train['임대료']=train['임대료'].astype('float64')
train_num = train.loc[:,train.dtypes!=object]
train_imp = mice(train_num.values)
train_imp = pd.DataFrame(train_imp)
train_imp.columns = train_num.columns
train['임대료'] = train_imp['임대료']

In [15]:
# '소형세대', '중형세대', '대형세대' 변수 생성
size= pd.DataFrame(columns=['단지코드', '소형세대', '중형세대', '대형세대'])
size['단지코드'] = train.단지코드.unique()
code=size['단지코드']

for i in range(len(code)):
    house = train[train['단지코드'] == code[i]]
    size.loc[i, '소형세대'] = sum(house.loc[house['전용면적'] < 40,'전용면적별세대수'])
    size.loc[i, '중형세대'] = sum(house.loc[(house['전용면적'] >= 40) & (house['전용면적'] < 80) ,'전용면적별세대수'])
    size.loc[i, '대형세대'] = sum(house.loc[house['전용면적'] >= 80,'전용면적별세대수'])

size['소형세대'] = size['소형세대'].astype('int64')
size['중형세대'] = size['중형세대'].astype('int64')
size['대형세대'] = size['대형세대'].astype('int64')

train = pd.merge(train, size, on='단지코드', how='left')
train['소형세대'] = train['소형세대'] / train['총세대수']
train['중형세대'] = train['중형세대'] / train['총세대수']
train['대형세대'] = train['대형세대'] / train['총세대수']

In [16]:
# '임대료' 변수 재생성 (전용면적별세대수 고려)
train['임대료'] = train['전용면적별세대수'] * train['임대료']
rental_fee = train.groupby('단지코드', as_index=False).agg({'임대료':'sum'})
train = pd.merge(train, rental_fee, on='단지코드')
train.drop(['임대료_x'], axis=1, inplace=True)
train.rename(columns={'임대료_y':'임대료'}, inplace=True)
train['임대료'] = round(train['임대료']/train['총세대수'], 2)

In [17]:
train['도보 10분거리 내 지하철역 수(환승노선 수 반영)'] = train['도보 10분거리 내 지하철역 수(환승노선 수 반영)'].fillna(0)

In [18]:
train.drop(['임대건물구분', '전용면적', '전용면적별세대수', '임대보증금'], axis=1, inplace=True)

In [19]:
train = train.drop_duplicates().reset_index(drop=True)

In [20]:
train.head()

Unnamed: 0,단지코드,지역,공가수,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수,상가수,총세대수,단지명,공급유형,자격유형,소형세대,중형세대,대형세대,임대료
0,C2483,경상북도,38.0,0.0,3.0,1425.0,1015.0,0.0,900,포항장량 1단지,국민임대,A,0.165556,0.834444,0.0,173838.88
1,C2515,경상남도,17.0,0.0,3.0,624.0,205.0,0.0,545,통영미수2단지,국민임대,A,0.653211,0.346789,0.0,119431.27
2,C1407,대전광역시,13.0,1.0,1.0,1285.0,1064.0,0.0,1216,"대전도안서남부4bl(06,기) 4단지",국민임대,A,0.320724,0.679276,0.0,171132.24
3,C1945,경기도,6.0,1.0,3.0,734.0,730.0,0.0,755,남양주호평6단지,국민임대,B,0.317881,0.682119,0.0,187414.21
4,C1470,전라북도,14.0,0.0,2.0,645.0,553.0,0.0,696,"익산장신(03,주2) 1단지",국민임대,A,0.364943,0.635057,0.0,125449.22


---

# Test Data Preprocessing

In [21]:
test = pd.read_csv('test.csv')

In [22]:
# '임대보증금', '임대료'의 '-'를 0으로 변환
test['임대보증금'].replace([np.nan, '-'], 0, inplace=True)
test['임대보증금'] = test['임대보증금'].astype('int64')
test['임대보증금'].replace(0, np.nan, inplace=True)
test['임대료'].replace([np.nan, '-'], 0, inplace=True)
test['임대료'] = test['임대료'].astype('int64')
test['임대료'].replace(0, np.nan, inplace=True)

In [23]:
# '상가수' 변수 생성
store = test[test['임대건물구분'] == '상가'].groupby(['단지코드'], as_index=False).agg({'임대건물구분':'count'})
store.columns = ['단지코드', '상가수']
test = pd.merge(test, store, how='left', on='단지코드')
test['상가수'] = test['상가수'].fillna(0)

In [24]:
# '단지내주차면수', '등록차량수'에서 '상가수'만큼 제외
test['단지내주차면수'] = test['단지내주차면수'] - test['상가수']

In [25]:
# '임대건물구분 == 상가' 데이터 제거
test = test[test['임대건물구분'] != '상가'].reset_index(drop=True)

In [26]:
# '총세대수 = SUM(전용면적별세대수)' 변수 재생성
noh_by_area = test.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'})
noh_by_area.columns = ['단지코드', '총세대수']
test = pd.merge(left=test, right=noh_by_area, how='left', on='단지코드')
test.drop(['총세대수_x'], axis=1, inplace=True)
test.rename(columns={'총세대수_y':'총세대수'}, inplace=True)

In [27]:
# '단지명' 변수 생성
data = pd.read_csv('임대주택_단지_조회.csv')
data.drop(['순번', '전체건수', '최초입주년월'], axis=1, inplace=True)
data['전용면적'] = np.round(data['전용면적'], 2)
data['지역'] = ['세종특별자치시' if i[0] == '세종시' else i[0] for i in data['지역'].str.split()]

noh_by_area = data.groupby('단지명', as_index=False).agg({'전용면적별세대수':'sum'})
noh_by_area.columns = ['단지명', '총세대수']
data = pd.merge(left=data, right=noh_by_area, how='left', on='단지명')
data.drop(['총세대수_x'], axis=1, inplace=True)
data.rename(columns={'총세대수_y':'총세대수'}, inplace=True)

test_wo_dup = test.drop_duplicates(subset=['단지코드'])[['단지코드', '지역', '총세대수']].reset_index(drop=True)
idx = np.column_stack([test.drop_duplicates(subset=['단지코드'])[['지역', '총세대수']].reset_index(drop=True), range(len(test_wo_dup))])

test_wo_dup['단지명'] = None
for i, j, k in idx:
    if data.loc[(data['지역'] == i) & (data['총세대수'] == j), '단지명'].nunique() == 1:
        test_wo_dup.iloc[k, test_wo_dup.columns.get_loc('단지명')] = data.loc[(data['지역'] == i) & (data['총세대수'] == j), '단지명'].unique()[0]

null = test_wo_dup[test_wo_dup['단지명'].isnull()].reset_index(drop=True)
data_hhset = data[['단지명', '전용면적']].groupby(['단지명'])['전용면적'].apply(list)
data_hhset = pd.DataFrame(data_hhset)
data_hhset = pd.merge(data_hhset, data[['단지명', '지역', '총세대수']], on='단지명')
data_hhset = data_hhset[['단지명', '지역', '총세대수', '전용면적']]
data_hhset = data_hhset.drop_duplicates(['단지명']).reset_index(drop=True)

idx = null['단지코드'].unique()
for i in range(len(idx)):
    areaset_test = set(test.loc[test['단지코드'] == idx[i], '전용면적'])
    region = null.loc[i, '지역']
    total_hh2 = null.loc[i,'총세대수']
    same = data_hhset.loc[(data_hhset['지역'] == region) & (data_hhset['총세대수'] == total_hh2)].reset_index(drop=True)
    diff = []
    if len(same.index) != 0:
        for j in range(len(same)):
            areaset_data = set(same.loc[j, '전용면적'])
            diff.append(len(areaset_test - areaset_data))
        null.loc[i, '단지명'] = same.loc[np.argmin(diff), '단지명']


for i in range(len(test_wo_dup)):
    if test_wo_dup.iloc[i, test_wo_dup.columns.get_loc('단지명')] == None:
        test_wo_dup.iloc[i, test_wo_dup.columns.get_loc('단지명')] = null.loc[null['단지코드'] == test_wo_dup.iloc[i, test_wo_dup.columns.get_loc('단지코드')], '단지명']
    else:
        pass

test = pd.merge(test, test_wo_dup[['단지코드', '단지명']], how='left', on='단지코드')

code_name = test['단지코드'].unique()
for i in range(len(code_name)):
    dat = test.loc[test['단지코드'] == code_name[i],:]
    if (dat.shape[0] != dat.단지명.isnull().sum()) & (dat.shape[0] != dat.단지명.notnull().sum()):
        test.loc[test['단지코드'] == code_name[i],'단지명'] = dat.loc[dat['단지명'].notnull(),'단지명']
    else:
        pass

test.reset_index(drop=True, inplace=True)

print(round(test.loc[test['단지명'].isnull(), '단지코드'].nunique()/test['단지코드'].nunique(), 2)*100)

7.000000000000001


In [28]:
# '자격유형', '공급유형' 변수 재생성 (Major Voting)
max_code = test[['단지코드', '전용면적별세대수']].groupby(['단지코드']).max()
max_code = pd.merge(max_code, test, on=['단지코드', '전용면적별세대수'])

code_name = test['단지코드'].unique()
qualify = test['자격유형'].unique()
result_qualify = pd.DataFrame(columns=['단지코드', '자격유형'])
result_qualify['단지코드'] = test['단지코드'].unique()

for i in range(len(code_name)):
    dat= test[test['단지코드']==code_name[i]]
    num_qualified=[0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0]
    
    for j in range(len(qualify)):
        num_qualified[j]=sum(dat.loc[dat['자격유형']==qualify[j],'전용면적별세대수'])
    
    result_qualify.loc[result_qualify['단지코드']== code_name[i],'자격유형']=qualify[num_qualified.index(max(num_qualified))]

code_name= test.단지코드.unique()
supply= test.공급유형.unique()
result_supply = pd.DataFrame( columns=['단지코드', '공급유형'])
result_supply['단지코드']=test.단지코드.unique()

for i in range(len(code_name)):
    dat= test[test['단지코드']==code_name[i]]
    num_supplied=[0,0,0,0,0, 0,0,0,0,0]
    
    for j in range(len(supply)):
        num_supplied[j]=sum(dat.loc[dat['공급유형']==supply[j],'전용면적별세대수'])

    result_supply.loc[result_supply['단지코드']== code_name[i],'공급유형']=supply[num_supplied.index(max(num_supplied))]

result = pd.merge(result_supply, result_qualify, on='단지코드')
test = pd.merge(test, result, on='단지코드')
test.drop(['공급유형_x', '자격유형_x'], axis=1, inplace=True)
test.rename(columns={'공급유형_y':'공급유형', '자격유형_y':'자격유형'}, inplace=True)

In [29]:
test.reset_index(inplace=True, drop=True)

In [30]:
# '임대료' Imputation
from impyute.imputation.cs import mice
test['임대료']=test['임대료'].astype('float64')
test_num = test.loc[:,test.dtypes!=object]
test_imp = mice(test_num.values)
test_imp = pd.DataFrame(test_imp)
test_imp.columns = test_num.columns
test['임대료'] = test_imp['임대료']

In [31]:
# '소형세대', '중형세대', '대형세대' 변수 생성
size = pd.DataFrame(columns=['단지코드','소형세대', '중형세대','대형세대'])
size['단지코드'] = test.단지코드.unique()
code = size['단지코드']

for i in range(len(code)):
    house = test[test['단지코드'] == code[i]]
    size.loc[i, '소형세대'] = sum(house.loc[house['전용면적'] < 40,'전용면적별세대수'])
    size.loc[i, '중형세대'] = sum(house.loc[(house['전용면적'] >= 40) & (house['전용면적'] < 80) ,'전용면적별세대수'])
    size.loc[i, '대형세대'] = sum(house.loc[house['전용면적'] >= 80,'전용면적별세대수'])

size['소형세대'] = size['소형세대'].astype('int64')
size['중형세대'] = size['중형세대'].astype('int64')
size['대형세대'] = size['대형세대'].astype('int64')

test = pd.merge(test, size, on='단지코드', how='left')
test['소형세대'] = test['소형세대'] / test['총세대수']
test['중형세대'] = test['중형세대'] / test['총세대수']
test['대형세대'] = test['대형세대'] / test['총세대수']

In [32]:
# '임대료' 변수 재생성 (전용면적별세대수 고려)
test['임대료'] = test['전용면적별세대수'] * test['임대료']
rental_fee = test.groupby('단지코드', as_index=False).agg({'임대료':'sum'})
test = pd.merge(test, rental_fee, on='단지코드')
test.drop(['임대료_x'], axis=1, inplace=True)
test.rename(columns={'임대료_y':'임대료'}, inplace=True)
test['임대료'] = round(test['임대료']/test['총세대수'], 2)

In [33]:
test['도보 10분거리 내 지하철역 수(환승노선 수 반영)'] = test['도보 10분거리 내 지하철역 수(환승노선 수 반영)'].fillna(0)

In [34]:
test.drop(['임대건물구분', '전용면적', '전용면적별세대수', '임대보증금'], axis=1, inplace=True)

In [35]:
test = test.drop_duplicates().reset_index(drop=True)

In [36]:
test.head()

Unnamed: 0,단지코드,지역,공가수,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,상가수,총세대수,단지명,공급유형,자격유형,소형세대,중형세대,대형세대,임대료
0,C1072,경기도,14.0,0.0,2.0,683.0,0.0,754,"광명역세권(03,택) 2단지",국민임대,H,0.153846,0.846154,0.0,264048.25
1,C1128,경기도,9.0,0.0,3.0,1216.0,0.0,1354,"광명역세권(03,택) 1단지",국민임대,H,0.310192,0.689808,0.0,251290.04
2,C1456,부산광역시,18.0,0.0,16.0,547.0,0.0,619,"부산안락3(06,주)4단지 4단지",국민임대,A,0.399031,0.600969,0.0,224706.19
3,C1840,전라북도,7.0,0.0,3.0,543.0,0.0,593,"익산장신(03,주2) 3단지",국민임대,A,0.564924,0.435076,0.0,120287.54
4,C1332,경기도,11.0,0.0,2.0,1112.0,0.0,1297,성남판교 봇들 6단지 (A18-2BL),국민임대,H,0.327679,0.672321,0.0,284193.62


---

# Baseline

In [37]:
train_dummy = pd.get_dummies(train, columns=['지역'], drop_first=True)

In [38]:
X = train_dummy.drop(['단지코드', '단지명', '등록차량수', '공급유형', '자격유형'], axis=1)
y = train_dummy['등록차량수']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
store = X_test['상가수']
X_train.drop(['상가수'], axis=1, inplace=True)
X_test.drop(['상가수'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [39]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test) + store

In [40]:
from sklearn.metrics import mean_absolute_error as mae
mae(y_test, y_pred)

121.06927927927929