# Preprocessing

In [1]:
import numpy as np
import seaborn as sns
from impyute.imputation.cs import mice
import pandas as pd ; pd.options.display.max_rows = 150
import matplotlib as mpl ; mpl.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt ; plt.rcParams['font.family'] = 'AppleGothic'

In [2]:
### Public Data importing(myhome_xyfill은 크롤링 데이터입니다.)
code_age_gender = pd.read_csv('한국토지주택공사_임대주택 단지별 연령대별 성별정보_20210511.csv', encoding='CP949')
myhome_xyfill = pd.read_csv('myhome_xyfill.csv').drop_duplicates()
data = pd.read_csv('임대주택_단지_조회.csv')
age_gender_info = pd.read_csv('age_gender_info.csv')

### Function
def code_name(df):
    #myhome 데이터의 필요한 부분만 긁어오기
    myhome = myhome_xyfill[['rnAdres','hsmpNm','hshldCo','suplyTyNm','공급면적(전용)','임대보증금','임대료', 'x','y','준공일자']]
    myhome.columns= ['주소', '단지명', '총세대수', '공급유형', '전용면적','임대보증금','임대료','경도','위도','준공일자']
    #공급유형 및 지역 변수 설정
    myhome.loc[myhome['공급유형']=='50년임대', '공급유형'] = '공공임대(50년)'
    myhome.loc[myhome['공급유형']=='10년임대', '공급유형'] = '공공임대(10년)'
    myhome.loc[myhome['공급유형']=='5년임대', '공급유형'] = '공공임대(5년)'
    myhome['지역'] = myhome['주소'].str.split(' ').str[0]
    # 총세대수 -> 전용면적별세대수합 생성
    myhome_grouped = myhome[['주소','단지명','총세대수']].drop_duplicates()
    myhome_grouped = myhome_grouped.groupby(['단지명','주소'], as_index=False).sum()[['단지명','주소','총세대수']]
    myhome = pd.merge(myhome, myhome_grouped, on=['단지명','주소'], how='left')
    myhome.drop(['총세대수_x'], axis=1, inplace=True)
    myhome.rename(columns={'총세대수_y':'전용면적별세대수합'}, inplace=True)
    # First Match: 지역, 전용면적별세대수합, 공급유형, 전용면적 겹치는 것 중 유니크 382단지
    df = df.drop(['전용면적별세대수합'],axis=1)
    df = df.rename(columns={'전용면적별세대수합_myhome':'전용면적별세대수합'})
    home_merged = pd.merge(df, myhome, on=['지역', '전용면적별세대수합', '공급유형', '전용면적'])
    home_unq = home_merged.groupby(['단지코드']).nunique()
    code_unq = home_unq.index[np.where(home_unq['단지명']==1)]
    name_unq = home_merged[home_merged['단지코드'].isin(code_unq)]['단지명'].unique()
    home_unq.reset_index(drop=True, inplace=True)

    #Second Match: 지역, 공급유형, 전용면적, 임대보증금, 임대료 겹치는 것 중 전용면적별세대수 차이가 적은것
    home_unk = pd.merge(df, myhome, on=['지역','공급유형','전용면적','임대보증금','임대료'])
    home_unk = home_unk[~home_unk['단지코드'].isin(code_unq)]
    home_unk = home_unk[~home_unk['단지명'].isin(name_unq)]
    home_unk['전용면적별세대수차']=abs(home_unk['전용면적별세대수합_x']-home_unk['전용면적별세대수합_y'])
    home_unk = home_unk.groupby(['단지코드','단지명','주소','위도','경도','준공일자','전용면적별세대수차']).nunique('전용면적')['전용면적'].reset_index()
    home_unk = home_unk.loc[home_unk.groupby('단지코드').전용면적별세대수차.idxmin()]
    
    first_match = home_merged[home_merged['단지코드'].isin(code_unq)][['단지명','단지코드','주소','위도','경도','준공일자']].drop_duplicates()
    second_match = home_unk[['단지명','단지코드','주소','위도','경도','준공일자']].drop_duplicates()
    match = pd.concat([first_match, second_match],axis=0).reset_index(drop=True)

    # Final Match: 남은 것중에 지역과 공급유형이 동일하고 전용면적별세대수합 차이가 가장 적은것
    for_match = df[~df['단지코드'].isin(match['단지코드'])][['단지코드','지역','공급유형','전용면적별세대수합']].drop_duplicates()
    list_match =[]
  
    for code in for_match['단지코드'].unique():
        for_dict = for_match[for_match['단지코드']==code]
        rg = list(for_dict['지역'].values)
        sp = list(for_dict['공급유형'].values)
        if (sp[0]=='공공임대(분납)'):
            sp = ['공공임대(10년)'] # Myhome에는 공공임대(분납)이 없으므로
        nh = list(for_dict['전용면적별세대수합'].values)[0]
        myhome_match = myhome[(myhome['지역'].isin(rg))&(myhome['공급유형'].isin(sp))]
        myhome_match = list(myhome_match.loc[abs(myhome_match['전용면적별세대수합']-nh).idxmin()][['단지명','주소','위도','경도','준공일자']])
        list_match.append([code]+myhome_match)

    final_match = pd.DataFrame(list_match, columns=['단지코드','단지명','주소','위도','경도','준공일자'])
    match = pd.concat([match, final_match],axis=0).reset_index(drop=True)
    df = pd.merge(df, match, on=['단지코드'])
    df = df.fillna('0')
    return(df)

def total_member(df):
    # age_gender에서 주소, 단지명, 총입주민수, 주소_mod만 추출하기
        # age_gender에서 주소, 단지명, 총입주민수, 주소_mod만 추출하기
    df_address = df[['주소','총세대수','단지명']].drop_duplicates().reset_index(drop=True)
    age_gender = code_age_gender[code_age_gender['주택유형']=='아파트']
    age_gender = age_gender.iloc[:,3:].drop(['주택유형','총세대수'],axis=1) 
    age_gender.rename(columns={'도로명주소':'주소', '주택명':'단지명'}, inplace=True)
    age_gender = age_gender[age_gender['주소'].notnull()]
    age_gender['단지명'] = age_gender['단지명'].str.replace(' ','')
    age_gender['총입주민수'] = age_gender.set_index(['단지명','주소']).apply(sum,axis=1).values
    age_gender = age_gender.groupby(['주소','단지명']).sum().reset_index()
    age_gender.loc[:,age_gender.columns.str.contains('대')] = age_gender.loc[:,
    age_gender.columns.str.contains('대')].mul(1/age_gender['총입주민수'], axis=0)
    age_gender['주소_mod'] = [''.join(elem[:-1]) for elem in age_gender['주소'].str.split()]

    # df에서 주소, 단지명, 주소_mod만 추출하기
    df_address['단지명_origin'] = df_address['단지명']
    df_address['단지명'] = df_address['단지명'].str.replace(' ','')
    df_address['주소_mod'] = [''.join(elem[:-1]) for elem in df_address['주소'].str.split()]
    age_dist = list(age_gender.columns[age_gender.columns.str.contains('대')])
    df_address[age_dist+['총입주민수']] = None
    for i in range(df_address.shape[0]):
        add = df_address['주소'][i]
        cod = df_address['단지명'][i]
        try: 
            df_address.loc[i,age_dist+['총입주민수']] = age_gender.loc[np.where((age_gender['주소']==add)&(age_gender['단지명']==cod))[0][0],age_dist+['총입주민수']]
        except:
            try: 
                add2 = df_address['주소_mod'][i]
                df_address.loc[i,age_dist+['총입주민수']] = age_gender.loc[np.where((age_gender['주소_mod']==add2)&(age_gender['단지명']==cod))[0][0],age_dist+['총입주민수']]
            except:
                try: 
                    df_address.loc[i,age_dist+['총입주민수']] = age_gender.loc[np.where(age_gender['주소']==add)[0][0],age_dist+['총입주민수']]
                except: pass
    # df_check = df_address[df_address.notnull().all(axis=1)]
    #((df_check['총입주민수'] / df_check['총세대수']).astype(float)).describe().round(1) # df_address[df_address.isnull().any(axis=1)] : 1.8
    df_address['지역'] = [elem[0] for elem in df_address['주소'].str.split()]
    for i in range(df_address.shape[0]):
        if ((df_address['총입주민수'][i]!=df_address['총입주민수'][i])|(df_address['총입주민수'][i]==None)):
            df_address['총입주민수'][i] = df_address['총세대수'][i] * 1.8
            reg = df_address['지역'][i]
            df_address.loc[i,age_dist] = age_gender_info.loc[age_gender_info['지역']==reg,age_dist].values[0]
    df_address = df_address.drop(['단지명','총세대수','주소_mod','주소','지역'],axis=1)
    df_address = df_address.rename(columns={'단지명_origin':'단지명'})
    df_address = df_address.drop_duplicates()
    df = pd.merge(df, df_address, on='단지명', how='left').reset_index(drop=True)
    df['총입주민수']=df['총입주민수'].astype(float)
    return(df)


In [3]:
# Train/test data 불러오기 및 오류 데이터 제거
train = pd.read_csv('train.csv')
train_error = ['C2085', 'C1397', 'C2431', 'C1649', 'C1036', 'C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
train = train[~train['단지코드'].isin(train_error)].reset_index(drop=True)
test = pd.read_csv('test.csv')
test_error = ['C2675', 'C2335', 'C1327']
test = test[~test['단지코드'].isin(test_error)].reset_index(drop=True)
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 15) | Test Set: (1008, 14)


In [4]:
# '임대보증금', '임대료'의 '-'를 0으로 변환
train['임대보증금'] = train['임대보증금'].replace('-', 0).astype('float64')
train['임대료'] = train['임대료'].replace('-', 0).astype('float64')
test['임대보증금'] = test['임대보증금'].replace('-', 0).astype('float64')
test['임대료'] = test['임대료'].replace('-', 0).astype('float64')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 15) | Test Set: (1008, 14)


In [5]:
# '임대보증금', '임대료' NA를 0으로 대체
train.loc[(train['임대건물구분'] == '상가') & (train['임대보증금'].isnull()), '임대보증금'] = 0
train.loc[(train['임대건물구분'] == '상가') & (train['임대료'].isnull()), '임대료'] = 0
test.loc[(test['임대건물구분'] == '상가') & (test['임대보증금'].isnull()), '임대보증금'] = 0
test.loc[(test['임대건물구분'] == '상가') & (test['임대료'].isnull()), '임대료'] = 0
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 15) | Test Set: (1008, 14)


In [6]:
# pd.set_option('display.max_row',30)
# test[test['단지코드']==test[test['임대보증금'].isnull()]['단지코드'].values[0]]

In [7]:
# Test Data의 '임대보증금', '임대료' NA 특정값으로 대체
test.loc[test['임대보증금'].isnull(), '임대보증금'] = [5787000.0, 5787000.0, 11574000.0]
test.loc[test['임대료'].isnull(), '임대료'] = [79980.0, 79980.0, 159960.0]
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 15) | Test Set: (1008, 14)


In [8]:
# sns.countplot('도보 10분거리 내 지하철역 수(환승노선 수 반영)', data=train)
# plt.show()

In [9]:
# NA를 0으로 변환
train.replace(np.nan, 0, inplace=True)
test['도보 10분거리 내 지하철역 수(환승노선 수 반영)'] = test['도보 10분거리 내 지하철역 수(환승노선 수 반영)'].fillna(0)
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 15) | Test Set: (1008, 14)


In [10]:
# Data의 '임대건물구분 == 상가' 데이터 제거
train_age = train[train['임대건물구분'] != '상가'].reset_index(drop=True)
train_age.drop(['임대건물구분'], axis=1, inplace=True)

test_age = test[test['임대건물구분'] != '상가'].reset_index(drop=True)
test_age.drop(['임대건물구분'], axis=1, inplace=True)

In [11]:
# '전용면적별세대수합' 변수 생성
noh_by_area = train.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합'})
train = pd.merge(left=train, right=noh_by_area, how='left', on='단지코드')
noh_by_area = test.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합'})
test = pd.merge(left=test, right=noh_by_area, how='left', on='단지코드')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

# '전용면적별세대수합' 변수 생성
noh_by_area = train_age.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합'})
train_age = pd.merge(left=train_age, right=noh_by_area, how='left', on='단지코드')
noh_by_area = test_age.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합'})
test_age = pd.merge(left=test_age, right=noh_by_area, how='left', on='단지코드')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

## '전용면적별세대수합_myhome' 변수 생성
train_myhome = train[train['공급유형']!='공공분양']
noh_by_area = train_myhome.groupby('단지코드',as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합_myhome'})
train = pd.merge(left=train, right=noh_by_area, how='left', on='단지코드')

test_myhome = test[test['공급유형']!='공공분양']
noh_by_area = test_myhome.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합_myhome'})
test = pd.merge(left=test, right=noh_by_area, how='left', on='단지코드')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

## '전용면적별세대수합_myhome' 변수 생성
train_age_myhome = train_age[train_age['공급유형']!='공공분양']
noh_by_area = train_age_myhome.groupby('단지코드',as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합_myhome'})
train_age = pd.merge(left=train_age, right=noh_by_area, how='left', on='단지코드')

test_age_myhome = test_age[test_age['공급유형']!='공공분양']
noh_by_area = test_age_myhome.groupby('단지코드', as_index=False).agg({'전용면적별세대수':'sum'}).rename(columns={'전용면적별세대수':'전용면적별세대수합_myhome'})
test_age = pd.merge(left=test_age, right=noh_by_area, how='left', on='단지코드')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 16) | Test Set: (1008, 15)
Train Set: (2869, 16) | Test Set: (1008, 15)
Train Set: (2869, 17) | Test Set: (1008, 16)
Train Set: (2869, 17) | Test Set: (1008, 16)


In [12]:
member1, member2 = code_name(train_age), code_name(test_age)
member1, member2 = total_member(member1), total_member(member2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  myhome['지역'] = myhome['주소'].str.split(' ').str[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_address['총입주민수'][i] = df_address['총세대수'][i] * 1.8


In [13]:
age_col = ['단지코드','10대미만(여자)', '10대미만(남자)', '10대(여자)', '10대(남자)',
       '20대(여자)', '20대(남자)', '30대(여자)', '30대(남자)', '40대(여자)', '40대(남자)',
       '50대(여자)', '50대(남자)', '60대(여자)', '60대(남자)', '70대(여자)', '70대(남자)',
       '80대(여자)', '80대(남자)', '90대(여자)', '90대(남자)', '100대(여자)', '100대(남자)',
       '총입주민수']

In [14]:
# '총입주민수' 변수 생성
member = pd.concat([member1[age_col], member2[age_col]])
member = member.drop_duplicates()
member.iloc[:,1:-1]=member.iloc[:,1:-1].mul(member['총입주민수'],axis=0)
member['차량보유입주민수'] = np.sum(member.iloc[:, [8, 9, 10, 11, 12, 14]], axis=1)
member = member[['단지코드', '차량보유입주민수', '총입주민수']]
train = pd.merge(train, member, how='left')
test = pd.merge(test, member, how='left')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 19) | Test Set: (1008, 18)


In [15]:
# '임대보증금', '임대료' 변수 재생성 (전용면적별세대수 가중 평균)
train['임대보증금'] = round(train['임대보증금'] * train['전용면적별세대수'] / train['전용면적별세대수합'])
rental_fee = train.groupby('단지코드', as_index=False).agg({'임대보증금':'sum'})
train = pd.merge(train, rental_fee, how='left', on='단지코드').drop(['임대보증금_x'], axis=1).rename(columns={'임대보증금_y':'임대보증금'})
train['임대료'] = round(train['임대료'] * train['전용면적별세대수'] / train['전용면적별세대수합'])
rental_fee = train.groupby('단지코드', as_index=False).agg({'임대료':'sum'})
train = pd.merge(train, rental_fee, how='left', on='단지코드').drop(['임대료_x'], axis=1).rename(columns={'임대료_y':'임대료'})
test['임대보증금'] = round(test['임대보증금'] * test['전용면적별세대수'] / test['전용면적별세대수합'])
rental_fee = test.groupby('단지코드', as_index=False).agg({'임대보증금':'sum'})
test = pd.merge(test, rental_fee, how='left', on='단지코드').drop(['임대보증금_x'], axis=1).rename(columns={'임대보증금_y':'임대보증금'})
test['임대료'] = round(test['임대료'] * test['전용면적별세대수'] / test['전용면적별세대수합'])
rental_fee = test.groupby('단지코드', as_index=False).agg({'임대료':'sum'})
test = pd.merge(test, rental_fee, how='left', on='단지코드').drop(['임대료_x'], axis=1).rename(columns={'임대료_y':'임대료'})
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 19) | Test Set: (1008, 18)


In [16]:
# '소형세대' 변수 생성
train['세대구분'] = ['소형세대' if i < 40 else '기타세대' for i in train['전용면적']]
household = train.groupby(['단지코드', '세대구분'],
            as_index=False).agg({'전용면적별세대수':'sum'}).pivot('단지코드','세대구분', '전용면적별세대수').fillna(0).reset_index().drop(['기타세대'], axis=1)
train = pd.merge(train, household, how='left').drop(['세대구분'], axis=1)
train['소형세대'] = train['소형세대'] / train['전용면적별세대수합']

test['세대구분'] = ['소형세대' if i < 40 else '기타세대' for i in test['전용면적']]
household = test.groupby(['단지코드', '세대구분'],
            as_index=False).agg({'전용면적별세대수':'sum'}).pivot('단지코드', '세대구분', '전용면적별세대수').fillna(0).reset_index().drop(['기타세대'], axis=1)
test = pd.merge(test, household, how='left').drop(['세대구분'], axis=1)
test['소형세대'] = test['소형세대'] / test['전용면적별세대수합']
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 20) | Test Set: (1008, 19)


In [17]:
# '전용면적' 변수 재생성 (전용면적별세대수 가중 평균)
train['전용면적'] = round(train['전용면적'] * train['전용면적별세대수'] / train['전용면적별세대수합'])
rental_fee = train.groupby('단지코드', as_index=False).agg({'전용면적':'sum'})
train = pd.merge(train, rental_fee, how='left', on='단지코드').drop(['전용면적_x'], axis=1).rename(columns={'전용면적_y':'전용면적'})
test['전용면적'] = round(test['전용면적'] * test['전용면적별세대수'] / test['전용면적별세대수합'])
rental_fee = test.groupby('단지코드', as_index=False).agg({'전용면적':'sum'})
test = pd.merge(test, rental_fee, how='left', on='단지코드').drop(['전용면적_x'], axis=1).rename(columns={'전용면적_y':'전용면적'})
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 20) | Test Set: (1008, 19)


In [18]:
# '차량보유인구비율' 변수 생성
cars = pd.DataFrame(age_gender_info.iloc[:, [7, 8, 9, 10, 12, 14]].sum(axis=1), columns=['차량보유인구비율'])
cars['지역'] = age_gender_info['지역']
train = pd.merge(train, cars, how='left', on='지역')
test = pd.merge(test, cars, how='left', on='지역')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 21) | Test Set: (1008, 20)


In [19]:
# '공공임대(10년)', '공공임대(50년)', '공공임대(분납)', '국민임대', '영구임대', '행복주택', '임대상가' 변수 생성
supply_type = train.groupby(['단지코드', '공급유형'], as_index=False).size()
type1 = [1 if '행복주택' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in train['단지코드'].unique()]
type2 = [1 if '영구임대' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in train['단지코드'].unique()]
type3 = [1 if '임대상가' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in train['단지코드'].unique()]
type4 = [1 if '공공임대(10년)' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in train['단지코드'].unique()]
type5 = [1 if '공공임대(50년)' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in train['단지코드'].unique()]
type6 = [1 if '공공임대(분납)' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in train['단지코드'].unique()]
type7 = [1 if '국민임대' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in train['단지코드'].unique()]
supply_type = pd.DataFrame(np.column_stack([train['단지코드'].unique(), type1, type2, type3, type4, type5, type6, type7]),
            columns=['단지코드','행복주택', '영구임대', '임대상가', '공공임대(10년)', '공공임대(50년)', '공공임대(분납)', '국민임대'])
train = pd.merge(train, supply_type, how='left')
supply_type = test.groupby(['단지코드', '공급유형'], as_index=False).size()
type1 = [1 if '행복주택' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in test['단지코드'].unique()]
type2 = [1 if '영구임대' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in test['단지코드'].unique()]
type3 = [1 if '임대상가' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in test['단지코드'].unique()]
type4 = [1 if '공공임대(10년)' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in test['단지코드'].unique()]
type5 = [1 if '공공임대(50년)' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in test['단지코드'].unique()]
type6 = [1 if '공공임대(분납)' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in test['단지코드'].unique()]
type7 = [1 if '국민임대' in np.array(supply_type.loc[supply_type['단지코드'] == i, '공급유형']) else 0 for i in test['단지코드'].unique()]
supply_type = pd.DataFrame(np.column_stack([test['단지코드'].unique(), type1, type2, type3, type4, type5, type6, type7]),
            columns=['단지코드', '행복주택', '영구임대', '임대상가', '공공임대(10년)', '공공임대(50년)', '공공임대(분납)', '국민임대'])
test = pd.merge(test, supply_type, how='left')
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 28) | Test Set: (1008, 27)


In [20]:
# 불필요한 변수 제거
train.drop(['임대건물구분', '공급유형', '전용면적별세대수', '자격유형', '전용면적별세대수합'], axis=1, inplace=True)
test.drop(['임대건물구분', '공급유형', '전용면적별세대수', '자격유형', '전용면적별세대수합'], axis=1, inplace=True)
print(f'Train Set: {train.shape} | Test Set: {test.shape}')

Train Set: (2869, 23) | Test Set: (1008, 22)


---

In [21]:
train_ = train.drop_duplicates().reset_index(drop=True)
test_ = test.drop_duplicates().reset_index(drop=True)
print(f'Train Set: {train_.shape} | Test Set: {test_.shape}')

Train Set: (411, 23) | Test Set: (147, 22)


In [22]:
X_train = train_[['단지내주차면수', '전용면적', '차량보유인구비율', '총세대수', '공가수', '소형세대', '차량보유입주민수',
                  '행복주택', '영구임대', '임대상가', '공공임대(10년)', '국민임대']]
y_train = train_['등록차량수']

In [23]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [24]:
# 피쳐에 2차항 및 상호작용항 추가
poly = PolynomialFeatures(2, include_bias=False)
X_train = poly.fit_transform(X_train)
print(X_train.shape)

(411, 90)


In [25]:
# StandardScaler 적용
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

---

# Modeling

In [26]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso

**Linear Regression**

In [27]:
lr = LinearRegression()
kfold = KFold(n_splits=10, shuffle=True, random_state=11)
score = cross_val_score(lr, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
print(f'평균: {-score.mean()} | 표준편차: {score.std()}')

평균: 92017312376.44437 | 표준편차: 236107957851.66116


**Ridge Regression**

In [28]:
best_alpha = []
for i in range(50):
    rg = Ridge(alpha=i)
    kfold = KFold(n_splits=10, shuffle=True, random_state=11)
    score = cross_val_score(rg, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
    best_alpha.append(-score.mean())
print(f'Best alpha is {np.argmin(best_alpha)}.')

Best alpha is 17.


In [29]:
rg = Ridge(alpha=np.argmin(best_alpha))
kfold = KFold(n_splits=10, shuffle=True, random_state=11)
score = cross_val_score(rg, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
print(f'평균: {-score.mean()} | 표준편차: {score.std()}')

평균: 106.63969241987543 | 표준편차: 10.512571328786798


**Lasso Regression**

In [30]:
best_alpha = []
for i in range(1, 10):
    ls = Lasso(alpha=i)
    kfold = KFold(n_splits=10, shuffle=True, random_state=11)
    score = cross_val_score(ls, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
    best_alpha.append(-score.mean())
print(f'Best alpha is {np.argmin(best_alpha)}.')

Best alpha is 2.


In [31]:
ls = Lasso(np.argmin(best_alpha))
kfold = KFold(n_splits=10, shuffle=True, random_state=11)
score = cross_val_score(ls, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')

In [32]:
print(f'평균: {-score.mean()} | 표준편차: {score.std()}')

평균: 105.54305623701052 | 표준편차: 10.896002954365605


In [33]:
-score

array([ 98.09705968, 108.77315993, 108.41004912, 114.75440486,
       113.72097469, 109.76228991,  84.79259996, 122.31022195,
       104.455613  ,  90.35418926])

In [34]:
ls.fit(X_train, y_train)

Lasso(alpha=2)

---

In [35]:
X_test = test_[['단지내주차면수', '전용면적', '차량보유인구비율', '총세대수', '공가수', '소형세대', '차량보유입주민수',
                  '행복주택', '영구임대', '임대상가', '공공임대(10년)', '국민임대']]

In [36]:
poly = PolynomialFeatures(2, include_bias=False)
X_test = poly.fit_transform(X_test)

In [37]:
X_test = scaler.transform(X_test)

In [38]:
test_['등록차량수'] = ls.predict(X_test)

In [39]:
submission = test_[['단지코드', '등록차량수']].rename(columns={'단지코드':'code', '등록차량수':'num'})
error = pd.DataFrame(np.array([['C2675', 0], ['C2335', 0], ['C1327', 0]]), columns=['code', 'num'])
submission = pd.concat([submission, error]).reset_index(drop=True)
submission.to_csv('submission.csv', index=False)

In [40]:
test_['등록차량수'].describe()

count     147.000000
mean      529.194706
std       309.299604
min        66.947199
25%       291.569998
50%       462.911757
75%       687.724952
max      1769.010567
Name: 등록차량수, dtype: float64

In [41]:
test_[['단지코드', '총세대수', '총입주민수', '공가수', '전용면적', '소형세대', '행복주택', '영구임대', '단지내주차면수', '등록차량수']]

Unnamed: 0,단지코드,총세대수,총입주민수,공가수,전용면적,소형세대,행복주택,영구임대,단지내주차면수,등록차량수
0,C1072,754,2092.0,14.0,48.0,0.153846,0,0,683.0,750.756051
1,C1128,1354,3654.0,9.0,47.0,0.310192,0,0,1216.0,1227.407026
2,C1456,619,1385.0,18.0,45.0,0.399031,0,0,547.0,450.151102
3,C1840,593,1318.0,7.0,43.0,0.564924,0,0,543.0,484.580579
4,C1332,1297,3779.0,11.0,46.0,0.327679,0,0,1112.0,1202.382281
5,C1563,1974,5642.0,15.0,48.0,0.364235,0,0,1696.0,1769.010567
6,C1794,1349,2868.0,25.0,41.0,0.667902,0,0,1098.0,864.589862
7,C1640,533,1690.0,17.0,51.0,0.163227,0,0,470.0,474.566283
8,C1377,470,1031.0,18.0,40.0,0.629787,0,0,384.0,368.481917
9,C2072,353,870.0,6.0,41.0,0.583569,0,0,280.0,310.876784


**Test MAE: ???**