In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('data/train.csv', index_col='ID')
test = pd.read_csv('data/test.csv', index_col='ID')

In [3]:
train['ADDRESS'].head()

ID
0               9 Crumlin Way
1    37 Sissinghurst Crescent
2               24 Tees Court
3          31 Brentham Street
4        46/1 Corkhill Street
Name: ADDRESS, dtype: object

In [4]:
def change_address(ad):
    li_ad = ad.split()
    return (li_ad[-1]).upper()

In [5]:
train['ADDRESS'] = train['ADDRESS'].apply(change_address)
test['ADDRESS'] = test['ADDRESS'].apply(change_address)

In [6]:
train.corr()['PRICE'].abs().sort_values(ascending=False)

PRICE               1.000000
FLOOR_AREA          0.539524
NEAREST_SCH_RANK    0.453937
BATHROOMS           0.386485
CBD_DIST            0.357368
BEDROOMS            0.256909
LONGITUDE           0.183514
POSTCODE            0.162925
BUILD_YEAR          0.148771
GARAGE              0.124276
NEAREST_STN_DIST    0.099370
LAND_AREA           0.046778
LATITUDE            0.046722
NEAREST_SCH_DIST    0.024030
Name: PRICE, dtype: float64

In [7]:
train.head(3)

Unnamed: 0_level_0,ADDRESS,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,WAY,Ridgewood,520000,5,2,4.0,674,266,2007.0,35500,Butler Station,3000,09-2015\r,6030,-31.656206,115.720566,IRENE MCCORMACK CATHOLIC COLLEGE,1.07923,120.0
1,CRESCENT,Landsdale,575000,5,4,2.0,540,301,2014.0,16900,Whitfords Station,8100,07-2018\r,6065,-31.800802,115.867675,LANDSDALE CHRISTIAN SCHOOL,0.667585,
2,COURT,Mindarie,760000,4,2,2.0,781,220,1992.0,33500,Clarkson Station,3500,02-2020\r,6030,-31.68204,115.7026,MINDARIE SENIOR COLLEGE,0.650761,111.0


In [8]:
# SUBURB
# NEAREST_STN

In [9]:
def year_sold(date):
    return int(date[3:-1])

In [10]:
train['YEAR_SOLD'] = train['DATE_SOLD'].apply(year_sold)

In [11]:
test['YEAR_SOLD'] = test['DATE_SOLD'].apply(year_sold)

In [12]:
train.drop('DATE_SOLD', axis=1, inplace=True)
test.drop('DATE_SOLD', axis=1, inplace=True)

In [13]:
train['GARAGE'].fillna(2.0, inplace=True)
test['GARAGE'].fillna(2.0, inplace=True)

In [14]:
train.corr()['BUILD_YEAR'].abs().sort_values(ascending=False)

BUILD_YEAR          1.000000
BATHROOMS           0.345848
CBD_DIST            0.258332
BEDROOMS            0.234275
FLOOR_AREA          0.230423
NEAREST_SCH_RANK    0.151206
PRICE               0.148771
NEAREST_SCH_DIST    0.096881
NEAREST_STN_DIST    0.090911
YEAR_SOLD           0.082095
LONGITUDE           0.063877
LATITUDE            0.045702
GARAGE              0.044824
POSTCODE            0.038833
LAND_AREA           0.009614
Name: BUILD_YEAR, dtype: float64

In [15]:
train.columns

Index(['ADDRESS', 'SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH',
       'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK', 'YEAR_SOLD'],
      dtype='object')

In [16]:
bui = train[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'BUILD_YEAR']]
bui.head(3)

Unnamed: 0_level_0,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,CBD_DIST,NEAREST_STN_DIST,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,YEAR_SOLD,BUILD_YEAR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,5,2,4.0,674,266,35500,3000,6030,-31.656206,115.720566,1.07923,2015,2007.0
1,5,4,2.0,540,301,16900,8100,6065,-31.800802,115.867675,0.667585,2018,2014.0
2,4,2,2.0,781,220,33500,3500,6030,-31.68204,115.7026,0.650761,2020,1992.0


In [17]:
bui_train = bui[bui['BUILD_YEAR'].notnull()]
bui_test = bui[bui['BUILD_YEAR'].isnull()][['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'BUILD_YEAR']]
bui_X = bui_train[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD']]
bui_y = bui_train['BUILD_YEAR']

In [18]:
from sklearn.ensemble import HistGradientBoostingRegressor

hgbr = HistGradientBoostingRegressor()

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
hgbr_result = cross_val_score(hgbr, bui_X, bui_y, cv=5)

print('hgbr_result:', hgbr_result)
print('hgbr_mean:', hgbr_result.mean())

hgbr_result: [0.56746857 0.5800927  0.5654447  0.5572853  0.5815074 ]
hgbr_mean: 0.5703597342510573


In [21]:
hgbr.fit(bui_X, bui_y)

HistGradientBoostingRegressor()

In [22]:
bui_pre = hgbr.predict(bui_test[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD']])

In [23]:
bui_test['BUILD_YEAR'] = bui_pre

In [24]:
train['BUILD_YEAR'].fillna(bui_test['BUILD_YEAR'], inplace=True)
train['BUILD_YEAR'].isnull().sum()

0

In [25]:
bui = test[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'BUILD_YEAR']]
bui.head(3)

Unnamed: 0_level_0,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,CBD_DIST,NEAREST_STN_DIST,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,YEAR_SOLD,BUILD_YEAR
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
18510,3,2,3.0,2199,159,29600,15600,6073,-31.899705,116.166117,1.341198,2019,1950.0
18511,4,3,2.0,509,341,8200,2000,6152,-32.01862,115.8589,0.882405,2017,2001.0
18512,4,3,2.0,725,251,10500,2700,6154,-32.04209,115.82629,1.639635,2018,1982.0


In [26]:
bui_test = bui[bui['BUILD_YEAR'].isnull()][['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'BUILD_YEAR']]

In [27]:
bui_pre = hgbr.predict(bui_test[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD']])

In [28]:
bui_test['BUILD_YEAR'] = bui_pre

In [29]:
test['BUILD_YEAR'].fillna(bui_test['BUILD_YEAR'], inplace=True)
test['BUILD_YEAR'].isnull().sum()

0

In [30]:
train['YEAR_DIFF'] = train['YEAR_SOLD'] - train['BUILD_YEAR']
test['YEAR_DIFF'] = test['YEAR_SOLD'] - test['BUILD_YEAR']

In [31]:
train.columns

Index(['ADDRESS', 'SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH',
       'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK', 'YEAR_SOLD', 'YEAR_DIFF'],
      dtype='object')

In [32]:
sch = train[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'YEAR_DIFF', 'NEAREST_SCH_RANK']]
sch.head()

Unnamed: 0_level_0,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN_DIST,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,YEAR_SOLD,YEAR_DIFF,NEAREST_SCH_RANK
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,5,2,4.0,674,266,2007.0,35500,3000,6030,-31.656206,115.720566,1.07923,2015,8.0,120.0
1,5,4,2.0,540,301,2014.0,16900,8100,6065,-31.800802,115.867675,0.667585,2018,4.0,
2,4,2,2.0,781,220,1992.0,33500,3500,6030,-31.68204,115.7026,0.650761,2020,28.0,111.0
3,2,1,2.0,425,185,1938.0,3500,1500,6016,-31.926046,115.838181,0.484465,2020,82.0,106.0
4,3,2,2.0,546,186,2004.0,12900,800,6159,-32.032079,115.759626,1.86905,2017,13.0,25.0


In [33]:
sch_train = sch[sch['NEAREST_SCH_RANK'].notnull()]
sch_test = sch[sch['NEAREST_SCH_RANK'].isnull()][['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'YEAR_DIFF', 'NEAREST_SCH_RANK']]
sch_X = sch_train[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'YEAR_DIFF']]
sch_y = sch_train['NEAREST_SCH_RANK']

In [34]:
from sklearn.ensemble import HistGradientBoostingRegressor
hgbr = HistGradientBoostingRegressor()

In [35]:
from sklearn.model_selection import cross_val_score

In [36]:
sch_result = cross_val_score(hgbr, sch_X, sch_y, cv=5)

In [37]:
print(sch_result)
print(sch_result.mean())

[0.92198959 0.92191194 0.92391405 0.92562892 0.91921296]
0.9225314924624837


In [38]:
hgbr.fit(sch_X, sch_y)

HistGradientBoostingRegressor()

In [39]:
sch_pre = hgbr.predict(sch_test[['BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST',
       'NEAREST_STN_DIST', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH_DIST', 'YEAR_SOLD', 'YEAR_DIFF']])

In [40]:
sch_test['NEAREST_SCH_RANK'] = sch_pre

In [41]:
train['NEAREST_SCH_RANK'].fillna(sch_test['NEAREST_SCH_RANK'], inplace=True)
train['NEAREST_SCH_RANK'].isnull().sum()

0

In [42]:
n = set(test['NEAREST_SCH']) - set(train['NEAREST_SCH'])

In [43]:
k = train['NEAREST_SCH_RANK'].mean()

In [44]:
a = train[['NEAREST_SCH', 'NEAREST_SCH_RANK']]['NEAREST_SCH']
b = train[['NEAREST_SCH', 'NEAREST_SCH_RANK']]['NEAREST_SCH_RANK']

In [45]:
a[3]

'ARANMORE CATHOLIC COLLEGE'

In [46]:
ns_dict = {}

for i in range(len(a)):
    ns_dict[a[i]] = b[i]

In [47]:
ns_dict[list(n)[0]] = k

In [48]:
test['NEAREST_SCH_RANK'] = test['NEAREST_SCH'].map(ns_dict)

In [49]:
train.drop('NEAREST_SCH', axis=1, inplace=True)
test.drop('NEAREST_SCH', axis=1, inplace=True)

In [50]:
train.info()

# 'ADDRESS', 'SUBURB', 'NEAREST_STN' object

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18510 entries, 0 to 18509
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ADDRESS           18510 non-null  object 
 1   SUBURB            18510 non-null  object 
 2   PRICE             18510 non-null  int64  
 3   BEDROOMS          18510 non-null  int64  
 4   BATHROOMS         18510 non-null  int64  
 5   GARAGE            18510 non-null  float64
 6   LAND_AREA         18510 non-null  int64  
 7   FLOOR_AREA        18510 non-null  int64  
 8   BUILD_YEAR        18510 non-null  float64
 9   CBD_DIST          18510 non-null  int64  
 10  NEAREST_STN       18510 non-null  object 
 11  NEAREST_STN_DIST  18510 non-null  int64  
 12  POSTCODE          18510 non-null  int64  
 13  LATITUDE          18510 non-null  float64
 14  LONGITUDE         18510 non-null  float64
 15  NEAREST_SCH_DIST  18510 non-null  float64
 16  NEAREST_SCH_RANK  18510 non-null  float6

In [51]:
cat_features = ['ADDRESS', 'SUBURB', 'NEAREST_STN']
cat_features

['ADDRESS', 'SUBURB', 'NEAREST_STN']

In [52]:
for cat_name in cat_features:
    dummy = pd.get_dummies(train[cat_name], prefix=cat_name)
    train = pd.concat([train, dummy], axis=1)
    train.drop(cat_name, axis=1, inplace=True)

In [53]:
for cat_name in cat_features:
    dummy = pd.get_dummies(test[cat_name], prefix=cat_name)
    test = pd.concat([test, dummy], axis=1)
    test.drop(cat_name, axis=1, inplace=True)

In [54]:
for_test = set(train.columns) - set(test.columns)
for_train = set(test.columns) - set(train.columns)

In [55]:
for i in for_test:
    test[i] = 0

In [56]:
for i in for_train:
    train[i] = 0

In [57]:
test.drop('PRICE', axis=1, inplace=True)

In [59]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [60]:
hgbr = HistGradientBoostingRegressor()

In [61]:
from sklearn.model_selection import cross_val_score

y_train = train.pop('PRICE')
X_train = train

In [63]:
hgbr_result = cross_val_score(hgbr, X_train, y_train, cv=5)


print('hgbr_result:', hgbr_result)
print('hgbr_mean:', hgbr_result.mean())

hgbr_result: [0.82354145 0.83909635 0.85338979 0.8470183  0.82910332]
hgbr_mean: 0.8384298428633377


In [64]:
from sklearn.preprocessing import StandardScaler

In [65]:
# 스케일러 생성
scaler = StandardScaler()

# 현재 데이터의 분포 범위 확인
scaler.fit(X_train)

# 데이터 변환
tf_X_train = scaler.transform(X_train)
tf_test = scaler.transform(test)

Feature names must be in the same order as they were in fit.



In [66]:
hgbr_result = cross_val_score(hgbr, tf_X_train, y_train, cv=5)

print('hgbr_result:', hgbr_result)
print('hgbr_mean:', hgbr_result.mean())

hgbr_result: [0.82741144 0.83477257 0.84956793 0.84332299 0.83037694]
hgbr_mean: 0.8370903747762451


In [67]:
# 예측
hgbr.fit(X_train, y_train)
hgbr_pre = hgbr.predict(test)

# 답안지 파일 불러오기
result = pd.read_csv('data/sample_submission.csv')

# 예측값 초기화
result['PRICE'] = hgbr_pre

# df -> csv 저장
result.to_csv('try05/try05_hgbr.csv', index=False)

Feature names must be in the same order as they were in fit.



In [68]:
hgbr2 = HistGradientBoostingRegressor()


# 예측
hgbr2.fit(tf_X_train, y_train)
hgbr2_pre = hgbr2.predict(tf_test)

# 답안지 파일 불러오기
result = pd.read_csv('data/sample_submission.csv')

# 예측값 초기화
result['PRICE'] = hgbr2_pre

# df -> csv 저장
result.to_csv('try05/try05_hgbr2.csv', index=False)