In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score

In [3]:
# 엑셀 파일 불러오기
df = pd.read_csv('melb_data.csv')

In [4]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


## 변수별 선택

### 기초 변수 탐색

- Suburb: 부동산이 위치한 지역의 이름을 나타냅니다. 예를 들어 "Abbotsford"나 "Wheelers Hill"과 같은 이름이 될 수 있습니다. 멜버른 내의 지역구(교외 지역 및 도시내 작은 지역)
- Address: 부동산의 정확한 주소를 나타냅니다. 이는 거주지의 번지 및 건물 번호를 포함할 수 있습니다.
- Rooms: 부동산에 있는 총 방의 수를 나타냅니다. 주로 침실, 거실, 욕실 등을 포함합니다.
- Type: 부동산의 유형을 나타냅니다. 주로 주택, 아파트, 단독주택 등이 될 수 있습니다.
    *  h; 독립된 주택. 자체적인 땅과 주변 공간을 가진 구조물_house, cottage, villa, semi,terrace
    * u; 아파트 유닛을 포함한 복합 주거 구조. 다세대 주택, 타운하우스, 플랫 등_unit, duplex
    * t; 일반적으로 서로 붙어 있는 여러층 구조의 주택. 타운 하우스는 일반적으로 한 블록내에 여러개의 유사한 디자인의 집들이 연결되어 있는 형태_townhouse
    * br; bedrooms
    * dev site; development site
    * o res; other residential
- Price: 부동산의 가격을 나타냅니다. 보통 달러(USD) 또는 해당 국가의 통화로 표시됩니다.
- Method: 부동산 거래 방법을 나타냅니다. 예를 들어 "S"는 판매, "SP"는 판매 전, "PI"는 사적 판매 등이 될 수 있습니다.
    * S; 일반적인 매매 방식을 통해 판매된 경우. 공개적으로 시장에 나와 구매자에 의해 직접 구입된 경우_property sold
    * SP; 경매 전에 판매된 경우. 예를 들어, 경매 날짜 전에 판매자와 구매자 간의 합의에 의해 거래가 성사된 경우_property sold prior
    * PI; 경매에서 최소가에 도달하지 못해 판매되지 않은 경우. 부동산은 경매 후에도 판매자의 손에 남아있게 됨_property passed in
    * VB; 판매자 또는 판매자의 대리인이 경매에서 입찰한 경우를 의미. 경매 과정에서 가격을 높이기 위해 사용_vendor bid
    * SA; 경매가 끝난 후 판매된 경우. 경매 도중 적절한 가격에 도달하지 못한 후, 경매 이후에 판매가 성사된 경우_sold after auction
    * PN; sold prior not disclosed
    * SN; sold not disclosed
    * NB; no bid
    * W; withdrawn prior to auction
    * SS; sold after auction price not disclosed.
    * N/A; price or highest bid not available.
- SellerG: 부동산을 판매하는 부동산 에이전트나 회사의 이름을 나타냅니다./부동산 거래의 판매자 유형으로 각 부동산 거래의 판매자를 식별하는 데 사용. 개인, 회사, 대리인 등이 있다.
- Date: 부동산 거래가 이루어진 날짜를 나타냅니다.
- Distance: 부동산이 도심이나 중심지로부터의 거리를 나타냅니다. 주로 킬로미터(km) 단위로 표시됩니다.
- Postcode: 부동산이 위치한 지역의 우편번호를 나타냅니다.
- Bedroom2: 부동산에 있는 침실의 수를 나타냅니다.
- Bathroom: 부동산에 있는 욕실의 수를 나타냅니다.
- Car: 부동산에 대한 주차 공간의 수를 나타냅니다.
- Landsize: 부동산의 토지 면적을 나타냅니다. 주로 제곱미터(m^2) 단위로 표시됩니다.
- BuildingArea: 부동산의 건물 면적을 나타냅니다. 주로 제곱미터(m^2) 단위로 표시됩니다.
- YearBuilt: 부동산의 건축 연도를 나타냅니다.
- CouncilArea: 부동산이 속한 지방 자치 단체를 나타냅니다./부동산이 위치한 행정 구역(지방정부 또는 시의회)를 의미.지역 서비스(공원, 도서관, 스포츠시설, 쓰레기 수거 등)과 계획 및 개발 정책, 세금 및 요금, 교육 기관 등과 직접적으로 연결_Governing council for the area
- Lattitude: 부동산의 위도 좌표를 나타냅니다.
- Longtitude: 부동산의 경도 좌표를 나타냅니다.
- Regionname: 부동산이 위치한 지역 또는 지구의 이름을 나타냅니다. General Region (West, North West, North, North east …etc)
- Propertycount: 부동산이 속한 지역의 주택 수 또는 부동산 수를 나타냅니다.

`Car`, `BuildingArea`, `YearBuilt`, `CouncilArea` columns에는 결측값이 있다. 각 column에서 어떻게 결측치를 처리했는지는 다음의 칼럼별 전처리를 참고하라.

우선, `Rooms`, `Price`, `Bedroom2`, `Bathroom`, `Propertycount`는 그대로 사용한다.

이때 `Price`는 예측하고자 하는 변수이므로 최종 학습 데이터에서는 빠져야 할 것이다.

### 위치 정보

위치 정보 column : `Regionname`, `Suburb`, `Address`, `Postcode`, `CouncilArea`, `Lattitude`, `Longttitude`, `Distance`

요약 : Regionname -> CouncilArea -> Suburb의 순으로 작아짐.
* Regionname : dataset에서 가장 넓은 범위. 
* Suburb : 한국에서 '동' 정도의 개념.
* CouncilArea : 한국에서 '기초자치단체'의 개념. 해당 범위를 기준으로 지방 의회 및 지방 자치 단체장이 선출됨.

한편, 
* Postcode와 Address는 각각 우편번호 - 주소로, 구체적인 해당 집의 위치를 나타내는 개념. 
* Lattitude와 Longtitude는 각각 위도 - 경도. 이 순서쌍이 해당 집의 정확한 위치를 나타냄.
* Distance는 해당 집과 주요 업무 지구의 거리를 나타내는 column.


이때, `CouncilArea`는 NA가 일부 존재하는 column이다. 혹시 몰라 멜버른 직할령 같은 것이 있나 찾아봤는데, 당장 Willianstown만 해도 명백히 상위 지방 council이 존재한다.

In [5]:
df.loc[df['CouncilArea'].isnull()]['Suburb']

7584     Brighton East
10797        Reservoir
12213       Aberfeldie
12214           Albion
12215       Alphington
             ...      
13575    Wheelers Hill
13576     Williamstown
13577     Williamstown
13578     Williamstown
13579       Yarraville
Name: Suburb, Length: 1369, dtype: object

또한, `Postcode`는 숫자의 형태를 띠고 있지만 숫자가 아니다. 이는 범주형 변수로 다루어져야 하는데, 다음과 같이 dataset의 수에 비해 해당 코드를 one-hot encoding하기에 충분하지 않으므로 해당 변수를 활용하기 어렵다. 

In [6]:
df['Postcode'].nunique()

198

특히, `Postcode`는 `Lattitude`, `Longtitude`와 상관관계를 가지는 변수이다. 물론, 위도와 경도 모두 계수를 활용하는 등 행위를 하기에 어려운 것은 마찬가지이다. 이에, 다음과 같은 처리법을 생각할 수 있다.

1. 숫자처럼 생각하고 다 때려넣은 뒤 예측 성능을 높이는 데 집중
2. 지역변수 중 특정한 것만 선택하되, 위의 세 가지 변수는 배제하기
3. one-hot encoding을 수행하기

우선, 해당 데이터 전처리에서는 앞서 제시한 여러 지리 정보 변수 중 지방자치단체를 기준으로 one-hot encoding을 수행하고, 나머지 변수는 모두 제거할 예정이다.

#### councilarea 채우기 코드 활용

In [7]:
# suburb의 councilarea의 unique 값이 1인 경우만 채우기

temp_df = df[df['CouncilArea'].isnull()][['Suburb','CouncilArea']]

for i in range(len(temp_df)):
  if df[df.Suburb==temp_df.Suburb.iloc[i]].CouncilArea.nunique()==1:
    temp_df.CouncilArea.iloc[i] = df[df.Suburb==temp_df.Suburb.iloc[i]].CouncilArea.unique()[0]

for i in temp_df.index:
  df.CouncilArea.loc[i] = temp_df.CouncilArea.loc[i]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  temp_df.CouncilArea.iloc[i] = df[df.Suburb==temp_df.Suburb.iloc[i]].CouncilArea.unique()[0]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this

In [8]:
nul_df = df[df['CouncilArea'].isnull()][['Suburb','CouncilArea','Address']]

In [9]:
fill_council = ['Bayside', 'Yarra', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Whittlesea', 'Whittlesea', 'Whittlesea', 'Brimbank', 'Kingston', 'Moreland', 'Nillumbik', 'Port Phillip', 'Darebin', 'Moreland', 'Moonee Valley', 'Moonee Valley', 'Boroondara', 'Boroondara', 'Boroondara', 'Boroondara', 'Boroondara', 'Boroondara', 'Boroondara', 'Banyule', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Moonee Valley', 'Monash', 'Port Phillip', 'Port Phillip', 'Port Phillip', 'Stonnington', 'Stonnington', 'Darebin', 'Hume', 'Whittlesea', 'Whitehorse', 'Kingston', 'Nillumbik', 'Port Phillip', 'Port Phillip', 'Moonee Valley', 'Moonee Valley', 'Stonnington', 'Stonnington', 'Stonnington', 'Stonnington', 'Banyule', 'Bayside', 'Bayside', 'Melton', 'Moonee Valley', 'Moonee Valley', 'Moonee Valley', 'Moonee Valley', 'Whitehorse', 'Monash', 'Port Phillip', 'Port Phillip', 'Port Phillip', 'Darebin', 'Darebin', 'Stonnington', 'Stonnington', 'Boroondara', 'Port Phillip', 'Whittlesea', 'Whittlesea', 'Whitehorse', 'Whitehorse', 'Kingston', 'Kingston', 'Kingston', 'Kingston', 'Kingston', 'Moreland', 'Port Phillip', 'Yarra', 'Yarra', 'Moreland', 'Moreland', 'Yarra', 'Stonnington', 'Stonnington', 'Stonnington', 'Stonnington', 'Banyule', 'Banyule', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Melton', 'Melton', 'Moonee Valley', 'Moonee Valley', 'Whitehorse', 'Whitehorse', 'Monash', 'Melbourne', 'Port Phillip', 'Darebin', 'Darebin', 'Stonnington', 'Boroondara', 'Boroondara', 'Boroondara', 'Mitchell', 'Port Phillip', 'Stonnington', 'Port Phillip', 'Bayside', 'Banyule', 'Whitehorse', 'Kingston', 'Kingston', 'Nillumbik', 'Port Phillip', 'Yarra', 'Moreland', 'Maribyrnong', 'Stonnington', 'Stonnington', 'Boroondara', 'Stonnington', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Melton', 'Melton', 'Moonee Valley', 'Moonee Valley', 'Brimbank', 'Moonee Valley', 'Moonee Valley', 'Melbourne', 'Monash', 'Port Phillip', 'Darebin', 'Darebin', 'Stonnington', 'Boroondara', 'Boroondara', 'Bayside', 'Hobsons Bay', 'Banyule', 'Kingston', 'Kingston', 'Kingston', 'Kingston', 'Kingston', 'Moreland', 'Yarra', 'Melbourne', 'Melbourne', 'Stonnington', 'Banyule', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Brimbank', 'Moonee Valley', 'Brimbank', 'Macedon Ranges', 'Monash', 'Melton City', 'Port Phillip', 'Port Phillip', 'Darebin', 'Darebin', 'Darebin', 'Whitehorse', 'Whitehorse', 'Bayside', 'Bayside', 'Bayside', 'Banyule', 'Whittlesea', 'Banyule', 'Brimbank', 'Whitehorse', 'Whitehorse', 'Whitehorse', 'Whitehorse', 'Whitehorse', 'Kingston', 'Kingston', 'Nillumbik', 'Yarra', 'Yarra', 'Yarra', 'Moreland', 'Boroondara', 'Stonnington', 'Stonnington', 'Banyule', 'Banyule', 'Banyule', 'Banyule', 'Banyule', 'Bayside', 'Bayside', 'Bayside', 'Bayside', 'Moonee Valley', 'Moonee Valley', 'Brimbank', 'Moonee Valley', 'Yarra Ranges', 'Monash', 'Port Phillip', 'Port Phillip', 'Darebin', 'Whitehorse', 'Whitehorse', 'Whitehorse']

In [10]:
nul_df['CouncilArea'] = fill_council
nul_df.head()

Unnamed: 0,Suburb,CouncilArea,Address
7584,Brighton East,Bayside,7 Roberts Ct
12215,Alphington,Yarra,22 Harker St
12240,Brighton East,Bayside,9 Berkeley Gr
12241,Brighton East,Bayside,1/9 Binnie St
12242,Brighton East,Bayside,9 Dunoon Ct


In [11]:
for i in nul_df.index:
  df.CouncilArea.loc[i] = nul_df.CouncilArea.loc[i]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.CouncilArea.loc[i] = nul_df.CouncilArea.loc[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.CouncilAre

In [12]:
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,2.0,2.0,652.0,,1981.0,Monash,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,2.0,2.0,333.0,133.0,1995.0,Hobsons Bay,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,2.0,4.0,436.0,,1997.0,Hobsons Bay,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,1.0,5.0,866.0,157.0,1920.0,Hobsons Bay,-37.85908,144.89299,Western Metropolitan,6380.0


In [13]:
#df['CouncilArea'].nunique()
#df.dropna()

In [14]:
#df = df.dropna(subset = ['CouncilArea'])
filtered_df = pd.concat([df, pd.get_dummies(df['CouncilArea'], drop_first = True)], axis = 1)
filtered_df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Moreland,Nillumbik,Port Phillip,Stonnington,Unavailable,Whitehorse,Whittlesea,Wyndham,Yarra,Yarra Ranges
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,False,False,False,False,False,False,False,False,True,False
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,False,False,False,False,False,False,False,False,True,False
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,False,False,False,False,False,False,False,False,True,False
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,False,False,False,False,False,False,False,False,True,False
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,False,False,False,False,False,False,False,False,False,False
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,False,False,False,False,False,False,False,False,False,False
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,False,False,False,False,False,False,False,False,False,False
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,False,False,False,False,False,False,False,False,False,False


### Drop columns 

In [15]:
# 'Date' 열을 날짜 형식으로 변환
filtered_df['Date'] = pd.to_datetime(filtered_df['Date'], format = '%d/%m/%Y')

# 'Year'와 'Month' 열 생성
filtered_df['Year'] = filtered_df['Date'].dt.year
filtered_df['Month'] = filtered_df['Date'].dt.month

# 'Date' 열 삭제
filtered_df.drop(columns = ['SellerG', 'Address', 'Date', 'Year'], inplace = True)

* Year의 경우 2년간의 Data이고, 물가 상승률 및 기준금리가 거의 유사하던 시점이므로 연별 추세는 무시하고 진행함.
* Month의 경우 계절적 요인이 존재할 수 있어 우선 포함함.
* Adress의 경우 데이터를 사용하기 어렵고, Postcode 등 타 columns로 대체할 수 있어 우선 삭제함.
* sellerG의 경우 집값을 예측하는 과정에서 직접 사용할 변수가 아니므로 삭제함.

### `Method`, `Type`

In [16]:
print(filtered_df['Method'].unique())
print(filtered_df['Type'].unique())

['S' 'SP' 'PI' 'VB' 'SA']
['h' 'u' 't']


데이터 설명서를 로드하면 다음과 같이 각 value의 의미가 설명되어 있다.

- Method: 부동산 거래 방법을 나타냅니다. 예를 들어 "S"는 판매, "SP"는 판매 전, "PI"는 사적 판매 등이 될 수 있습니다.
    * S: 일반적인 매매 방식을 통해 판매된 경우. 공개적으로 시장에 나와 구매자에 의해 직접 구입된 경우(낙찰)_property sold
    * SP: 경매 전에 판매된 경우. 예를 들어, 경매 날짜 전에 판매자와 구매자 간의 합의에 의해 거래가 성사된 경우_property sold prior
    * PI: 경매에서 최소가에 도달하지 못해 판매되지 않은 경우. 부동산은 경매 후에도 판매자의 손에 남아있게 됨(유찰)_property passed in
    * VB: 판매자 또는 판매자의 대리인이 경매에서 입찰한 경우를 의미. 경매 과정에서 가격을 높이기 위해 사용_vendor bid
    * SA: 경매가 끝난 후 판매된 경우. 경매 도중 적절한 가격에 도달하지 못한 후, 경매 이후에 판매가 성사된 경우_sold after auction

- 다음 value들은 앞서 본 것과 같이 실제 dataset에는 없다.
    * PN; sold prior not disclosed
    * SN; sold not disclosed
    * NB; no bid
    * W; withdrawn prior to auction
    * SS; sold after auction price not disclosed.
    * N/A; price or highest bid not available.

이에 다음과 같이 one-hot encoding을 시행한다.

In [17]:
filtered_df = pd.concat([filtered_df, pd.get_dummies(df['Method'], drop_first = True), pd.get_dummies(df['Type'], drop_first = True)], axis = 1)
filtered_df = filtered_df.drop(columns=['Suburb', 'Postcode', 'Lattitude', 'Longtitude', 'Regionname', 'CouncilArea', 'Method', 'Type'])

In [18]:
filtered_df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,Wyndham,Yarra,Yarra Ranges,Month,S,SA,SP,VB,t,u
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,,,4019.0,...,False,True,False,12,True,False,False,False,False,False
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,4019.0,...,False,True,False,2,True,False,False,False,False,False
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,4019.0,...,False,True,False,3,False,False,True,False,False,False
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,,,4019.0,...,False,True,False,3,False,False,False,False,False,False
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,4019.0,...,False,True,False,6,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1245000.0,16.7,4.0,2.0,2.0,652.0,,1981.0,7392.0,...,False,False,False,8,True,False,False,False,False,False
13576,3,1031000.0,6.8,3.0,2.0,2.0,333.0,133.0,1995.0,6380.0,...,False,False,False,8,False,False,True,False,False,False
13577,3,1170000.0,6.8,3.0,2.0,4.0,436.0,,1997.0,6380.0,...,False,False,False,8,True,False,False,False,False,False
13578,4,2500000.0,6.8,4.0,1.0,5.0,866.0,157.0,1920.0,6380.0,...,False,False,False,8,False,False,False,False,False,False


### `Car`

In [19]:
print(df['Car'].unique())
print(filtered_df['Car'].unique())

[ 1.  0.  2.  6.  5.  4.  3.  8.  7.  9. 10. nan]
[ 1.  0.  2.  6.  5.  4.  3.  8.  7.  9. 10. nan]


0이 있으므로 car의 nan은 정보를 파악할 수 없는 영역이다. 따라서 일단 해당 row를 드랍하는 것으로 한다.

In [20]:
filtered_df = filtered_df.dropna(subset = ['Car'])
filtered_df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,Wyndham,Yarra,Yarra Ranges,Month,S,SA,SP,VB,t,u
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,,,4019.0,...,False,True,False,12,True,False,False,False,False,False
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,4019.0,...,False,True,False,2,True,False,False,False,False,False
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,4019.0,...,False,True,False,3,False,False,True,False,False,False
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,,,4019.0,...,False,True,False,3,False,False,False,False,False,False
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,4019.0,...,False,True,False,6,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1245000.0,16.7,4.0,2.0,2.0,652.0,,1981.0,7392.0,...,False,False,False,8,True,False,False,False,False,False
13576,3,1031000.0,6.8,3.0,2.0,2.0,333.0,133.0,1995.0,6380.0,...,False,False,False,8,False,False,True,False,False,False
13577,3,1170000.0,6.8,3.0,2.0,4.0,436.0,,1997.0,6380.0,...,False,False,False,8,True,False,False,False,False,False
13578,4,2500000.0,6.8,4.0,1.0,5.0,866.0,157.0,1920.0,6380.0,...,False,False,False,8,False,False,False,False,False,False


### `YearBuilt`

In [21]:
filtered_df.loc[df['YearBuilt'] < 1850]

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,Wyndham,Yarra,Yarra Ranges,Month,S,SA,SP,VB,t,u
2079,2,855000.0,1.6,2.0,1.0,1.0,2886.0,122.0,1830.0,4553.0,...,False,True,False,9,True,False,False,False,False,True
9968,3,1200000.0,14.2,3.0,1.0,4.0,807.0,117.0,1196.0,13366.0,...,False,False,False,6,False,False,False,True,False,False


괴상하게도 1196년 지어진 집이 있는데, 이 집의 주소를 통해 검색한 결과 해당 집의 건설 연도는 1960년임. 따라서 해당 값을 대체하기로 결정함.

In [22]:
filtered_df.loc[9968, 'YearBuilt'] = 1960.0
filtered_df.loc[9968, 'YearBuilt'] 

1960.0

다음으로, 해당 column은 지나치게 많은 NA를 가지고 있다. 

In [23]:
df.loc[df['YearBuilt'].isna()]

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
5,Abbotsford,129 Charles St,2,h,941000.0,S,Jellis,7/05/2016,2.5,3067.0,...,1.0,0.0,181.0,,,Yarra,-37.80410,144.99530,Northern Metropolitan,4019.0
8,Abbotsford,6/241 Nicholson St,1,u,300000.0,S,Biggin,8/10/2016,2.5,3067.0,...,1.0,1.0,0.0,,,Yarra,-37.80080,144.99730,Northern Metropolitan,4019.0
10,Abbotsford,411/8 Grosvenor St,2,u,700000.0,VB,Jellis,12/11/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra,-37.81100,145.00670,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13566,Vermont South,9 Winswood Cl,4,h,1250000.0,PI,Philip,26/08/2017,17.2,3133.0,...,2.0,2.0,986.0,,,Whitehorse,-37.84679,145.20051,Eastern Metropolitan,4280.0
13567,Viewbank,149 Graham Rd,5,h,1316000.0,SP,Nelson,26/08/2017,8.9,3084.0,...,3.0,3.0,696.0,,,Banyule,-37.73501,145.08341,Eastern Metropolitan,2698.0
13569,Wantirna South,12 Armagh Cr,4,h,1323000.0,S,Barry,26/08/2017,14.7,3152.0,...,2.0,2.0,551.0,,,Knox,-37.87608,145.22390,Eastern Metropolitan,7082.0
13570,Wantirna South,34 Fewster Dr,3,h,970000.0,S,Barry,26/08/2017,14.7,3152.0,...,2.0,2.0,674.0,,,Knox,-37.88360,145.22805,Eastern Metropolitan,7082.0


직관적으로, 해당 column은 집의 가격을 결정하는 대단히 중요한 변수이지만 이 데이터로 예측하는 것은 위험한데.... 특히 고려할 것은 year의 경우 정수로 떨어지지 않는 경우 그 의미를 알기 어려워진다는 것이다. 우선 선형회귀를 통해 대치해 본다.

In [24]:
# initiate sklearn's linear regression
from sklearn import linear_model
lm = linear_model.LinearRegression()

# 결측치가 없는 데이터로 모델 학습
X_sub1 = filtered_df.dropna(subset=['YearBuilt', 'Rooms', 'Bedroom2', 'Bathroom', 'Car'])
y_sub1 = X_sub1['YearBuilt']
X_sub1 = X_sub1[['Rooms', 'Bedroom2', 'Bathroom', 'Car']]

# 모델 학습
lm.fit(X_sub1, y_sub1)

# 예측할 데이터셋 선택 (BuildingArea의 결측치가 있는 행)
X_to_predict = filtered_df[filtered_df['YearBuilt'].isnull()][['Rooms', 'Bedroom2', 'Bathroom', 'Car']]

# 예측 수행
predicted_YearBuilt = lm.predict(X_to_predict)

# 예측된 값으로 결측치 대체
filtered_df.loc[filtered_df['YearBuilt'].isnull(), 'YearBuilt'] = predicted_YearBuilt

filtered_df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,Wyndham,Yarra,Yarra Ranges,Month,S,SA,SP,VB,t,u
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,,1963.746026,4019.0,...,False,True,False,12,True,False,False,False,False,False
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.000000,4019.0,...,False,True,False,2,True,False,False,False,False,False
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.000000,4019.0,...,False,True,False,3,False,False,True,False,False,False
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,,1967.357294,4019.0,...,False,True,False,3,False,False,False,False,False,False
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.000000,4019.0,...,False,True,False,6,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1245000.0,16.7,4.0,2.0,2.0,652.0,,1981.000000,7392.0,...,False,False,False,8,True,False,False,False,False,False
13576,3,1031000.0,6.8,3.0,2.0,2.0,333.0,133.0,1995.000000,6380.0,...,False,False,False,8,False,False,True,False,False,False
13577,3,1170000.0,6.8,3.0,2.0,4.0,436.0,,1997.000000,6380.0,...,False,False,False,8,True,False,False,False,False,False
13578,4,2500000.0,6.8,4.0,1.0,5.0,866.0,157.0,1920.000000,6380.0,...,False,False,False,8,False,False,False,False,False,False


In [25]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13518 entries, 0 to 13579
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Rooms              13518 non-null  int64  
 1   Price              13518 non-null  float64
 2   Distance           13518 non-null  float64
 3   Bedroom2           13518 non-null  float64
 4   Bathroom           13518 non-null  float64
 5   Car                13518 non-null  float64
 6   Landsize           13518 non-null  float64
 7   BuildingArea       7101 non-null   float64
 8   YearBuilt          13518 non-null  float64
 9   Propertycount      13518 non-null  float64
 10  Bayside            13518 non-null  bool   
 11  Boroondara         13518 non-null  bool   
 12  Brimbank           13518 non-null  bool   
 13  Cardinia           13518 non-null  bool   
 14  Casey              13518 non-null  bool   
 15  Darebin            13518 non-null  bool   
 16  Frankston          13518 no

### `BuildingArea`, `Landsize`

해당 데이터의 문제는 다음과 같다.

1. `Landsize`나 `BuildingArea`가 0인 row가 일부 존재한다.
2. `BuildingArea`가 꽤나 많은 rows에서 null이다.

해당 문제에 대해 kaggle에서 논의가 이루어지고 이를 수정하는 시도가 이루어진 바 있다. 그러나, 해당 문제를 해결하고자 한 시도가 있는 dataset은 대치를 위해 Price를 활용하였으므로 이를 해당 학습에 활용할 수 없고, 별도의 처리를 거쳐야 한다.

#### Landsize

우선, Landsize가 0인 rows의 Landsize는 BuildingArea를 통해 대체하는 것이 원칙이다. 문제는, Landsize가 0인 것이 반드시 측정 오류가 아닐 수도 있다는 것이다. 다음을 보라.

In [26]:
sum((df.Landsize == 0)&(df.BuildingArea.isna()))

878

In [27]:
sum(df.Landsize == 0)

1939

따라서, 해당 코드를 작성해 두지만 일단은 실행하지 않는다.

In [28]:
# 'Landsize'가 0.0이고 'BuildingArea'가 NaN이 아닌 경우에만 'BuildingArea' 값을 'Landsize'로 할당
#filtered_df.loc[(filtered_df['Landsize'] == 0.0) & (filtered_df['BuildingArea'].notna()), 'Landsize'] = filtered_df['BuildingArea']

#### BuildingArea

In [29]:
filtered_df.loc[filtered_df['BuildingArea'] == 0.0]

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,Wyndham,Yarra,Yarra Ranges,Month,S,SA,SP,VB,t,u
4344,2,841000.0,2.3,2.0,1.0,1.0,215.0,0.0,2000.0,6821.0,...,False,False,False,3,True,False,False,False,True,False
12226,5,1950000.0,9.7,5.0,3.0,2.0,743.0,0.0,1949.0,7809.0,...,False,False,False,9,True,False,False,False,False,False
12249,3,814000.0,12.1,3.0,1.0,2.0,542.0,0.0,1970.0,10175.0,...,False,False,False,9,True,False,False,False,False,False
12395,4,570000.0,20.6,4.0,2.0,2.0,504.0,0.0,2000.0,5833.0,...,False,False,False,9,True,False,False,False,False,False
12412,3,900000.0,7.0,3.0,2.0,2.0,120.0,0.0,2000.0,8870.0,...,False,False,False,9,False,False,False,True,True,False
13040,3,1390000.0,4.6,3.0,2.0,1.0,125.0,0.0,2002.0,7717.0,...,False,False,False,8,True,False,False,False,False,False
13207,3,1205000.0,12.3,3.0,2.0,4.0,622.0,0.0,1960.0,768.0,...,False,False,False,9,False,True,False,False,False,False
13348,5,2800000.0,9.7,5.0,2.0,2.0,1173.0,0.0,1960.0,7809.0,...,False,False,False,8,True,False,False,False,False,False
13370,3,1650000.0,10.3,3.0,2.0,2.0,623.0,0.0,1920.0,6938.0,...,False,False,False,8,False,False,True,False,False,False
13380,4,815000.0,12.1,4.0,2.0,1.0,525.0,0.0,1965.0,10175.0,...,False,False,False,8,True,False,False,False,False,False


다행히도, 이 값이 1 row이므로 우선 Landsize로 대치하여 사용한다.

In [30]:
filtered_df.loc[filtered_df['BuildingArea'] == 0.0, 'BuildingArea'] = filtered_df['Landsize']

다음으로, BuildingArea는 주요 변수이므로 예측을 통해 대치하고자 한다. 직관적으로 해당 값을 추측할 수 있는 변수로 Rooms, Bathroom, Car 등이 존재하므로 이를 통해 예측하여 대치하였다.

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    13580 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [32]:
sum(filtered_df['Landsize'].isna())

0

In [33]:
# initiate sklearn's linear regression
lm = linear_model.LinearRegression()

# 결측치가 없는 데이터로 모델 학습
X_sub2 = filtered_df.dropna(subset=['BuildingArea', 'Rooms', 'Bedroom2', 'Bathroom', 'Car'])
y_sub2 = X_sub2['BuildingArea']
X_sub2 = X_sub2[['Rooms', 'Bedroom2', 'Bathroom', 'Car']]

# 모델 학습
lm.fit(X_sub2, y_sub2)

# 예측할 데이터셋 선택 (BuildingArea의 결측치가 있는 행)
X_to_predict = filtered_df[filtered_df['BuildingArea'].isnull()][['Rooms', 'Bedroom2', 'Bathroom', 'Car']]

# 예측 수행
predicted_BuildingArea = lm.predict(X_to_predict)

# 예측된 값으로 결측치 대체
filtered_df.loc[filtered_df['BuildingArea'].isnull(), 'BuildingArea'] = predicted_BuildingArea

filtered_df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,Wyndham,Yarra,Yarra Ranges,Month,S,SA,SP,VB,t,u
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,72.665790,1963.746026,4019.0,...,False,True,False,12,True,False,False,False,False,False
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.000000,1900.000000,4019.0,...,False,True,False,2,True,False,False,False,False,False
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.000000,1900.000000,4019.0,...,False,True,False,3,False,False,True,False,False,False
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,152.625298,1967.357294,4019.0,...,False,True,False,3,False,False,False,False,False,False
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.000000,2014.000000,4019.0,...,False,True,False,6,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1245000.0,16.7,4.0,2.0,2.0,652.0,222.410420,1981.000000,7392.0,...,False,False,False,8,True,False,False,False,False,False
13576,3,1031000.0,6.8,3.0,2.0,2.0,333.0,133.000000,1995.000000,6380.0,...,False,False,False,8,False,False,True,False,False,False
13577,3,1170000.0,6.8,3.0,2.0,4.0,436.0,235.735127,1997.000000,6380.0,...,False,False,False,8,True,False,False,False,False,False
13578,4,2500000.0,6.8,4.0,1.0,5.0,866.0,157.000000,1920.000000,6380.0,...,False,False,False,8,False,False,False,False,False,False



### 범주형 데이터 처리

In [34]:
processed_df = pd.concat([filtered_df, pd.get_dummies(filtered_df.Month, drop_first = True)], axis = 1)
processed_df = processed_df.drop(columns='Month')
processed_df.columns = processed_df.columns.astype(str)

In [50]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler
scaler = MinMaxScaler()

data_scaled = scaler.fit_transform(processed_df)
processed_df = pd.DataFrame(data_scaled, columns=processed_df.columns)

In [51]:
mmprocessed_df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,3,4,5,6,7,8,9,10,11,12
0,-1.0,0.853982,-0.985294,-1.0,0.0,-1.0,-0.508457,-0.675419,0.000000,-0.428163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.0,0.197640,-0.985294,-1.0,0.0,-2.0,-0.605708,-0.604510,-2.283781,-0.428163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.831858,-0.985294,0.0,1.0,-2.0,-0.652220,0.190309,-2.283781,-0.428163,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-0.075221,-0.985294,0.0,1.0,-1.0,-0.736786,0.219698,0.129378,-0.428163,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.030973,-0.985294,0.0,0.0,0.0,-0.681818,0.100752,1.800411,-0.428163,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13513,1.0,0.507375,1.102941,1.0,1.0,0.0,0.442918,1.000916,0.618145,0.138632,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13514,0.0,0.191740,-0.352941,0.0,1.0,0.0,-0.231501,0.000000,1.119713,-0.031423,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13515,0.0,0.396755,-0.352941,0.0,1.0,2.0,-0.013742,1.150081,1.191365,-0.031423,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13516,1.0,2.358407,-0.352941,1.0,0.0,3.0,0.895349,0.268671,-1.567256,-0.031423,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [36]:
processed_df

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,3,4,5,6,7,8,9,10,11,12
0,-1.0,0.853982,-0.985294,-1.0,0.0,-1.0,-0.508457,-0.675419,0.000000,-0.428163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.0,0.197640,-0.985294,-1.0,0.0,-2.0,-0.605708,-0.604510,-2.283781,-0.428163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.831858,-0.985294,0.0,1.0,-2.0,-0.652220,0.190309,-2.283781,-0.428163,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-0.075221,-0.985294,0.0,1.0,-1.0,-0.736786,0.219698,0.129378,-0.428163,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.030973,-0.985294,0.0,0.0,0.0,-0.681818,0.100752,1.800411,-0.428163,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13513,1.0,0.507375,1.102941,1.0,1.0,0.0,0.442918,1.000916,0.618145,0.138632,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13514,0.0,0.191740,-0.352941,0.0,1.0,0.0,-0.231501,0.000000,1.119713,-0.031423,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13515,0.0,0.396755,-0.352941,0.0,1.0,2.0,-0.013742,1.150081,1.191365,-0.031423,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13516,1.0,2.358407,-0.352941,1.0,0.0,3.0,0.895349,0.268671,-1.567256,-0.031423,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


전반적으로, 제가 타 분석에 비해 '범주형 자료'의 범위를 더 깐깐하게 잡은 측면이 있는 것 같습니다. 그 결과 이후 분석에서 전반적인 스코어가 매우 낮아지는 모습을 보이는데... 이에 대해 추가 논의가 필요하기도 하고 분석을 추가하다 보면 나아질 가능성도 있고, longtitude 등을 df에서 반영하느냐 빼느냐의 문제도 있어서 이 문제에 대해 논의해야 할 것 같습니다.

## 최종 dataframe

In [37]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13518 entries, 0 to 13517
Data columns (total 61 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Rooms              13518 non-null  float64
 1   Price              13518 non-null  float64
 2   Distance           13518 non-null  float64
 3   Bedroom2           13518 non-null  float64
 4   Bathroom           13518 non-null  float64
 5   Car                13518 non-null  float64
 6   Landsize           13518 non-null  float64
 7   BuildingArea       13518 non-null  float64
 8   YearBuilt          13518 non-null  float64
 9   Propertycount      13518 non-null  float64
 10  Bayside            13518 non-null  float64
 11  Boroondara         13518 non-null  float64
 12  Brimbank           13518 non-null  float64
 13  Cardinia           13518 non-null  float64
 14  Casey              13518 non-null  float64
 15  Darebin            13518 non-null  float64
 16  Frankston          135

In [38]:
processed_df.describe()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,...,3,4,5,6,7,8,9,10,11,12
count,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,...,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0,13518.0
mean,-0.060216,0.256337,0.140857,-0.083592,0.53536,-0.389925,0.24442,0.181462,0.024997,0.1493,...,0.050155,0.070203,0.150096,0.135375,0.146619,0.113552,0.133008,0.04076,0.082409,0.044903
std,0.956438,0.943744,0.861999,0.966692,0.69231,0.962634,8.452843,4.437362,1.035992,0.736252,...,0.218273,0.255498,0.357179,0.342136,0.353739,0.317278,0.339596,0.197742,0.274996,0.207099
min,-2.0,-1.20354,-1.352941,-3.0,-1.0,-2.0,-0.935518,-1.79295,-4.791618,-1.06167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-1.0,-0.370206,-0.441176,-1.0,0.0,-1.0,-0.559197,-0.470174,-0.417681,-0.367501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.629794,0.558824,0.0,1.0,0.0,0.440803,0.529826,0.582319,0.632499,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7.0,11.945428,5.720588,17.0,7.0,8.0,914.527484,496.839901,3.045731,2.534532,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Analysis

In [39]:
def adj_R2(real, pred, X) :
    Adj_r2 = 1 - (1 - r2_score(real, pred)) * (len(real) - 1)/(len(real) - X.shape[1]-1)
    return Adj_r2

In [40]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsRegressor

In [41]:
#train-test split
from sklearn.model_selection import train_test_split

y = processed_df['Price']
X = processed_df.drop(columns = ['Price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [53]:
mmy = mmprocessed_df['Price']
mmX = mmprocessed_df.drop(columns = ['Price'])

mmX_train, mmX_test, mmy_train, mmy_test = train_test_split(mmX, mmy, test_size = 0.2, random_state = 42)

### Linear Regression

In [56]:
linearReg = linear_model.LinearRegression()
linearReg.fit(X_train, y_train)

# 테스트 세트로 예측
y_pred = linearReg.predict(X_test)

# 모델 평가
print(MSE(y_test, y_pred))
print(MAE(y_test, y_pred))
#print(r2_score(y_test, y_pred), adj_R2(y_test, y_pred, X_test))

0.5405565505960909
0.38261520428366647


In [57]:
linearReg = linear_model.LinearRegression()
linearReg.fit(mmX_train, mmy_train)

# 테스트 세트로 예측
mmy_pred = linearReg.predict(mmX_test)

# 모델 평가
print(MSE(mmy_test, mmy_pred))
print(MAE(mmy_test, mmy_pred))
#print(r2_score(y_test, y_pred), adj_R2(y_test, y_pred, X_test))

0.5405565505960909
0.38261520428366647


### LassoCV

In [48]:
linearLassoReg = LassoCV(random_state = 42)
linearLassoReg.fit(X_train, y_train)

# 테스트 세트로 예측
y_pred = linearLassoReg.predict(X_test)

# 모델 평가
print(MSE(y_test, y_pred))
print(MAE(y_test, y_pred))
#print(r2_score(y_test, y_pred), adj_R2(y_test, y_pred, X_test))

0.540329413621414
0.3834113415776813


In [59]:
linearLassoReg = LassoCV(random_state = 42)
linearLassoReg.fit(mmX_train, mmy_train)

# 테스트 세트로 예측
mmy_pred = linearLassoReg.predict(mmX_test)

# 모델 평가
print(MSE(mmy_test, mmy_pred))
print(MAE(mmy_test, mmy_pred))
#print(r2_score(y_test, y_pred), adj_R2(y_test, y_pred, X_test))

0.540329413621414
0.3834113415776813


### K-NN

In [46]:
NNReg_param = {'n_neighbors':[x for x in range(1, 21)]}

NNReg = KNeighborsRegressor()
NNReg_grid = GridSearchCV(NNReg, NNReg_param, cv = 5, refit = True)

NNReg_grid.fit(X_train, y_train)
NNReg = NNReg_grid.best_estimator_
print(NNReg_grid.best_score_, NNReg_grid.best_params_)

0.6743626794638489 {'n_neighbors': 10}


In [49]:
y_pred = NNReg.predict(X_test)
print(MAE(y_test, y_pred))
print(MSE(y_test, y_pred))

0.3328135403575605
0.28024164467957685


In [60]:
NNReg_param = {'n_neighbors':[x for x in range(1, 21)]}

NNReg = KNeighborsRegressor()
NNReg_grid = GridSearchCV(NNReg, NNReg_param, cv = 5, refit = True)

NNReg_grid.fit(mmX_train, mmy_train)
NNReg = NNReg_grid.best_estimator_
print(NNReg_grid.best_score_, NNReg_grid.best_params_)

0.6743626794638489 {'n_neighbors': 10}


In [61]:
mmy_pred = NNReg.predict(mmX_test)
print(MAE(mmy_test, mmy_pred))
print(MSE(mmy_test, mmy_pred))

0.3328135403575605
0.28024164467957685


### Smoothing Splines

In [97]:
from scipy.interpolate import make_smoothing_spline

make_smoothing_spline(X_train, y_train)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### XGBoost

In [120]:
xgbReg_param = {'n_estimators':[50*x for x in range(1, 6)], 'max_depth':[3*x for x in range(1, 11)]}

xgbReg = XGBRegressor()
xgbReg_grid = GridSearchCV(xgbReg, xgbReg_param, cv = 5, refit = True)

xgbReg_grid.fit(X_train, y_train)
xgbReg = xgbReg_grid.best_estimator_
print(xgbReg_grid.best_score_, xgbReg_grid.best_params_)

0.7903899246896147 {'max_depth': 3, 'n_estimators': 250}


In [121]:
y_pred = xgbReg.predict(X_test)

print(MSE(y_test, y_pred))

0.1618506445046498
