In [None]:
# 필요한 라이브러리 불러오기
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import warnings

# Readable column names for the property tables. The list is passed as
# `names=` (together with `header=0`) when the properties CSVs are read, so it
# replaces the header row of those files position by position.
names = ['parcelid', 'air_conditioning_type', 'architectural_style', 
         'area_basement', 'num_bathroom', 'num_bedroom', 'framing_type',
         'building_quality', 'num_bathroom_calc', 'deck_type',
         'area_firstfloor_finished', 'area_total_calc',
         'area_living_finished', 'perimeter_living',
         'area_total', 'area_firstfloor_unfinished',
         'area_base', 'fips', 'num_fireplace', 'num_fullbath',
         'num_garagecar', 'area_garage', 'hashottuborspa',
         'heating_type', 'latitude', 'longitude',
         'area_lot', 'num_pool', 'area_pools', 'pooltypeid10',
         'pooltypeid2', 'pooltypeid7', 'property_land_use_code',
         'property_land_use_type', 'property_zoning_desc',
         'census_raw_tract_block', 'region_city', 'region_county',
         'region_neighborhood', 'region_zipcode', 'num_room', 'story_type',
         'num_34_bath', 'material_type', 'num_unit',
         'area_patio', 'area_shed', 'build_year',
         'num_stories', 'flag_fireplace', 'tax_assessed_structure_value',
         'tax_assessed_parcel_value', 'tax_assessment_year', 'tax_assessed_land_value',
         'tax_property', 'tax_delinquency_flag', 'tax_delinquency_year',
         'census_tract_block']

# Load the data: yearly transaction tables (with parsed sale dates), the two
# yearly property snapshots (renamed via `names`) and the sample submission.
train_df = pd.read_csv('/kaggle/input/zillow-prize-1/train_2016_v2.csv', parse_dates=["transactiondate"])
train_df2 = pd.read_csv('/kaggle/input/zillow-prize-1/train_2017.csv', parse_dates=["transactiondate"])
prop_df = pd.read_csv('/kaggle/input/zillow-prize-1/properties_2016.csv', names=names, header=0, low_memory=False)
prop_df2 = pd.read_csv('/kaggle/input/zillow-prize-1/properties_2017.csv', names=names, header=0, low_memory=False)
sample_df = pd.read_csv('/kaggle/input/zillow-prize-1/sample_submission.csv')


# Convert float64 property columns to float32 to save memory. Both yearly
# snapshots are converted: downcasting only one of them leaves the two years
# stored at different precision, and stacking a float32 frame on a float64
# frame upcasts the result back to float64.
# NOTE(review): float32 keeps roughly 7 significant digits; confirm that is
# acceptable for large identifier-like float columns.
for props in (prop_df, prop_df2):
    for col, dtype in zip(props.columns, props.dtypes):
        if dtype == np.float64:
            props[col] = props[col].astype(np.float32)

# Show (up to 999) columns when a frame is displayed, and silence all warnings.
pd.set_option('display.max_columns', 999)
warnings.simplefilter('ignore')

In [None]:
# Stack the 2016 and 2017 tables row-wise (the original row labels are kept).
# NOTE(review): if both property files list the same parcels, prop_df3 holds
# more than one row per parcelid - confirm before joining on parcelid alone.
train_df = pd.concat((train_df, train_df2))
prop_df3 = pd.concat((prop_df, prop_df2))

In [None]:
# Add the 'abs_logerror' and 'transaction_month' variables.
train_df['abs_logerror'] = train_df['logerror'].abs().to_numpy()
train_df['transaction_month'] = train_df['transactiondate'].dt.month

# The submission template spells the key 'ParcelId'; mirror it as 'parcelid'
# so it can be joined to the property table.
sample_df['parcelid'] = sample_df['ParcelId']

# Join every transaction to the property snapshot of its own year.
# Merging on 'parcelid' against the stacked 2016+2017 property table matches a
# parcel once per snapshot it appears in, which duplicates its transactions and
# pairs them with attributes from the wrong year.
# A positional boolean mask is used because train_df may carry repeated row
# labels after the yearly tables were stacked.
sold_in_2017 = (train_df['transactiondate'].dt.year >= 2017).to_numpy()
train_df = pd.concat(
    [
        train_df[~sold_in_2017].merge(prop_df, on='parcelid', how='left'),
        train_df[sold_in_2017].merge(prop_df2, on='parcelid', how='left'),
    ],
    ignore_index=True,
)

# Test rows: one row per parcel in the submission template, with 2017 attributes.
test_df = pd.merge(sample_df, prop_df2, how='left', on='parcelid')

In [None]:
# Percentage of missing values per variable (left as NaN for complete columns).
missing_df = (
    train_df.isnull().sum()
    .rename_axis('column_name')
    .reset_index(name='missing_count')
)
has_missing = missing_df['missing_count'] > 0
missing_df['missing_per'] = missing_df.loc[has_missing, 'missing_count'] / len(train_df) * 100
missing_df.sort_values('missing_per', ascending=False)

In [None]:
# Encode the object-typed variables.
# NOTE: every statement in this cell is commented out, so nothing below runs.
#columns = prop_df.select_dtypes(include=object).columns
#obj = prop_df[columns]
#columns = np.array(columns)
#prop_df.drop(columns=columns, inplace=True)

#obj['hashottuborspa'] = obj['hashottuborspa'].fillna(0)
#obj['flag_fireplace'] = obj['flag_fireplace'].fillna(0)
#obj['tax_delinquency_flag'] = obj['tax_delinquency_flag'].fillna(0)

#obj['hashottuborspa'] = obj['hashottuborspa'].astype(int)
#obj['flag_fireplace'] = obj['flag_fireplace'].astype(int)
#obj['tax_delinquency_flag'].replace('Y', 1, inplace=True)

In [None]:
# Drop the variables with more than 50% missing values and keep the result in
# simple_train_df (train_df itself is left untouched).
sparse_columns = missing_df.loc[missing_df['missing_per'] > 50, 'column_name'].tolist()
simple_train_df = train_df.drop(columns=sparse_columns)

# Missing-value summary of the *reduced* frame, so the table shows the effect
# of the drop. It is stored under its own name: missing_df must keep describing
# train_df, otherwise re-running this cell would no longer drop any column.
simple_missing_df = (
    simple_train_df.isnull().sum()
    .rename_axis('column_name')
    .reset_index(name='missing_count')
)
still_missing = simple_missing_df['missing_count'] > 0
simple_missing_df['missing_per'] = (
    simple_missing_df.loc[still_missing, 'missing_count'] / simple_train_df.shape[0] * 100
)
simple_missing_df.sort_values(by='missing_per', ascending=False)

In [None]:
# Fill the missing 'building_quality' values with an XGBoost regressor.

import xgboost as xgb

impute_target = 'building_quality'
# Categorical variables plus the heavily missing 'heating_type', 'num_unit'
# and 'area_lot' are left out of the feature matrix.
excluded_cols = ['transactiondate', 'property_zoning_desc', 'property_land_use_code',
                 'heating_type', 'num_unit', 'area_lot']

# Training set: only the rows of simple_train_df without any null value.
full_data = simple_train_df.dropna()
train_y = full_data[impute_target]
train_x = full_data.drop(columns=[impute_target] + excluded_cols)

# Rows to correct: those whose target is null in simple_train_df. After the
# same columns are removed, rows that still contain a null are discarded.
inter_data = simple_train_df[simple_train_df[impute_target].isnull()]
inter_data_x = inter_data.drop(columns=[impute_target] + excluded_cols).dropna()
inter_data_x.info()

# Declare the model
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=8,
)

# Train on the complete rows, then predict the target for inter_data_x.
model.fit(train_x, train_y)
pred = model.predict(inter_data_x)
pred

In [None]:
# First attempt: simply round the predictions to obtain building quality.
# As the two printouts show, the distribution of building quality in the full
# data and the distribution of the rounded predictions differed a lot.
# So instead of plain rounding, thresholds will be chosen that map the
# predictions onto the four most frequent values (7, 4, 1, 10).
inter_data_x['building_quality'] = np.round(pred)

observed_quality = simple_train_df['building_quality']
print(observed_quality.value_counts() / observed_quality.notna().sum() * 100)
print(inter_data_x['building_quality'].value_counts() / len(inter_data_x) * 100)

In [None]:
# Map the predictions onto the four most frequent 'building_quality' values.
# The thresholds were adjusted over several runs until the result resembled
# the distribution of the original full data as closely as possible.
# original data  - 7:51%, 4:41%, 1:4%, 10:2%
# predicted data - 7:53%, 4:42%, 1:4%, 10:0.5%

# (inclusive upper bound, assigned value), scanned in ascending order.
# A prediction that satisfies no bound (NaN) contributes no entry.
quality_cutoffs = ((3.3, 1), (5.5, 4), (8.5, 7), (float('inf'), 10))

building = []
for score in pred:
    for upper, level in quality_cutoffs:
        if score <= upper:
            building.append(level)
            break
inter_data_x['building_quality'] = building

observed_quality = simple_train_df['building_quality']
print(observed_quality.value_counts() / observed_quality.notna().sum() * 100)
print(inter_data_x['building_quality'].value_counts() / len(inter_data_x) * 100)

In [None]:
# Display the column labels currently present in simple_train_df.
simple_train_df.columns

In [None]:
# Write the corrected 'building_quality' values back into simple_train_df.
# inter_data_x is a row subset of simple_train_df, so its row labels identify
# exactly the rows that were predicted. Assigning by row label fills only those
# rows in one vectorised step. Matching on 'parcelid' instead is slow, can
# overwrite observed values in other rows of the same parcel, and can fail on
# duplicate labels. This form also leaves both frames' indexes untouched, so
# the cell can be re-run safely.
assert simple_train_df.index.is_unique, "row labels must be unique for the write-back"
simple_train_df.loc[inter_data_x.index, 'building_quality'] = inter_data_x['building_quality'].to_numpy()

# info() shows how many 'building_quality' values are non-null after the fill.
simple_train_df.info()

In [None]:
# Restore the missing 'num_unit' values.

impute_target = 'num_unit'
excluded_cols = ['transactiondate', 'property_zoning_desc', 'property_land_use_code',
                 'heating_type', 'area_lot']

# Training set: rows of simple_train_df without any null value.
full_data = simple_train_df.dropna()
train_y = full_data[impute_target]
train_x = full_data.drop(columns=[impute_target] + excluded_cols)

# Rows to correct: target is null; rows with other nulls left are discarded.
inter_data = simple_train_df[simple_train_df[impute_target].isnull()]
inter_data_x = inter_data.drop(columns=[impute_target] + excluded_cols).dropna()

model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=8,
)

# Train on train_x / train_y, then predict the target for inter_data_x.
model.fit(train_x, train_y)
pred = model.predict(inter_data_x)
pred

In [None]:
# Round the predictions and compare the observed and predicted 'num_unit'
# distributions (both in percent).
inter_data_x['num_unit'] = np.round(pred)

observed_units = simple_train_df['num_unit']
print(observed_units.value_counts() / observed_units.notna().sum() * 100)
print(inter_data_x['num_unit'].value_counts() / len(inter_data_x) * 100)

In [None]:
# Threshold adjustment: (inclusive upper bound, assigned value) in ascending
# order; any prediction above the last bound (or not comparable) becomes 4.
unit_cutoffs = ((1.003, 1), (2.9, 2))

num_unit = [
    next((units for upper, units in unit_cutoffs if score <= upper), 4)
    for score in pred
]

inter_data_x['num_unit'] = num_unit

observed_units = simple_train_df['num_unit']
print(observed_units.value_counts() / observed_units.notna().sum() * 100)
print(inter_data_x['num_unit'].value_counts() / len(inter_data_x) * 100)

In [None]:
# Write the corrected 'num_unit' values back into simple_train_df.
# inter_data_x is a row subset of simple_train_df, so its row labels identify
# exactly the predicted rows. Assigning by row label fills only those rows in
# one vectorised step. Matching on 'parcelid' instead is slow, can overwrite
# observed values in other rows of the same parcel, and can fail on duplicate
# labels. This form also leaves both frames' indexes untouched, so the cell can
# be re-run safely.
assert simple_train_df.index.is_unique, "row labels must be unique for the write-back"
simple_train_df.loc[inter_data_x.index, 'num_unit'] = inter_data_x['num_unit'].to_numpy()
simple_train_df.info()

In [None]:
# Restore the missing 'heating_type' values.

impute_target = 'heating_type'
excluded_cols = ['transactiondate', 'property_zoning_desc', 'property_land_use_code', 'area_lot']

# Training set: rows of simple_train_df without any null value.
full_data = simple_train_df.dropna()
train_y = full_data[impute_target]
train_x = full_data.drop(columns=[impute_target] + excluded_cols)

# Rows to correct: target is null; rows with other nulls left are discarded.
inter_data = simple_train_df[simple_train_df[impute_target].isnull()]
inter_data_x = inter_data.drop(columns=[impute_target] + excluded_cols).dropna()

model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=8,
)

# Train on train_x / train_y, then predict the target for inter_data_x.
model.fit(train_x, train_y)
pred = model.predict(inter_data_x)
pred

In [None]:
# Round the predictions and compare the observed and predicted 'heating_type'
# distributions (both in percent).
inter_data_x['heating_type'] = np.round(pred)

observed_heating = simple_train_df['heating_type']
print(observed_heating.value_counts() / observed_heating.notna().sum() * 100)
print(inter_data_x['heating_type'].value_counts() / len(inter_data_x) * 100)

In [None]:
# Threshold adjustment: (inclusive upper bound, assigned value), scanned in
# ascending order. The narrow (5.9, 6.1] band maps to 6 and sits between two
# bands that both map to 7. A prediction that satisfies no bound (NaN)
# contributes no entry.
heating_cutoffs = ((3.3, 2), (5.9, 7), (6.1, 6), (7.3, 7), (float('inf'), 24))

heating_type = []
for score in pred:
    for upper, code in heating_cutoffs:
        if score <= upper:
            heating_type.append(code)
            break

inter_data_x['heating_type'] = heating_type

observed_heating = simple_train_df['heating_type']
print(observed_heating.value_counts() / observed_heating.notna().sum() * 100)
print(inter_data_x['heating_type'].value_counts() / len(inter_data_x) * 100)

In [None]:
# Write the corrected 'heating_type' values back into simple_train_df.
# inter_data_x is a row subset of simple_train_df, so its row labels identify
# exactly the predicted rows. Assigning by row label fills only those rows in
# one vectorised step. Matching on 'parcelid' instead is slow, can overwrite
# observed values in other rows of the same parcel, and can fail on duplicate
# labels. This form also leaves both frames' indexes untouched, so the cell can
# be re-run safely.
assert simple_train_df.index.is_unique, "row labels must be unique for the write-back"
simple_train_df.loc[inter_data_x.index, 'heating_type'] = inter_data_x['heating_type'].to_numpy()

simple_train_df.info()

In [None]:
# Missing-value summary of simple_train_df after the imputation steps.
# The percentage is taken over simple_train_df's own row count, so numerator
# and denominator always describe the same frame.
missing_df = (
    simple_train_df.isnull().sum()
    .rename_axis('column_name')
    .reset_index(name='missing_count')
)
has_missing = missing_df['missing_count'] > 0
missing_df['missing_per'] = (
    missing_df.loc[has_missing, 'missing_count'] / simple_train_df.shape[0] * 100
)
missing_df.sort_values(by='missing_per', ascending=False)

In [None]:
# Feature frame for the final model: the target ('logerror'), its absolute
# value and the free-text zoning description are removed.
real_train_df = simple_train_df.drop(['logerror', 'abs_logerror', 'property_zoning_desc'], axis=1)

# Test rows have no sale date of their own. Copying train_df's months would
# align two unrelated tables by row label, giving arbitrary months for some
# rows and NaN for the rest, so one explicit month is used for every test row.
# NOTE(review): 10 is a placeholder - confirm which month(s) must be predicted
# and re-run the prediction with each of them.
TEST_TRANSACTION_MONTH = 10
test_df['transaction_month'] = TEST_TRANSACTION_MONTH

In [None]:
# Display the column labels currently present in test_df.
test_df.columns

In [None]:
from sklearn.model_selection import train_test_split


# 1. catboost

! pip install catboost
from catboost import CatBoostRegressor, Pool

X_train, X_test, y_train, y_test = train_test_split(real_train_df, simple_train_df.logerror, test_size=0.2, random_state=42)

cb_params = {
    'iterations': 400,
    'learning_rate': 0.035,
    'depth': 7,
    'verbose': 20,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 1234
}

model = CatBoostRegressor(**cb_params)
model.fit(X_train, y_train)

pred = model.predict(X_test)
pred