In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
import warnings

import matplotlib.font_manager as fm

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings(action='ignore')

# Korean font setup.
# NOTE: this is a machine-specific absolute path; point it at a local Nanum font file.
path = 'C:\\Users\\myksh\\AppData\\Local\\Microsoft\\Windows\\Fonts\\NanumGothic.ttf'
# path = 'C:\\Users\\myksh\\AppData\\Local\\Microsoft\\Windows\\Fonts\\NanumSquare.ttf'
if os.path.isfile(path):
    # Take the family name from the font file itself, so that switching `path`
    # to another font actually changes the font used by the plots.
    font_name = fm.FontProperties(fname=path).get_name()
else:
    # The font file is not at `path` on this machine: fall back to the family
    # name and let matplotlib resolve it among the installed fonts.
    font_name = 'NanumGothic'
print(font_name)
# Single place where the font family is set (it used to be overridden by a
# hard-coded 'NanumGothic' right after this call).
plt.rc('font', family=font_name)

# Prevent the minus sign from being rendered as a broken glyph.
plt.rcParams['axes.unicode_minus'] = False

NanumGothic


# 데이터 로드 및 전처리

In [6]:
# Read the training and test tables from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [7]:
# Drop the 'id' column from both frames.
train, test = (frame.drop(columns='id') for frame in (train, test))

In [8]:
# Helper that rounds the last two digits of a year down to its decade.
def tail_year(x):
    """Return the decade of a two-digit year as a zero-padded string.

    Parameters
    ----------
    x : number
        Last two digits of a year.

    Returns
    -------
    str or None
        '00', '10', ..., '90' when 0 <= x < 100; None for any other value.
    """
    if not (0 <= x < 100):
        return None
    decade = int(x // 10) * 10
    return '{:02d}'.format(decade)
def year_processing(x):
    """Round a four-digit year down to the start of its decade.

    The value is parsed numerically instead of by slicing its string form, so
    float-typed years such as 1995.0 are handled and every four-digit year
    (not only 18xx/19xx/20xx) is supported.

    Parameters
    ----------
    x : int, float or str
        A year, e.g. 1995, 1995.0 or '1995'.

    Returns
    -------
    str or None
        The decade as a string (1995 -> '1990'), or None when `x` is missing,
        not numeric, or not a four-digit year.
    """
    try:
        # int(float(...)) accepts ints, floats and numeric strings; NaN raises
        # ValueError and infinity raises OverflowError.
        year = int(float(x))
    except (TypeError, ValueError, OverflowError):
        return None
    if not 1000 <= year <= 9999:
        return None
    return str(year // 10 * 10)

In [9]:
# Convert 'Year Built' to its decade (stored as an integer) in both frames.
for frame in (train, test):
    frame['Year Built'] = frame['Year Built'].apply(year_processing).astype(int)

In [10]:
# Convert 'Year Remod/Add' to its decade (stored as an integer) in both frames.
for frame in (train, test):
    remod_decade = frame['Year Remod/Add'].apply(year_processing)
    frame['Year Remod/Add'] = remod_decade.astype(int)

In [11]:
# Garage capacity (number of cars) and garage area carry similar information,
# so the capacity column is dropped.
train, test = (frame.drop(columns='Garage Cars') for frame in (train, test))

In [12]:
# Remove the year-2207 record: drop every training row whose garage build year
# is 2022 or later.
bad_garage_rows = train.index[train['Garage Yr Blt'] >= 2022]
train = train.drop(index=bad_garage_rows)

In [13]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical quality columns.
cols = ['Exter Qual','Kitchen Qual','Bsmt Qual']
for i in cols:
    # One encoder per column, fitted once on the values of BOTH frames.
    # Re-fitting on `test` (as before) could assign a different integer to the
    # same grade whenever train and test do not contain the same set of grades.
    le = LabelEncoder()
    le.fit(pd.concat([train[i], test[i]]))
    train[i] = le.transform(train[i])
    test[i] = le.transform(test[i])

# Modeling

In [14]:
# Evaluation metric taken from the competition rules.
def NMAE(true, pred):
    """Normalized mean absolute error: mean(|true - pred|) / mean(|true|).

    Both inputs are converted to NumPy arrays first so the comparison is
    positional. Without this, two pandas Series with different indexes would
    be aligned on their labels and produce NaN, and plain lists would fail.

    Parameters
    ----------
    true : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, in the same order as `true`.

    Returns
    -------
    float
        The NMAE score (lower is better).
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [15]:
# Separate the regression target from the feature columns.
y = train['target']
X = train.drop(columns='target')

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation split (seeded).
# Note: X_test / y_test are this validation split, not the competition `test` frame.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts
(X_train.shape, X_test.shape)

((1079, 12), (270, 12))

In [48]:
from sklearn.linear_model import ElasticNetCV, LassoCV, LassoLarsCV, Ridge

# The four linear regressors that form the linear part of the ensemble.
ridge = Ridge(alpha=1e-6)
lassocv = LassoCV(eps=1e-7)
elasticnetcv = ElasticNetCV(eps=1e-15)
lassolarscv = LassoLarsCV()

In [49]:
# Fit every linear model on the training split.
for linear_model in (lassocv, ridge, lassolarscv):
    linear_model.fit(X_train, y_train)
# Fitted last and left as the cell's final expression so its repr is displayed.
elasticnetcv.fit(X_train, y_train)

ElasticNetCV(eps=1e-15)

In [50]:
# Validation-split predictions of each linear model.
lassocv_pred, ridge_pred, lassolarscv_pred, elasticnetcv_pred = (
    fitted.predict(X_test)
    for fitted in (lassocv, ridge, lassolarscv, elasticnetcv)
)

In [52]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# The tree ensembles are stochastic; a fixed seed makes the validation score
# and the submission reproducible across kernel restarts.
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
rf.fit(X_train,y_train)

xgb = XGBRegressor(n_estimators=1000, random_state=0)
xgb.fit(X_train,y_train)

gb = GradientBoostingRegressor(n_estimators=1000, random_state=0)
gb.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=1000)

In [53]:
# Validation-split predictions of each tree ensemble.
pred_rf, pred_xgb, pred_gb = (
    fitted.predict(X_test) for fitted in (rf, xgb, gb)
)

In [54]:
# Weighted blend on the validation split:
# 40% linear models (weights 0.3 / 0.2 / 0.2 / 0.3) + 60% tree ensembles (0.5 / 0.3 / 0.2).
linear_blend = (
    (lassocv_pred * 0.3)
    + (ridge_pred * 0.2)
    + (lassolarscv_pred * 0.2)
    + (elasticnetcv_pred * 0.3)
)
tree_blend = (pred_rf * 0.5) + (pred_xgb * 0.3) + (pred_gb * 0.2)
final_pred = linear_blend * 0.4 + tree_blend * 0.6

In [55]:
# Validation score of the blended prediction (competition metric).
NMAE(true=y_test, pred=final_pred)

0.10307607413174652

# 예측 및 저장

In [56]:
# Linear-model predictions for the competition test frame.
# These reuse (and overwrite) the names that held the validation predictions.
lassocv_pred, ridge_pred, lassolarscv_pred, elasticnetcv_pred = (
    fitted.predict(test)
    for fitted in (lassocv, ridge, lassolarscv, elasticnetcv)
)

In [58]:
# Tree-ensemble predictions for the competition test frame.
pred_rf, pred_xgb, pred_gb = (
    fitted.predict(test) for fitted in (rf, xgb, gb)
)

# Same weighting as the validation blend: 40% linear models + 60% tree ensembles.
linear_blend = (
    (lassocv_pred * 0.3)
    + (ridge_pred * 0.2)
    + (lassolarscv_pred * 0.2)
    + (elasticnetcv_pred * 0.3)
)
tree_blend = (pred_rf * 0.5) + (pred_xgb * 0.3) + (pred_gb * 0.2)
final_pred1 = linear_blend * 0.4 + tree_blend * 0.6

In [59]:
# Load the sample submission and set its 'target' column to the blended
# test predictions.
sub = pd.read_csv('./data/sample_submission.csv').assign(target=final_pred1)

In [60]:
import os

# Write the submission file, creating the output directory first so the cell
# also works on a fresh checkout where './submission_data' does not exist yet.
submission_path = './submission_data/ensemble_submission2.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sub.to_csv(submission_path, index=False)

- lassocv, ridge, lassolarscv, elasticnetcv, rf, xgb, gb ensemble 
  - BUT 제일 처음 실행한 Base Model : RandomForestRegressor 보다 점수가 좋지 않다.