# 問題 : 波士頓房價預測
Kaggle : https://www.kaggle.com/c/house-prices-advanced-regression-techniques

Kaggle實戰規則說明

解題步驟:

1.下載資料集

2.房價因素與資料欄位研究

3.特徵選取

4.資料正規化

5.選擇模型

6.預測結果合成submission格式

7.上傳答案看分數排名

# 讀取訓練集 / 測試集資料

In [1]:
import pandas as pd


# 讀取資料
df_train = pd.read_csv('/content/train.csv',sep=',')
df_test = pd.read_csv('/content/test.csv',sep=',')

print("df_train : ",df_train.shape)
print("df_test  : ",df_test.shape)


df_train :  (1460, 81)
df_test  :  (1459, 80)


# 資料查看並資料清洗

In [0]:
# 定義function方便使用
def show_lose_data(df):
  na_column = df.isnull().sum()
  na_column = na_column[na_column > 0].sort_values(ascending=False)
  na_rate = na_column / df.shape[0]
  na_type = df[na_column.index].dtypes
  return pd.concat([na_column, na_rate, na_type], axis=1, keys=['count','rate','type'])


In [0]:
# 拿掉Id方便處理資料
X_train = df_train.drop('Id', axis=1)
X_test = df_test.drop('Id', axis=1)
# all_data = df_all.drop('Id', axis=1)

# 取訓練集的目標值
target = df_train.SalePrice

# 拿掉預測的列
X_train.drop('SalePrice', inplace=True, axis=1)

In [4]:
show_lose_data(X_train)

Unnamed: 0,count,rate,type
PoolQC,1453,0.995205,object
MiscFeature,1406,0.963014,object
Alley,1369,0.937671,object
Fence,1179,0.807534,object
FireplaceQu,690,0.472603,object
LotFrontage,259,0.177397,float64
GarageYrBlt,81,0.055479,float64
GarageType,81,0.055479,object
GarageFinish,81,0.055479,object
GarageQual,81,0.055479,object


In [5]:
# 拿掉列資料缺失值超過15%
lose_data = ['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','LotFrontage']
X_train.drop(lose_data, inplace=True, axis=1)  
X_test.drop(lose_data, inplace=True, axis=1)  

show_lose_data(X_train)

Unnamed: 0,count,rate,type
GarageCond,81,0.055479,object
GarageQual,81,0.055479,object
GarageFinish,81,0.055479,object
GarageYrBlt,81,0.055479,float64
GarageType,81,0.055479,object
BsmtFinType2,38,0.026027,object
BsmtExposure,38,0.026027,object
BsmtFinType1,37,0.025342,object
BsmtCond,37,0.025342,object
BsmtQual,37,0.025342,object


In [0]:
def update_object_column(df):
  na_object_column = df.isnull().sum()
  na_object_column = na_object_column[na_object_column > 0].sort_values(ascending=False)
  object_column = df[na_object_column.index].select_dtypes(include='object').columns.tolist()
  for i in object_column:
    df[i].fillna(value='None',inplace=True)
  
  

In [7]:
# object 類型缺失資料補成None
update_object_column(X_train)
update_object_column(X_test)
  
# na_object_column = train_data.isnull().sum()
# train_object_column = train_object_column[train_object_column > 0].sort_values(ascending=False)
# train_object_column = train_data[train_object_column.index].select_dtypes(include='object').columns.tolist()

# for i in object_column:
#   df_all[i].fillna(value='None',inplace=True)

show_lose_data(X_train)

Unnamed: 0,count,rate,type
GarageYrBlt,81,0.055479,float64
MasVnrArea,8,0.005479,float64


In [0]:
def update_float_column(df):
  na_float_column = df.isnull().sum()
  na_float_column = na_float_column[na_float_column > 0].sort_values(ascending=False)
  float_column = df[na_float_column.index].select_dtypes(include='float64').columns.tolist()
  for i in float_column:
    df[i].fillna(method='ffill',inplace=True)

In [9]:
# float64 類型缺失資料補前一筆不是Nan的值
update_float_column(X_train)
update_float_column(X_test)

show_lose_data(X_test)

Unnamed: 0,count,rate,type


# 資料正規化

In [0]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [0]:
onehot_train = pd.get_dummies(X_train)
onehot_test = pd.get_dummies(X_test)

X_train, X_test = onehot_train.align(onehot_test, join='left', axis=1)

my_imputer = SimpleImputer()
X_train = my_imputer.fit_transform(X_train)
X_test = my_imputer.transform(X_test)


# # 正規化
# for i in X_train:
#   X_train[i] = X_train[i].astype(float)
#   X_train[i] = preprocessing.minmax_scale(X_train[i], axis=0, feature_range=(0,1))


# for i in X_test:
#   X_test[i] = X_test[i].astype(float)
#   X_test[i] = preprocessing.minmax_scale(X_test[i], axis=0, feature_range=(0,1))

  

# float64 類型缺失資料補前一筆不是Nan的值
# update_float_column(X_train)
# update_float_column(X_test)


# show_lose_data(X_train)

# show_lose_data(X_test)

In [12]:
print(X_train.shape)

print(X_test.shape)

(1460, 279)
(1459, 279)


# 把訓練集資料，切割出來訓練

# 挑選訓練模型

In [0]:
#使用線性迴歸
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt 
from sklearn.metrics import r2_score

In [15]:
model = LinearRegression()
model.fit(X_train, target)

pred = model.predict(X_test)

print(pred)
# rmse = sqrt(mean_squared_error(y_test, y_pred))
# r2 = r2_score(y_test, y_pred)

# print("RMSE : ",rmse)
# print("R2 Score : ",r2)

[112267.82318677 160626.76225864 189027.80261423 ... 175849.33953765
 111582.41827067 223067.62457415]


In [0]:
# 使用kaggle常勝軍開發的xgb
import xgboost

In [14]:
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
xgb.fit(X_train, target)
pred = xgb.predict(X_test)

print(pred)


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[131743.38  148558.62  187524.45  ... 162934.02  119913.914 220587.84 ]


In [16]:

my_submission = pd.DataFrame({'Id':df_test.Id,'SalePrice':pred})

print(my_submission)

my_submission.to_csv('submission.csv', index=False)

        Id      SalePrice
0     1461  131743.375000
1     1462  148558.625000
2     1463  187524.453125
3     1464  195469.093750
4     1465  189594.562500
5     1466  172250.468750
6     1467  170286.906250
7     1468  166102.250000
8     1469  189024.546875
9     1470  124176.093750
10    1471  196588.468750
11    1472   93607.359375
12    1473  103203.414062
13    1474  158766.296875
14    1475  133712.781250
15    1476  378531.937500
16    1477  249078.875000
17    1478  295683.031250
18    1479  215715.421875
19    1480  506980.125000
20    1481  318995.062500
21    1482  207714.015625
22    1483  164575.296875
23    1484  169277.265625
24    1485  168928.843750
25    1486  191307.859375
26    1487  333521.593750
27    1488  238318.468750
28    1489  208399.000000
29    1490  198799.140625
...    ...            ...
1429  2890   86148.960938
1430  2891  131433.156250
1431  2892   45865.554688
1432  2893   84459.421875
1433  2894   48467.894531
1434  2895  330807.937500
1435  2896  

In [0]:
#使用SVM
from sklearn import svm

clf = svm.SVR(kernel="linear")
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE : ",rmse)
print("R2 Score : ",r2)

  y = column_or_1d(y, warn=True)


RMSE :  83790.00679940736
R2 Score :  -0.00611506236048065


In [0]:
#使用sklearn模型選擇地圖找到的演算法Ridge
from sklearn import linear_model

reg = linear_model.Ridge(alpha=.5)
reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE : ",rmse)
print("R2 Score : ",r2)

RMSE :  28150.74905881412
R2 Score :  0.8864353647684935


In [0]:
#使用kaggle常勝軍開發的xgb
import xgboost
# Let's try XGboost algorithm to see if we can get better results
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE : ",rmse)
print("R2 Score : ",r2)

RMSE :  26036.627606959475
R2 Score :  0.902852268776017
