## Linear Regression 집값 예측
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import my_utils as my

#### 데이터 준비

In [2]:
from sklearn.datasets import fetch_california_housing

house = fetch_california_housing()

In [3]:
type(house)

sklearn.utils._bunch.Bunch

In [4]:
house.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [5]:
df = pd.DataFrame(house.data, columns=house.feature_names)

In [6]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [7]:
df["target"] = house.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [8]:
df.shape

(20640, 9)

In [11]:
# 결측치 확인
df.isna().sum(axis=0)

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [14]:
# 중복치 확인
df.duplicated().sum()

0

#### 데이터 분리

In [15]:
x_data = df[["MedInc", "HouseAge", "AveRooms"]]
y_data = df["target"]

In [16]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=1)

#### 학습

In [20]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)

In [24]:
y_pred = lr.predict(x_train)

In [27]:
from sklearn.metrics import mean_squared_error

print("train RMSE :", np.sqrt(mean_squared_error(y_train, y_pred)))

train RMSE : 0.8066315518941563


In [28]:
y_pred = lr.predict(x_test)
print("test RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))

test RMSE : 0.8041478753587123


In [31]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
x_train = ss.fit_transform(x_train)

In [32]:
lr = LinearRegression()
lr.fit(x_train, y_train)

In [33]:
y_pred = lr.predict(x_train)
print("train RMSE :", np.sqrt(mean_squared_error(y_train, y_pred)))

train RMSE : 0.8066315518941563


In [34]:
x_test = ss.transform(x_test)



In [35]:
y_pred = lr.predict(x_test)
print("test RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))

test RMSE : 2.2351624561436196


In [38]:
from sklearn.model_selection import cross_val_score
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score

mse = cross_val_score(lr, x_test, y_test, scoring="neg_mean_squared_error", cv=3)
mse

array([-0.66275086, -0.65110352, -0.61006708])

In [41]:
np.mean(np.sqrt(-mse))   # neg_mean_squared_error로 scoring 했으므로 -mse

0.8006909616109018

In [42]:
#### 결정트리
from sklearn.tree import DecisionTreeRegressor

In [43]:
dtr = DecisionTreeRegressor(random_state=1)
dtr.fit(x_train, y_train)

In [44]:
y_pred = dtr.predict(x_train)
print("train RMSE :", np.sqrt(mean_squared_error(y_train, y_pred)))

train RMSE : 2.339875152711495e-16


In [47]:
y_pred = dtr.predict(x_test)
print("test RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))

test RMSE : 1.2782168838845973


In [48]:
#### RnadomForest
from sklearn.ensemble import RandomForestRegressor

In [49]:
rfr = RandomForestRegressor(random_state=1)
rfr.fit(x_train, y_train)

In [50]:
y_pred = rfr.predict(x_train)
print("train RMSE :", np.sqrt(mean_squared_error(y_train, y_pred)))

train RMSE : 0.2916393691435194


In [51]:
y_pred = rfr.predict(x_test)
print("test RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))

test RMSE : 1.1752306034330138


In [52]:
#### SVM
from sklearn.svm import SVR

In [54]:
svr = SVR()
svr.fit(x_train, y_train)

In [56]:
y_pred = svr.predict(x_train)
print("train RMSE :", np.sqrt(mean_squared_error(y_train, y_pred)))

train RMSE : 0.7389831273410845


In [57]:
y_pred = svr.predict(x_test)
print("test RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))

test RMSE : 1.3207346805719453


In [None]:
#### sklearn.linear_model.Lasso
#### sklearn.Linear_model.Ridge