# 집값 예측 (Linear Regression)
---

In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my

### 데이터 준비

In [61]:
from sklearn.datasets import fetch_california_housing  #fetch를 활용한 함수들은 local에 포함된것이 아닌 인터넷으로 부터 함수를 활용하여 데이터를 뽑아오는 것을 말한다.

housing = fetch_california_housing()
type(housing)

sklearn.utils.Bunch

In [62]:
housing.keys() #keys를 활용하여 housing 데이터가 어떤식으로 구성돼있는지 확인

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [63]:
df = pd.DataFrame(housing.data, columns = housing.feature_names) #housing 데이터를 데이터프레임화해서 만든다
df['target'] = housing.target #마지막 target column도 추가 (여기선 y가 집값을 의미한다)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [64]:
df.shape

(20640, 9)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [66]:
df.isnull().sum(axis=0) #null이 몇개인지 확인

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [67]:
df.duplicated().sum() #중복된 자료 개수 확인

0

In [68]:
# X, y
X = df[['MedInc','HouseAge','AveRooms']]
y = df['target']

In [69]:
# 테스트 데이터 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2022)

# 스케일링 x
# 학습
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)


# RMSE
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_train,y_pred)
rmse = np.sqrt(mse)
rmse

0.8065352829045949

In [70]:
# 스케일링
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# 학습
lr.fit(X_train,y_train)
y_pred = lr.predict(X_train)

# RMSE
mse = mean_squared_error(y_train,y_pred)
rmse=np.sqrt(mse)
rmse

0.8065352829045949

In [71]:
from sklearn.model_selection import cross_val_score

#성능 테스트
mse = cross_val_score(lr, X_test, y_test, 
                scoring='neg_mean_squared_error', #neg_ 는 MSE를 음수로 만든다. 이유는 높을수록 좋게 만들게 하기위해서 저렇게 만듦
                cv=3)

#결과값을 보면 앞에 neg_를 붙였기 때문에 음수 값들이 나온다
#추후 계산할땐 양수로 변형하여 계산해야된다.
np.mean(np.sqrt(-mse))

0.8108427192491199

In [None]:
svm.score(X_train, y_train) # 정확도
lr.score(X_train, y_train) # R2 0 ~ 1

# 두 score함수는 모델에 따라 다른 값들을 반환하지만 둘 다 모두 높을수록 좋다. 

In [72]:
# X_train, y_train
# X_test, y_test
# 스케일링이 되어 있는 상태...

In [94]:
### 결정트리
from sklearn.tree import DecisionTreeRegressor
regr = DecisionTreeRegressor(random_state=2022)
regr.fit(X_train,y_train)
y_pred = regr.predict(X_train)

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
rmse

2.464184106338567e-16

In [92]:
### RandomForest
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(n_estimators=30, max_depth=3)
regr.fit(X_train,y_train)
y_pred = regr.predict(X_train)

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
rmse

0.798957626570681

In [97]:
### Support Vector Machine
from sklearn.svm import SVR
regr = SVR(C=1.0, epsilon=0.2)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_train)

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
rmse

0.7366722533833338