# 집 값 예측(linear regression)
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my
from sklearn import datasets

### 데이터 준비

In [5]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
type(housing)

sklearn.utils._bunch.Bunch

In [6]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [7]:
df = pd.DataFrame(housing.data
                  ,columns=housing.feature_names)

In [9]:
df['target'] = housing.target
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [12]:
df.shape

(20640, 9)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [17]:
df.isna().sum(axis=0)

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [19]:
df.duplicated().sum()

0

In [22]:
X=df.iloc[:,:3]
y=df['target']

In [23]:
X.shape,y.shape

((20640, 3), (20640,))

In [25]:
# 테스트 데이터 분리
# 스케일링 o
# 학습
# RMSE

In [29]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,
                                                test_size=0.2,
                                                random_state=2022)
X_train.shape,X_test.shape,y_train.shape

((16512, 3), (4128, 3), (16512,))

In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train[:5]

In [32]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train,y_train)

In [43]:
X_test = scaler.transform(X_test)
y_pred = lr.predict(X_test)

In [44]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test,y_pred)

In [45]:
rmse = np.sqrt(mse)
rmse

0.805543218264495

In [76]:
from sklearn.model_selection import cross_val_score
mse = cross_val_score(lr,X_test,y_test,
                scoring='neg_mean_squared_error',
               cv=3)
mse

array([-0.63879692, -0.6664322 , -0.66737045])

In [77]:
np.sqrt(-mse)

array([0.79924772, 0.81635299, 0.81692745])

In [78]:
np.mean(np.sqrt(-mse))

0.8108427192491199

In [95]:
### 결정 트리
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

dr = DecisionTreeRegressor(max_depth=20,random_state=2022)
dr.fit(X_train,y_train)
mse = cross_val_score(dr,X_test,y_test,
                scoring='neg_mean_squared_error',
               cv=3)


In [96]:
dr.score(X_train,y_train)

0.9699453067428655

In [97]:
mse

array([-1.00075419, -1.14469589, -1.03890772])

In [100]:
dr_rmse = np.mean(np.sqrt(-mse))
dr_rmse

1.0298499562530405

In [101]:
### RandomForest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
mse = cross_val_score(rf,X_test,y_test,
                      scoring ='neg_mean_squared_error',
                      cv=3)


In [102]:
mse

array([-0.5821102 , -0.67166476, -0.61068266])

In [105]:
rf_mse = np.mean(np.sqrt(-mse))
rf_mse

0.7879916324893683

In [106]:
from sklearn.svm import SVR

svm = SVR()
svm.fit(X_train,y_train)
mse = cross_val_score(svm,X_test,y_test,
                      scoring = 'neg_mean_squared_error',
                      cv=3)

In [107]:
mse

array([-0.5859679 , -0.63511098, -0.58709683])

In [108]:
svm_rmse = np.mean(np.sqrt(-mse))

In [109]:
svm_rmse

0.7762154482015547

In [110]:
# regulaizer l1 l2
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [118]:
l1 = Lasso()
l1.fit(X_train,y_train)

In [119]:
good!

0.2852942728825987