In [None]:
# 코드 출처는 "핸즈온 머신러닝(Hands-On Machine Learning)"에서 따왔지만 일정 부분은 원활한 실행을 위해 수정 함.  

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
# Location of the California housing CSV inside the Kaggle input directory.
path = '/kaggle/input/california-housing-prices/housing.csv'
# Read the CSV into the DataFrame used by the rest of the notebook.
housing = pd.read_csv(path)

In [None]:
# Display the loaded DataFrame as the cell output.
housing

In [None]:
# Print a structural summary of the DataFrame (columns, dtypes, non-null counts).
housing.info()

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the DataFrame columns.
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Draw a 50-bin histogram per column on one 20x15 figure, then render it.
housing.hist(bins = 50, figsize = (20, 15))
plt.show()

In [None]:
# Split the data into a training set and a test set.
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training subset and a test subset.

    Args:
        data: Object supporting ``len()`` and ``.iloc`` (e.g. a DataFrame).
        test_ratio: Fraction of rows assigned to the test set; the test set
            holds ``int(len(data) * test_ratio)`` rows.
        random_state: Optional seed. When given, the shuffle uses a private
            ``np.random.RandomState(random_state)`` so the same split is
            produced on every call. When ``None`` (the default) the global
            NumPy random state is used, exactly as before.

    Returns:
        Tuple ``(train, test)`` of row subsets selected with ``data.iloc``.
    """
    n_rows = len(data)
    if random_state is None:
        # Default path: keep drawing from the global NumPy generator.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # Seeded path: isolated generator, global state is left untouched.
        shuffled_indices = np.random.RandomState(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]

    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows at random and report both subset sizes.
train_set, test_set = split_train_test(housing, test_ratio=0.2)
print(f"{len(train_set)} train + {len(test_set)} test")

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.float64(identifier)) & 0xffffffff < test_ratio * 2**32

In [None]:
def split_train_test_by_id(data, test_ratio, id_col):
    """Split ``data`` into (train, test) using a hash of the ``id_col`` values.

    Each value in ``data[id_col]`` is passed to ``test_set_check`` together
    with ``test_ratio``; rows flagged True form the test set and the remaining
    rows form the training set.
    """
    test_mask = data[id_col].apply(test_set_check, args=(test_ratio,))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# Turn the row index into a regular "index" column so it can serve as a row identifier.
housing_with_id = housing.reset_index()
# Display the frame that was just built (it carries the new "index" column).
housing_with_id

In [None]:
# Hash-based split with test_ratio 0.2, keyed on the "index" column.
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [None]:
# Build an identifier from the coordinates (longitude * 1000 + latitude) and
# split on that new "id" column rather than on the row index.
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

In [None]:
from sklearn.model_selection import train_test_split

# Same 20% hold-out using scikit-learn's helper; random_state pins the shuffle seed.
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)

In [None]:
# Bucket median_income into categories of width 1.5 (rounded up).
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep values below 5 and replace everything else with 5.0. The result is
# assigned back explicitly: calling where(..., inplace=True) on a column
# selection is chained in-place mutation and is not guaranteed to update the frame.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [None]:
# Histogram of the income categories.
housing["income_cat"].hist(bins = 50, figsize = (20, 15))
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split that keeps the income_cat proportions in both subsets.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row indices, so select with .iloc; .loc only
    # gives the same rows while the frame still has its default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Share of rows in each income category over the full dataset.
housing["income_cat"].value_counts() / len(housing)

In [None]:
# income_cat was only needed for stratification; remove it from both subsets.
# New frames are assigned instead of mutating the split results in place
# through a loop alias.
strat_train_set = strat_train_set.drop(columns = "income_cat")
strat_test_set = strat_test_set.drop(columns = "income_cat")

In [None]:
# Work on a copy so that exploration cannot damage the training set.
housing = strat_train_set.copy()

In [None]:
# Geographic scatter plot: marker size follows population / 100 and marker
# colour follows median_house_value (jet colormap, with a colorbar).
housing.plot(kind = "scatter", x = "longitude", y = "latitude", alpha = 0.4,
            s = housing["population"] / 100, label = "population", figsize = (10,7),
            c = "median_house_value", cmap = plt.get_cmap("jet"), colorbar = True, sharex = False)

# Show the legend entry created by label="population".
plt.legend()

In [None]:
# Standard correlation coefficient between every pair of numeric columns.
# Non-numeric columns (ocean_proximity holds strings) are excluded first,
# because DataFrame.corr() raises on them in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include = "number").corr()

In [None]:
# Correlation values lie in the range [-1, 1].
# A value close to 1 means a positive correlation; close to -1 means a negative correlation.
# median_house_value is related to median_income.

corr_matrix["median_house_value"].sort_values(ascending = False)

In [None]:
from pandas.plotting import scatter_matrix

# Columns to plot pairwise against each other.
attributes = ["median_house_value", "median_income", 
              "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8), alpha = 0.2)

In [None]:
# Scatter plot of median_income against median_house_value; low alpha makes dense areas visible.
housing.plot(kind = "scatter", x = "median_income", y = "median_house_value", alpha = 0.1)

In [None]:
# Attribute combinations: ratios derived from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
# Bedrooms divided by rooms is a per-room ratio, so name the column accordingly.
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

In [None]:
# Recompute the correlations now that the ratio columns exist. Only numeric
# columns are used, because DataFrame.corr() raises on the string column
# ocean_proximity in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include = "number").corr()
corr_matrix["median_house_value"].sort_values(ascending = False)

In [None]:
# Start again from the stratified training set: predictors without the target column...
housing = strat_train_set.drop("median_house_value", axis = 1)
# ...and the target column kept separately as an independent copy.
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
"""
housing.dropna(subset=["total_bedrooms"])   # opt 1
housing.drop("total_bedrooms", axis = 1)    # opt 2
median = housing["total_bedrooms"].median() # opt 3, step 1: compute the median
housing["total_bedrooms"].fillna(median, inplace = True) # opt 3, step 2: fill missing values with it
"""

In [None]:
from sklearn.impute import SimpleImputer

# Imputer configured with the "median" strategy.
imputer = SimpleImputer(strategy = "median")

In [None]:
# Numeric-only view of the predictors: drop the ocean_proximity column.
housing_num = housing.drop("ocean_proximity", axis = 1)

In [None]:
# Fit the imputer on the numeric training columns.
imputer.fit(housing_num)

In [None]:
# Statistics stored by the fitted imputer.
imputer.statistics_

In [None]:
# Column medians computed directly with pandas, for comparison with imputer.statistics_.
housing_num.median().values

In [None]:
# Apply the fitted imputer to the numeric columns.
X = imputer.transform(housing_num)

# Wrap the result in a DataFrame that reuses the original column names and row index.
housing_tr = pd.DataFrame(X, columns = housing_num.columns,
                         index = list(housing.index.values))

In [None]:
# Isolate the categorical column and preview its first 10 values.
housing_cat = housing["ocean_proximity"]
housing_cat.head(10)

In [None]:
# Map categories from strings to integer codes; factorize() returns the codes
# together with the distinct category values.
housing_cat_encoded, housing_categories = housing_cat.factorize()
# Preview the first 10 integer codes.
housing_cat_encoded[:10]

In [None]:
# Mapping recorded by the original author (code -> category). The order comes
# from factorize() on the training rows, so check it against the output below:
#   0 -> <1H OCEAN
#   1 -> NEAR OCEAN
#   2 -> INLAND
#   3 -> NEAR BAY
#   4 -> ISLAND
# Kept as the last expression so the categories themselves are the cell output.
housing_categories

In [None]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer category codes; reshape(-1, 1) turns the 1-D code
# array into the single-column 2-D input passed to the encoder.
encoder = OneHotEncoder(categories = 'auto')
housing_cat_onehot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_onehot

In [None]:
# Expand the encoded result into a regular array for inspection.
housing_cat_onehot.toarray()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CombinedAtrributesAdder.transform via X[:, ix].
# NOTE(review): presumably these match the column order of housing_num
# (total_rooms, total_bedrooms, population, households) - confirm.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAtrributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D feature array.

    The ratios are built from the columns at the module-level positions
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Args:
        add_bedrooms_per_room: When True (default) a bedrooms-per-room column
            is appended in addition to the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y = None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` with the ratio columns appended on the right.

        Appended column order: rooms_per_household, population_per_household,
        then bedrooms_per_room when ``add_bedrooms_per_room`` is True.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        # Population is divided by the number of households (not by rooms).
        population_per_household = X[:, population_ix] / X[:, household_ix]

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
            
# Try the transformer on the raw predictor values, without the bedrooms_per_room column.
attr_adder = CombinedAtrributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of columns from ``X`` and hand them on via ``.values``."""

    def __init__(self, attribute_names):
        # Column labels extracted by transform().
        self.attribute_names = attribute_names

    def fit(self, X, y = None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the configured columns of ``X``."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline that handles only the numeric attributes
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Steps: select the numeric columns -> median imputation -> add ratio features -> standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAtrributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Pipeline that handles only the categorical attributes
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(categories = 'auto')), # CategoricalEncoder is not supported; OrdinalEncoder or OneHotEncoder has to be used instead.
])

# Fit the numeric pipeline on the numeric columns and keep the transformed result.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Combine the numeric and categorical pipelines into a single transformer.
from sklearn.pipeline import FeatureUnion

# The outputs are listed numeric pipeline first, categorical pipeline second.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [None]:
# Fit the combined pipeline on the training predictors and transform them in one step.
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

In [None]:
# Train a model on the prepared data and evaluate it.
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training features and their labels.
reg = LinearRegression()
reg.fit(housing_prepared, housing_labels)

In [None]:
# Spot-check the linear model on the first five training rows.
get_data = housing.iloc[:5]
get_label = housing_labels.iloc[:5]
# transform (not fit_transform): reuse the statistics learned on the full training set.
get_data_prepared = full_pipeline.transform(get_data)

# Print the predictions next to the true labels so they can be compared.
print("predict: ", list(reg.predict(get_data_prepared)))
print("expect: ", list(get_label))

In [None]:
# Measure the RMSE of the regression model.
from sklearn.metrics import mean_squared_error

# Predictions are made on the same data the model was trained on,
# so this is the training error.
housing_pred = reg.predict(housing_prepared)
mse = mean_squared_error(housing_labels, housing_pred)
rmse = np.sqrt(mse)
rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor; random_state is pinned (42, as elsewhere in this
# notebook) so that the fitted tree and its scores are reproducible.
tree_reg = DecisionTreeRegressor(random_state = 42)
tree_reg.fit(housing_prepared, housing_labels)

# Training-set RMSE: predictions are made on the data the tree was fitted on.
housing_pred = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_pred)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# K-fold cross-validation
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree.
score = cross_val_score(tree_reg, housing_prepared, housing_labels,
                       scoring="neg_mean_squared_error", cv=10)
# The scoring is the negated MSE, so flip the sign before taking the square root.
tree_rmse_score = np.sqrt(-score)

In [None]:
def display_score(score):
    """Print ``score`` followed by its mean and its standard deviation.

    ``score`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    # Each value is computed lazily, right before its line is printed.
    report = (
        ("Score:", lambda: score),
        ("Mean:", lambda: score.mean()),
        ("Standard deviation:", lambda: score.std()),
    )
    for label, compute in report:
        print(label, compute())

# Report the cross-validated RMSE values of the decision tree.
display_score(tree_rmse_score)

In [None]:
# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Two sub-grids: 3 x 4 = 12 combinations without setting bootstrap,
# plus 2 x 3 = 6 combinations with bootstrap set to False.
param_grid = [
    {'n_estimators' : [3, 10, 30], 
     'max_features' : [2, 4, 6, 8],
    },
    
    {'bootstrap' : [False], 
     'n_estimators' : [3, 10], 
     'max_features' : [2, 3, 4],
    },
]

In [None]:
# Random forest base estimator; random_state is pinned (42, as elsewhere in
# this notebook) so that the grid-search results are reproducible.
forest_reg = RandomForestRegressor(n_estimators = 10, random_state = 42)

# 5-fold cross-validated search over param_grid, scored by negated MSE;
# training scores are recorded as well.
grid_search = GridSearchCV(forest_reg, param_grid, cv = 5,
                          scoring = 'neg_mean_squared_error',
                          return_train_score = True)

grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Best hyperparameter combination found by the grid search.
grid_search.best_params_

In [None]:
# The estimator that goes with the best parameter combination.
grid_search.best_estimator_

In [None]:
# Print the RMSE (square root of the negated mean test score) for every
# parameter combination that was tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Analyze errors: feature importances reported by the best estimator.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Names of the three ratio columns appended by the numeric pipeline's
# attribs_adder step, in the order they are appended.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# Read the category labels from the encoder that actually lives inside
# full_pipeline; the standalone `encoder` was fitted on integer codes, not labels.
cat_encoder = dict(full_pipeline.transformer_list)["cat_pipeline"].named_steps["cat_encoder"]
# categories_ holds one array per encoded column; ocean_proximity is the only one.
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# zip() silently truncates, so make a length mismatch fail loudly instead.
assert len(attributes) == len(feature_importances), "feature names do not match the model inputs"
sorted(zip(feature_importances, attributes), reverse = True)

In [None]:
# Use the best estimator from the grid search as the final model.
final_model = grid_search.best_estimator_

# Separate the held-out test set into predictors and labels.
X_test = strat_test_set.drop("median_house_value", axis = 1)
y_test = strat_test_set["median_house_value"].copy()

# transform only (no fitting): the pipeline must not learn from the test data.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

In [None]:
# RMSE of the final model on the held-out test set.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

final_rmse