In [None]:
import os #operating system 的缩写 方便操作创建文件夹等系统等级的操作
import tarfile #方便打开压缩包
from six.moves import urllib #方便从url获取数据
import pandas as pd

In [None]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"  # base URL of the data repository
HOUSING_PATH = os.path.join("datasets", "housing")  # local directory that holds the dataset
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"  # full URL of the archive

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Local directory that receives ``housing.tgz`` and the
            extracted files; it is created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: reject members that would
            # be written outside housing_path (available on newer Pythons).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [None]:
fetch_housing_data()  # run the download so the data files exist locally

In [None]:


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
housing = load_housing_data()

In [None]:
housing.info()

In [None]:
housing.ocean_proximity.value_counts()

In [None]:
housing.describe()

In [None]:
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize=(20,15))

从以上图中可知：房屋年龄的中位数以及房屋价值的中位数被设置了上限，所以最右侧的那根柱子会显得十分突兀。解决方案一般为两种：去掉达到上限值的所有数据，或者重新收集这些数据的真实值
另外，很多图都向右偏斜（skewed to the right，即长尾在右侧）。这种分布会让某些机器学习算法更难发现规律，所以之后通常会对这些特征做变换，使其更接近钟形（正态）分布，以提高训练的准确度

In [None]:
import numpy as np

In [None]:
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training and a test DataFrame.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of the rows assigned to the test set; the row
            count is truncated with ``int``.

    Returns:
        A ``(train, test)`` tuple of DataFrames selected by row position.
    """
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    # The first test_set_size shuffled positions form the test set,
    # the remainder the training set.
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Divide by 1.5 to limit the number of income categories
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Label those above 5 as 5. The result is assigned back to the column:
# calling where(..., inplace=True) on a column selection does not reliably
# update the DataFrame (it never does under pandas Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [None]:
housing.head()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
split_housing = StratifiedShuffleSplit(n_splits=1, test_size=.2,random_state=100)
# Stratify on income_cat; with n_splits=1 the loop body runs once.
# NOTE(review): split() yields positional row indices; .loc matches them only
# while housing keeps the default 0..n-1 index produced by read_csv - confirm
# if the index is ever changed before this cell.
for train_index, test_index in split_housing.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [None]:
housing["income_cat"].value_counts()/len(housing)

In [None]:
train_index

In [None]:
# income_cat was only needed for stratification, so remove it from both sets.
# Rebinding to the frame returned by drop() (instead of inplace=True) and
# ignoring an already-missing column keeps this cell safe to re-run.
strat_test_set = strat_test_set.drop(columns=["income_cat"], errors="ignore")
strat_train_set = strat_train_set.drop(columns=["income_cat"], errors="ignore")

In [None]:
housing = strat_train_set.copy()

In [None]:
housing

In [None]:
# Geographic scatter plot: marker size follows population, colour follows
# median_house_value. cmap selects the colour map; every available map is
# listed at https://matplotlib.org/gallery/color/colormap_reference.html
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.1,
    s=housing["population"] / 100,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("rainbow"),
    colorbar=True,
    figsize=(15, 10),
    sharex=False,
)
plt.legend()

In [None]:
# Correlate the numeric columns only: ocean_proximity holds text, and recent
# pandas versions raise instead of silently skipping non-numeric columns.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
corr_matrix

In [None]:
corr_matrix["median_house_value"].sort_values()

In [None]:
import pandas as pd 

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
attributes=["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(10,10))

In [None]:
housing.plot(kind="scatter",y="median_house_value",x='median_income',alpha=0.1)

In [None]:
housing['rooms_per_household']=housing['total_rooms']/housing['households']
housing['bedrooms_per_room']=housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household']=housing['population']/housing['households']


In [None]:
# Recompute the correlations now that the ratio columns exist. Only numeric
# columns are used: ocean_proximity holds text, and recent pandas versions
# raise instead of silently skipping non-numeric columns.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
corr_matrix["median_house_value"].sort_values()

In [None]:
# Start again from the stratified training set: predictors without the target column
housing = strat_train_set.drop("median_house_value",axis=1)
housing_labels = strat_train_set["median_house_value"].copy()  # keep the label column on its own for the later steps

# 数据清理

## missing value：
## 放弃相应的行
## 放弃整列
## 将missing value替换为某个值

### Scikit-learn 中有一个很好用的工具来处理missing values：SimpleImputer（下文简称imputer）

In [None]:
from sklearn.impute import SimpleImputer 

In [None]:
imputer = SimpleImputer(strategy="median")

由于中位数只能在numeric的数据上进行计算，所以我们需要一个所有列都是numeric的数据集合

In [None]:
housing_num = housing.drop('ocean_proximity',axis= 1)

使用fit将imputer适配到相应的数据集合

In [None]:
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
housing_num.median()

猜测：以上步骤是为了比较所有数据的中位数是否和imputer得到的中位数相同，这样可以避免新数据输入进来之后中位数有所改变导致的结果不一致

In [None]:
X = imputer.transform(housing_num)

In [None]:
X

In [None]:
# Wrap the imputed array in a DataFrame again. Passing the original index
# keeps the rows aligned with housing_num; without it the labels would be
# reset to 0..n-1.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
housing_tr

以上便是把得到的新数组的数据放到dataframe中的所有步骤
如果想一步将imputer的结果带入到数据集合中可以直接使用fit_transform()

In [None]:
housing_transform = imputer.fit_transform(housing_num)

In [None]:
housing_transform

# 对于文本类列，我们更愿意将它们转换成数字，这样更方便来训练数据

### LabelEncoder:

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder = LabelEncoder()

In [None]:
housing_cat = housing['ocean_proximity']

In [None]:
housing_cat

In [None]:
# housing_cat is a single column; the encoder turns its text categories into an array of integer codes
housing_cat_encoded = encoder.fit_transform(housing_cat)

In [None]:
housing_cat_encoded

####    如上，便把本为文字的一列转化为以0,1,2,3等数字为代表的类别列
#### encoder.classes_可以查看每个数字对应的类别分别是什么

In [None]:
print(encoder.classes_)

### 但是以上方法带来的问题是，当机器学习碰到类别1 和2 时，算法会认为1 和 2 比1 和4 更为相似。

### 为了避免这种情况，我们引入dummy variable: OneHotEncoder

需要注意的是，如果我们想用fit_transform来应用上面的OneHotEncoder，输入必须是二维数组；而housing_cat_encoded是一维数组，所以需要先用reshape(-1,1)把它变成只有一列的二维数组

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
encoder = OneHotEncoder()

In [None]:
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

In [None]:
housing_cat_1hot

In [None]:
housing_cat_encoded.reshape(-1,1)

### 值得注意的是，housing_cat_1hot输出的是一个稀疏矩阵，之所以这样使用，是因为当我们有上千个数据行时，每行的类别列都是有一定数量的0和一个1组成，占用内存存储0是很愚蠢的，稀疏矩阵的作用就是只存储非0的元素位置，我们还依然可以按照一个二维数组的方式使用它，如果我们执意想要将它转换为numpy数组，可调用toarray（）

In [None]:
housing_cat_1hot.toarray()

## 使用LabelBinarizer可以一次性完成上述转换

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
encoder = LabelBinarizer(sparse_output = True)  # sparse_output=True makes the result a sparse matrix; without it the result is a dense 2-D NumPy array

In [None]:
housing_cat_1hot = encoder.fit_transform(housing_cat)

In [None]:
housing_cat_1hot

# --------------↓这里有些问题，记得回头复习钻研↓------------------

## 自定义转换器
其作用便是简化数据清理流程，更快得到有助于机器学习的数据

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin #添加这两个function的目的是更好的调整参数

### TransformerMixin 是为了直接使用fit_transform； 
### BaseEstimator提供get_params()和set_params()两个方法，便于之后自动调整超参数，在本例中超参数为add_bedrooms_per_room

In [None]:
rooms_ix,bedrooms_ix,population_ix,household_ix = 3,4,5,6

In [None]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D feature array.

    Column positions are taken from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Hyperparameter: whether transform() also appends bedrooms_per_room.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Appended order: rooms per household, population per household and,
        when ``add_bedrooms_per_room`` is set, bedrooms per room.
        """
        households = X[:, household_ix]
        extra_columns = [
            X[:, rooms_ix] / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        return np.c_[(X, *extra_columns)]


In [None]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

一个替代方法：

In [None]:
from sklearn.preprocessing import FunctionTransformer

def add_extra_features(X, add_bedrooms_per_room=True):
    """Append ratio columns to the 2-D array ``X`` and return the result.

    Rooms per household and population per household are always appended;
    bedrooms per room is appended last when ``add_bedrooms_per_room`` is
    true. Column positions come from the module-level ``*_ix`` indices.
    """
    households = X[:, household_ix]
    pieces = [X, X[:, rooms_ix] / households, X[:, population_ix] / households]
    if add_bedrooms_per_room:
        pieces.append(X[:, bedrooms_ix] / X[:, rooms_ix])
    return np.c_[tuple(pieces)]

attr_adder = FunctionTransformer(add_extra_features, validate=False,
                                 kw_args={"add_bedrooms_per_room": False})
housing_extra_attribs = attr_adder.fit_transform(housing.values)

In [None]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

# --------------↑这里有些问题，记得回头复习钻研↑------------------

## 特征缩放
### 为了让数据之间的比例差异不要过大（过大通常会导致性能表现不佳），我们需要把某些特征列进行缩放，注：目标值通常不需要缩放

## 方法一：最小值-最大缩放
又称为归一化，将值缩放为0到1之间的值。
### 为此，sklearn提供了一个转换器：MinMaxScaler,如果你不想范围是0到1，可以通过调整超参数feature_range来更改

## 方法二： 标准化
标准化不会把值绑定到某个范围，对某些算法来说这可能是个问题（如：神经网络的输入范围通常是0到1）。
优点是标准化的方法受异常值的影响更小。原因是异常值影响的平均值会在此方法中消掉。
### 转化器为StandardScaler

## 重要的是，缩放器同其他转化器一样，仅用来拟合训练集，而不是完整的数据集（包括测试集）。

## 使用Pipeline，让数据转换的步骤以正确的顺序执行

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Preprocessing pipeline for the numeric columns; the steps run in the order listed
num_pipline= Pipeline([
    ('imputer',SimpleImputer(strategy="median")),  # fill missing values with the column median
    ('attribs_adder',CombinedAttributesAdder()),  # append the ratio features
    ('std_scaler',StandardScaler()),  # standardize the features (last step)
])

In [None]:
# fit() learns the parameters from the data (e.g. the medians) and returns the
# pipeline itself, not transformed data, so the result is not stored under the
# housing_num_tr name.
num_pipline.fit(housing_num)

In [None]:
housing_num_tr=num_pipline.transform(housing_num)  # apply the fitted transformations to obtain the cleaned data

以上步骤亦可以直接使用fit_transform转换

## 至此，我们得到了一个可以处理数字数据的pipeline，接下来就是在pipeline中加入LabelBinarizer:
    
    
### 为了方便将数字类数据和label类数据同时囊括到pipeline中去，我们可以利用FeatureUnion来实现这个结合

In [None]:
from sklearn.pipeline import FeatureUnion

In [None]:
housing_num

将所需要的两种数据的表头分别建立list

In [None]:

num_attribs = list(housing_num) 
cat_attribs = ["ocean_proximity"]

## 老版本的转换：

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_attribs)),#数据选择器
    ('imputer',SimpleImputer(strategy="median")),#转换器
    ('attribs_adder',CombinedAttributesAdder()),#转换器
    ('std_scaler',StandardScaler()),#估算器
])

cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_attribs)),
    ('Labelizer',OneHotEncoder(sparse=False)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline),
])

## 新版本的转换：

In [None]:
#Newer:
from sklearn.compose import ColumnTransformer
# Send each group of columns to its own transformer and combine the outputs
full_pipeline = ColumnTransformer([
    ('num',num_pipline,num_attribs),  # numeric columns -> numeric preprocessing pipeline
    ('cat',OneHotEncoder(),cat_attribs),  # categorical column(s) -> one-hot encoding
])

# Fit on the training predictors and transform them in one step
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
housing_prepared

In [None]:
housing_prepared.shape

# 训练模型！

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lin_reg = LinearRegression()

In [None]:
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
some_labels = housing_labels.iloc[:5]

In [None]:
some_data = housing.iloc[:5]

In [None]:
some_data_prepared = full_pipeline.transform(some_data)

In [None]:
lin_reg.predict(some_data_prepared)

### 使用RMSE来测量回归模型：

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
housing_predictions = lin_reg.predict(housing_prepared)

In [None]:
lin_mse = mean_squared_error(housing_labels, housing_predictions)

In [None]:
lin_rmse = np.sqrt(lin_mse)

In [None]:
lin_rmse

## 使用另外一个模型训练

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
tree_reg = DecisionTreeRegressor()

In [None]:
tree_reg.fit(housing_prepared,housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)

In [None]:
tree_mse = mean_squared_error(housing_labels,housing_predictions)

In [None]:
tree_rmse = np.sqrt(tree_mse)

In [None]:
tree_rmse

## 过度拟合！（overfitting) - 使用cross validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
scores = cross_val_score(tree_reg,housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv = 10)

In [None]:
rmse_scores = np.sqrt(-scores)

In [None]:
rmse_scores

In [None]:
def display_scores(scores):
    """Print the scores, then their mean and their standard deviation."""
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
display_scores(rmse_scores)

In [None]:
# 应用于线性回归模型：

In [None]:
lin_scores = cross_val_score(lin_reg,housing_prepared, housing_labels,
                            scoring = "neg_mean_squared_error", cv = 10)

In [None]:
lin_rmse_scores = np.sqrt(-lin_scores)

In [None]:
display_scores(lin_rmse_scores)

### 确定决策树的确是过度拟合了！

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
forest_reg = RandomForestRegressor()

In [None]:
forest_reg.fit(housing_prepared,housing_labels)

In [None]:
scores = cross_val_score(forest_reg,housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv = 10)

In [None]:
forest_rmse_scores = np.sqrt(-scores)

In [None]:
display_scores(forest_rmse_scores)

# 微调模型！

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Two grids: 3 x 4 combinations first, then 2 x 3 combinations with bootstrap pinned
param_grid = [
    {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
    # NOTE(review): bootstrap=True is presumably already the estimator's default, so this
    # grid partly repeats the first one - confirm whether bootstrap=False was intended
    {'bootstrap':[True],'n_estimators':[3,10],'max_features':[2,3,4]},
    
]

In [None]:
forest_reg = RandomForestRegressor()

In [None]:
grid_search = GridSearchCV(forest_reg,param_grid,cv = 5,
                          scoring='neg_mean_squared_error', return_train_score=True)

In [None]:
grid_search.fit(housing_prepared,housing_labels)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_

In [None]:
for mean_score, params in zip(cvres["mean_test_score"],cvres["params"]):
    print(np.sqrt(-mean_score),params)

In [None]:
pd.DataFrame(grid_search.cv_results_)

## 随机搜索！

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from scipy.stats import randint

In [None]:
# NOTE(review): scipy's randint presumably excludes `high`, so max_features is drawn from 1..7 - confirm
param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    } # Same idea as the grid search: define the parameter ranges to search over



In [None]:
forest_reg = RandomForestRegressor()

In [None]:
rnd_search = RandomizedSearchCV(forest_reg,param_distributions=param_distribs,
                               n_iter= 10, cv = 5, scoring="neg_mean_squared_error")  # configured much like the grid search

In [None]:
rnd_search.fit(housing_prepared,housing_labels)

In [None]:
# Print the score of every sampled parameter set the same way as before, for easy comparison.
# The scoring is negative MSE, so negate before taking the square root.
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

# 每个属性的相对重要程度

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [None]:
feature_importances

In [None]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

# 通过测试集评估系统

In [None]:
final_model = grid_search.best_estimator_

In [None]:
X_test = strat_test_set.drop("median_house_value", axis = 1) # feature columns of the test set

In [None]:
y_test = strat_test_set["median_house_value"].copy() # label column of the test set

In [None]:
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
final_prediction = final_model.predict(X_test_prepared)

In [None]:
final_mse = mean_squared_error(y_test, final_prediction)

In [None]:
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse