<a href="https://colab.research.google.com/github/stevenkcolin/tensorflow/blob/master/tf_linear_regression_190126.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Keep rendered DataFrames compact: at most 10 rows, floats with one decimal.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

In [0]:
# Load the dataset (comma-separated CSV fetched over HTTP).
CALIFORNIA_HOUSING_TRAIN_URL = (
    "https://download.mlcc.google.cn/mledu-datasets/california_housing_train.csv"
)
california_housing_dataframe = pd.read_csv(CALIFORNIA_HOUSING_TRAIN_URL, sep=",")

In [7]:
# Randomize the row order so that any ordering present in the source file
# cannot produce pathological training results.
# A private, seeded RandomState makes the shuffle reproducible on
# "Restart & Run All" without touching NumPy's global random state.
shuffled_index = np.random.RandomState(42).permutation(
    california_housing_dataframe.index
)
california_housing_dataframe = california_housing_dataframe.reindex(shuffled_index)

# Feature scaling: express median_house_value in thousands.
# NOTE: this in-place division is not idempotent -- re-running this cell
# without reloading the data divides the column by 1000 again.
california_housing_dataframe["median_house_value"] /= 1000.0

# Display the data; the output is truncated by the display.max_rows option.
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
15665,-122.4,40.7,21.0,1774.0,370.0,875.0,354.0,1.7,61.5
16052,-122.5,37.8,52.0,2196.0,280.0,668.0,291.0,10.1,500.0
10894,-120.8,37.4,32.0,2892.0,521.0,1580.0,484.0,3.8,164.5
8849,-118.7,34.3,8.0,4983.0,754.0,2510.0,725.0,6.9,276.5
2437,-117.6,34.0,16.0,3443.0,562.0,2130.0,564.0,5.1,161.4
...,...,...,...,...,...,...,...,...,...
4373,-118.0,33.9,35.0,1337.0,234.0,692.0,235.0,5.1,213.7
14726,-122.2,37.8,52.0,1026.0,180.0,469.0,168.0,2.9,160.0
12359,-121.5,38.6,39.0,2438.0,483.0,1103.0,472.0,2.9,86.6
3907,-118.0,33.7,22.0,2785.0,441.0,1086.0,392.0,7.4,337.4


In [10]:
# Show summary statistics for the dataset.
california_housing_dataframe.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207.3
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,116.0
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,15.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180.4
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500.0


In [12]:
# Define the input feature: total_rooms.
# The double brackets select a one-column DataFrame (not a Series), which is
# what my_input_fn expects when it converts the features to a dict keyed by
# column name.
my_feature = california_housing_dataframe[["total_rooms"]]
my_feature

Unnamed: 0,total_rooms
15665,1774.0
16052,2196.0
10894,2892.0
8849,4983.0
2437,3443.0
...,...
4373,1337.0
14726,1026.0
12359,2438.0
3907,2785.0


In [0]:
# Configure a numeric feature column for total_rooms.
# The key "total_rooms" is the same name as the column selected into
# my_feature, which becomes the dict key produced by my_input_fn.
feature_columns = [tf.feature_column.numeric_column("total_rooms")]

## Step 2: 定义目标

In [17]:
# Define the label: the median_house_value column as a pandas Series.
# The column was divided by 1000.0 in an earlier cell, so values are in
# thousands.
targets = california_housing_dataframe["median_house_value"]
targets

15665    61.5
16052   500.0
10894   164.5
8849    276.5
2437    161.4
         ... 
4373    213.7
14726   160.0
12359    86.6
3907    337.4
7218    152.3
Name: median_house_value, Length: 17000, dtype: float64

## Step 3: 配置 LinearRegressor

In [0]:
# Optimizer hyperparameters, named so they are easy to find and tune.
LEARNING_RATE = 0.0000001   # step size for gradient descent
GRADIENT_CLIP_NORM = 5.0    # clip norm passed to clip_gradients_by_norm

# Train with plain gradient descent, wrapped so that gradients are clipped
# by norm before being applied.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE),
    GRADIENT_CLIP_NORM
)

# Build the linear regression estimator from the feature columns and the
# clipped optimizer.
linear_regressor = tf.estimator.LinearRegressor(
    optimizer=my_optimizer,
    feature_columns=feature_columns
)

## Step 4: 定义输入函数

In [0]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Builds an input pipeline that feeds (features, labels) batches to an estimator.

    This function does not train anything itself; it only turns pandas data
    into a tf.data pipeline and returns the tensors for the next batch.

    Args:
      features: pandas DataFrame of features (one column per feature)
      targets: pandas Series/DataFrame of targets, aligned row-wise with features
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """

    # Convert pandas data into a dict of np arrays, keyed by column name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset with one element per row.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle the data, if specified. This is done BEFORE batching and
    # repeating so that individual examples are shuffled (rather than whole,
    # already-formed batches) and examples from different epochs do not mix.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

## Step 5: 训练模型

In [0]:
def _training_input_fn():
    """Zero-argument input function: feeds my_feature/targets with my_input_fn's defaults."""
    return my_input_fn(my_feature, targets)


# Run 100 training steps. The result is bound to `_` so the cell shows no output.
_ = linear_regressor.train(input_fn=_training_input_fn, steps=100)

## Step 6: 评估模型
