In [None]:
#Petfinder是北美地區最大的線上寵物領養網站，目前收錄了來自近14,000個動物收容所、超過315,000隻寵物的資料。

In [None]:
#Type：種類（狗、貓）
#Age：年齡
#Breed1：品種
#Gender：性別
#Color1：主色
#Color2：次色
#MaturitySize：寵物成熟的大小
#FurLength：毛髮長度
#Vaccinated：是否已接種疫苗（Not Sure 表示不確定）
#Sterilized：是否已經絕育（Not Sure 表示不確定）
#Health：寵物健康狀況
#Fee：收養費用，0表示免費
#Description：寵物的基本資料
#PhotoAmt：該寵物上傳的照片總數
#AdoptionSpeed：領養速度，0表示當天就被領養、1表示第一週內（1～7天）被領養、2表示第一個月內（8～30天）被領養、3表示第二或第三個月內（31～90天）被領養、4表示刊登100天後仍未被領養

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from sklearn.model_selection import train_test_split

In [None]:
import pathlib

# Fetch the archive over HTTPS: it is downloaded and then unpacked locally,
# so it should not travel over a cleartext connection.
dataset_url = 'https://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
# Expected location of the CSV after the archive has been extracted
# (get_file is given cache_dir='.', and the CSV is read from ./datasets/...).
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

# Download the zip (skipped if already cached) and extract it next to the download.
tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')
# Load the extracted CSV into a DataFrame and preview the first 10 rows.
dataset = pd.read_csv(csv_file)
dataset.head(10)

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2
5,Cat,3,Domestic Short Hair,Female,Cream,Gray,Medium,Short,No,No,Healthy,0,This is a stray kitten that came to my house. ...,2,2
6,Cat,12,Domestic Long Hair,Male,Black,No Color,Medium,Long,No,Not Sure,Healthy,300,anyone within the area of ipoh or taiping who ...,3,1
7,Cat,2,Domestic Medium Hair,Female,Gray,No Color,Medium,Medium,No,No,Healthy,0,"healthy and active, feisty kitten found in nei...",6,1
8,Cat,12,Domestic Medium Hair,Female,Black,White,Medium,Medium,Not Sure,Not Sure,Healthy,0,"Very manja and gentle stray cat found, we woul...",2,4
9,Dog,2,Mixed Breed,Male,Black,Brown,Medium,Short,No,No,Healthy,0,"For serious adopter, please do sms or call for...",7,1


#建立預測目標

In [None]:
# Build the binary prediction target.
# AdoptionSpeed == 4 (not adopted) maps to target 0; every other value maps to 1 (adopted).
# The guard keeps this cell safe to re-run: after the first run 'AdoptionSpeed'
# has already been dropped and 'target' already exists, so there is nothing to recompute.
if 'AdoptionSpeed' in dataset.columns:
  dataset['target'] = np.where(dataset['AdoptionSpeed']==4, 0, 1)

# Drop the columns that are not used as model inputs.
# errors='ignore' prevents a KeyError when the cell is executed a second time.
dataset = dataset.drop(columns=['AdoptionSpeed', 'Description'], errors='ignore')

In [None]:
# Build the train / validation / test sets.
# 20% of the data is held out for testing, then 20% of the remainder for validation.
# A fixed random_state makes the split (and every metric derived from it)
# reproducible across kernel restarts.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

7383 train examples
1846 validation examples
2308 test examples


In [None]:
# Convert a pandas DataFrame into a tf.data pipeline.
def df_to_dataset(dataset, shuffle=True, batch_size=32):
  """Turn a DataFrame into a batched tf.data.Dataset of (features, label) pairs.

  Args:
    dataset: DataFrame that contains a 'target' column; it is copied, so the
      caller's frame is not modified.
    shuffle: when True, shuffle with a buffer as large as the frame.
    batch_size: number of rows per batch.

  Returns:
    A tf.data.Dataset yielding (dict of column name -> tensor, label tensor).
  """
  frame = dataset.copy()
  # Separate the label column from the feature columns.
  targets = frame.pop('target')
  features = dict(frame)
  ds = tf.data.Dataset.from_tensor_slices((features, targets))
  if not shuffle:
    return ds.batch(batch_size)
  return ds.shuffle(buffer_size=len(frame)).batch(batch_size)

In [None]:
# A very small batch size, used only so the printed batches are easy to read.
batch_size = 5
# Only the training data is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [None]:
# Peek at one batch: the feature names, the 'Age' tensor and the label tensor.
for feature_batch, label_batch in train_ds.take(1):
  print('顯示每一個特徵:', [*feature_batch.keys()])
  print('年齡的每一批次資料:', feature_batch['Age'])
  print('目標值每一批次資料:', label_batch)

顯示每一個特徵: ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
年齡的每一批次資料: tf.Tensor([4 2 3 6 2], shape=(5,), dtype=int64)
目標值每一批次資料: tf.Tensor([1 1 1 1 1], shape=(5,), dtype=int64)


In [None]:
# Helper for inspecting what a feature column produces.
# example_batch is the feature dict (element 0 of the (features, labels) pair)
# of one training batch -- a whole batch, not a single record.
example_batch = next(iter(train_ds))[0]
def demo(feature_column):
  """Print the dense array that `feature_column` produces for `example_batch`.

  Note: inside this function the parameter name shadows the imported
  `feature_column` module.
  """
  dense_output = tf.keras.layers.DenseFeatures(feature_column)(example_batch)
  print(dense_output.numpy())

In [None]:
# Display the raw feature dict of the example batch (column name -> tensor).
example_batch

{'Age': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([6, 2, 3, 5, 6])>,
 'Breed1': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'Mixed Breed', b'Domestic Short Hair', b'Tuxedo', b'Dalmatian',
        b'Domestic Medium Hair'], dtype=object)>,
 'Color1': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Brown', b'Gray', b'Black', b'Black', b'Golden'], dtype=object)>,
 'Color2': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'No Color', b'White', b'White', b'Gray', b'White'], dtype=object)>,
 'Fee': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 0,  0,  0, 50,  0])>,
 'FurLength': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Short', b'Short', b'Long', b'Short', b'Medium'], dtype=object)>,
 'Gender': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Male', b'Male', b'Female', b'Female', b'Male'], dtype=object)>,
 'Health': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'Healthy', b'Healthy', b'Healthy', b'Healthy', b'Healthy'],
       dtype=object

In [None]:
#透過example_batch來展示TF裝載不同格式資料的方式

In [None]:
# Numeric column: the number of photos uploaded for each pet in the example batch.
photo_count = feature_column.numeric_column(key='PhotoAmt')
demo(photo_count)

[[2.]
 [7.]
 [6.]
 [1.]
 [6.]]


In [None]:
# Bucketize age with boundaries 1, 3 and 5, giving four one-hot buckets.
age = feature_column.numeric_column(key='Age')
age_buckets = feature_column.bucketized_column(source_column=age,
                                               boundaries=[1, 3, 5])
demo(age_buckets)
# Ages in the example batch when the output was recorded: 6, 2, 3, 5, 6

[[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]


In [None]:
# One-hot encode the pet type with an indicator column over a fixed vocabulary.
animal_type = feature_column.categorical_column_with_vocabulary_list(
      key='Type', vocabulary_list=['Cat', 'Dog'])
animal_type_one_hot = feature_column.indicator_column(categorical_column=animal_type)
demo(animal_type_one_hot)
# Presumably the 'Type' values of the example batch when the output was recorded:
# b'Dog', b'Cat', b'Cat', b'Dog', b'Cat'

[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]]


In [None]:
# Embed the breed: when a feature has many distinct values, an embedding column
# is the recommended alternative to a wide one-hot vector.
breed1 = feature_column.categorical_column_with_vocabulary_list(
      key='Breed1', vocabulary_list=dataset['Breed1'].unique())
# Each breed is represented by a vector of 8 values.
breed1_embedding = feature_column.embedding_column(categorical_column=breed1,
                                                   dimension=8)
demo(breed1_embedding)

[[ 2.6368424e-01  1.9794904e-01  1.0925880e-01 -8.8757277e-02
  -1.5210855e-01  6.1805207e-01 -6.5286440e-01 -5.8875346e-01]
 [ 5.6207187e-02  4.6664558e-04  3.0018637e-01  5.9047192e-01
   2.1837243e-01 -5.3819394e-01  1.1317693e-01  1.8330643e-01]
 [ 4.2359334e-01  3.9783618e-01  8.7481275e-02 -5.3136524e-02
  -1.5287885e-02 -7.4953124e-02  2.1722165e-01 -2.9806530e-01]
 [-1.9744405e-01  4.4208340e-02 -5.6195849e-01 -2.9894170e-01
  -1.8420246e-01  2.5974263e-02  1.8686196e-01 -3.1319866e-01]
 [ 1.3513073e-01 -9.6416257e-02  7.3689461e-02  3.0079672e-01
   3.5942587e-01 -6.5286636e-01  2.0029575e-01  2.7677476e-01]]


In [None]:
# Hash the breed into 10 buckets instead of enumerating a vocabulary.
breed1_hashed = feature_column.categorical_column_with_hash_bucket(
      key='Breed1', hash_bucket_size=10)
demo(feature_column.indicator_column(categorical_column=breed1_hashed))

[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


In [None]:
# Cross several features (age bucket x pet type) into a single feature,
# hashed into 10 buckets.
crossed_feature = feature_column.crossed_column(
    keys=[age_buckets, animal_type], hash_bucket_size=10)
demo(feature_column.indicator_column(categorical_column=crossed_feature))

[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [None]:
# Start the training feature list from scratch (discarding the demo columns above)
# and fill it with the numeric columns used for training.
feature_columns = [
    feature_column.numeric_column(header)
    for header in ['PhotoAmt', 'Fee', 'Age']
]

In [None]:
# Display the feature columns collected so far.
feature_columns

[NumericColumn(key='PhotoAmt', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Fee', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [None]:
# Bucketize age with boundaries 1, 2, 3, 4 and 5 and add it to the training features.
age = feature_column.numeric_column(key='Age')
age_buckets = feature_column.bucketized_column(
    source_column=age, boundaries=[1, 2, 3, 4, 5])
feature_columns.append(age_buckets)

In [None]:
# One-hot encode the listed categorical columns and add them to feature_columns.
indicator_column_names = [
    'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize',
    'FurLength', 'Vaccinated', 'Sterilized', 'Health',
]
for col_name in indicator_column_names:
  # The vocabulary is every distinct value the column takes in the dataset.
  categorical_column = feature_column.categorical_column_with_vocabulary_list(
      key=col_name, vocabulary_list=dataset[col_name].unique())
  indicator_column = feature_column.indicator_column(
      categorical_column=categorical_column)
  feature_columns.append(indicator_column)

In [None]:
# NOTE(review): this repeats the `animal_type` / one-hot demo defined earlier in the
# notebook with identical arguments. `animal_type` is used by the crossed feature
# built further down; the one-hot column is only demoed here, not appended
# to feature_columns.
animal_type = feature_column.categorical_column_with_vocabulary_list(
      'Type', ['Cat', 'Dog'])

animal_type_one_hot = feature_column.indicator_column(animal_type)
demo(animal_type_one_hot)

[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]]


In [None]:
# Represent the breed with an 8-dimensional embedding column, add it to the
# training features and show what it produces for the example batch.
breed1 = feature_column.categorical_column_with_vocabulary_list(
      key='Breed1', vocabulary_list=dataset['Breed1'].unique())
breed1_embedding = feature_column.embedding_column(categorical_column=breed1,
                                                   dimension=8)
feature_columns.append(breed1_embedding)
demo(breed1_embedding)

[[-0.15580982  0.3722761   0.44045505  0.47772503  0.41481584  0.12851077
   0.28865653  0.38193673]
 [ 0.45517522  0.5250243   0.36296305  0.07183133  0.4992528   0.4160294
  -0.258904   -0.20568205]
 [ 0.58727497 -0.61253846 -0.0403722   0.23990083 -0.19180748  0.38073212
  -0.07963162 -0.6687922 ]
 [ 0.08968242 -0.5383558  -0.34849492 -0.11525047 -0.28622463 -0.23836617
   0.47824782  0.0968101 ]
 [ 0.07311627 -0.6666989  -0.06445301 -0.26953122 -0.23101848 -0.02003135
  -0.10697258 -0.16698767]]


In [None]:
# Cross age bucket x pet type into one feature hashed into 100 buckets,
# then add its one-hot form to the training features.
age_type_feature = feature_column.crossed_column(
    keys=[age_buckets, animal_type], hash_bucket_size=100)
feature_columns.append(
    feature_column.indicator_column(categorical_column=age_type_feature))

In [None]:
# Build the input layer that applies all collected feature columns.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
# Rebuild the datasets with the batch size used for training
# (the earlier ones used a tiny batch for demonstration).
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [None]:
# Build, train and evaluate the model.
# The last Dense(1) layer has no activation, so the model outputs raw logits;
# the loss is configured accordingly with from_logits=True.
model = tf.keras.Sequential([
  feature_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(.1),
  tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # The string 'accuracy' would threshold predictions at 0.5, which is
              # only correct for probabilities. For logits the decision boundary is
              # 0.0 (sigmoid(0) == 0.5), so the threshold is set explicitly.
              # name='accuracy' keeps the metric name shown in the training logs.
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)
# evaluate() returns [loss, accuracy] in the order given to compile().
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Epoch 1/10
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy 0.7409012317657471


In [None]:
# Predict for the first test batch.
# The model was trained on `target` (1 = adopted, 0 = not adopted) and outputs raw
# logits, so the values are NOT an adoption speed: a larger value means the pet is
# more likely to be adopted. Apply a sigmoid to turn the logits into probabilities.
logits = model.predict(next(iter(test_ds))[0])
tf.sigmoid(logits).numpy()

array([[ 1.0752643 ],
       [ 1.2660422 ],
       [-0.00866931],
       [ 1.4008915 ],
       [-0.5113961 ],
       [ 3.9423954 ],
       [ 3.8328466 ],
       [ 2.7835858 ],
       [ 0.66131413],
       [ 0.68969065],
       [ 0.89744973],
       [ 2.9224386 ],
       [ 0.3232326 ],
       [-0.08829938],
       [-0.1800879 ],
       [ 2.5596764 ],
       [ 1.5758972 ],
       [ 0.45695907],
       [ 0.8428226 ],
       [-0.03696763],
       [ 1.3700149 ],
       [ 2.0845215 ],
       [ 2.0333266 ],
       [ 3.903319  ],
       [ 2.9249804 ],
       [ 2.3734372 ],
       [ 1.4671569 ],
       [ 3.4405103 ],
       [ 1.0151669 ],
       [ 2.8925798 ],
       [ 0.08667261],
       [ 2.4066722 ]], dtype=float32)