In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_files = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Regression

### 异构数据处理


许多数据集包含不同类型的特征，比如文本、浮点数和日期，每种类型的特征都需要单独的预处理或特征提取步骤。

In [None]:

# If the data also has a free-text column, vectorize it with CountVectorizer(), e.g.:
# column_trans=make_column_transformer((OneHotEncoder(),['city']),
#       (CountVectorizer(),'title'),
#        remainder=MinMaxScaler())

## 数据划分和归一化，异构数据处理

In [None]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import train_test_split

# Read the fuel dataset from the Kaggle input directory using pandas'
# pure-Python CSV parser.
fuel = pd.read_csv(
    '../input/dl-course-data/fuel.csv',
    engine='python',
)

# A "grouped" split keeps every row that shares a group label in the same
# partition (all train or all test). This helps prevent signal leakage when
# rows of one group are strongly related to each other.
def group_split(X, y, group, train_size=0.75, random_state=None):
    """Split X and y into train/test parts without splitting any group.

    Parameters
    ----------
    X : features, indexed positionally with ``.iloc``.
    y : targets aligned with ``X``, indexed positionally with ``.iloc``.
    group : group label for each row of ``X``; rows with the same label end
        up on the same side of the split.
    train_size : value forwarded to ``GroupShuffleSplit(train_size=...)``.
    random_state : optional seed forwarded to ``GroupShuffleSplit`` so the
        split can be reproduced. Defaults to ``None`` (unseeded).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Imported locally because the notebook's import cell does not bring
    # GroupShuffleSplit into scope; without this the call raises NameError.
    from sklearn.model_selection import GroupShuffleSplit

    splitter = GroupShuffleSplit(train_size=train_size, random_state=random_state)
    # Only the first of the generated splits is used.
    train_idx, test_idx = next(splitter.split(X, y, groups=group))
    return (X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx])


# Separate the features from the regression target column 'FE'.
train_data = fuel.copy()
train_target = train_data.pop('FE')
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_target,
    test_size=0.3,
    random_state=5,  # fixed seed so the split is the same on every run
)

# Standardize numeric columns and one-hot encode object (string) columns.
# StandardScaler rescales by mean and standard deviation; MinMaxScaler could
# be used instead.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising when the test split is transformed.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# switch the keyword if this runs on a release that no longer accepts `sparse`.
preprocessor = make_column_transformer(
    (StandardScaler(),
     make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(sparse=False, handle_unknown='ignore'),
     make_column_selector(dtype_include=object)),
)

# Fit the preprocessing on the training split only, then apply the already
# fitted transform to the test split. Refitting on the test split would leak
# test statistics and could produce a different column layout than training.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Log-transform the target instead of standardizing it.
y_train = np.log(y_train)
y_test = np.log(y_test)

# Uncomment to see original data
# fuel.head()
# Uncomment to see processed features
# pd.DataFrame(X_train[:10,:]).head()

# Input dimension of the network: number of columns after preprocessing.
input_shape = [X_train.shape[1]]
print(input_shape)

In [None]:
import tensorflow as tf

# Detect the TPU. TPUClusterResolver() raises ValueError when no TPU is
# attached to the session; in that case fall back to TensorFlow's default
# strategy so the rest of the notebook still runs on CPU/GPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu = None

if tpu is not None:
    # Connect to the TPU cluster and initialise it before building a strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # instantiate a distribution strategy
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Same name is kept so later cells can still use `tpu_strategy.scope()`.
    tpu_strategy = tf.distribute.get_strategy()


### 建立网络

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Build and compile the regression network inside the strategy scope.
with tpu_strategy.scope():
    model = keras.Sequential()
    # First hidden layer also declares the input dimension.
    model.add(layers.Dense(128, activation='relu', input_shape=input_shape))
    # Remaining hidden layers.
    for width in (128, 64):
        model.add(layers.Dense(width, activation='relu'))
    # Single linear output unit for the regression target.
    model.add(layers.Dense(1))

    # Adam optimizer with mean absolute error as the loss.
    model.compile(loss='mae', optimizer='adam')
    


### Dropout and Batch Normalization
The first of these is the "dropout layer", which can help correct overfitting. When adding dropout, you may need to increase the number of units in your Dense layers.

The next special layer we'll look at performs "batch normalization" (or "batchnorm"), which can help correct training that is slow or unstable.

In [None]:
# from tensorflow import keras
# from tensorflow.keras import layers

# model = keras.Sequential([
#     layers.Dense(1024, activation='relu', input_shape=input_shape),
#     layers.Dropout(0.3),
#     layers.BatchNormalization(),
#     layers.Dense(1024, activation='relu'),
#     layers.Dropout(0.3),
#     layers.BatchNormalization(),
#     layers.Dense(1024, activation='relu'),
#     layers.Dropout(0.3),
#     layers.BatchNormalization(),
#     layers.Dense(1),
# ])

### 指定optimizer和loss function

In [None]:
#     model.compile(
#         optimizer='adam',
#         loss='mae',
#     )

### 防过/欠拟合：设置 early stopping
这些参数表示：“如果在过去的 10 个 epoch 内，验证损失的改善幅度都不足 0.002，就停止训练，并恢复到验证损失最佳时的模型权重。”

In [None]:
from tensorflow.keras import callbacks

# Early-stopping callback: an epoch only counts as an improvement when the
# monitored value changes by at least 0.002; training stops after 10 epochs
# without such an improvement and the best weights seen are restored.
early_stopping = callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    min_delta=0.002,
)

### 开始训练

In [None]:

# Train the regression model. Validation uses the held-out test split:
# validating on the training data itself would make val_loss track the
# training loss, so early stopping and the reported minimum validation loss
# would say nothing about generalization.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=50,
    epochs=200,
    callbacks=[early_stopping],
    # verbose=0,  # uncomment to turn off the training log
)

### 画损失函数趋势图

In [None]:
import pandas as pd

# Turn the per-epoch training history into a DataFrame (one column per metric).
history_df = pd.DataFrame(history.history)

# Draw training and validation loss curves with the pandas plotting API.
history_df[['loss', 'val_loss']].plot()

# Report the lowest validation loss reached during training.
lowest_val_loss = history_df['val_loss'].min()
print("Minimum Validation Loss: {:0.4f}".format(lowest_val_loss));

# Binary Classification

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Read the hotel dataset from the Kaggle input directory.
hotel = pd.read_csv('../input/dl-course-data/hotel.csv')

# Work on a copy and pull the binary target column out of the features.
X = hotel.copy()
y = X.pop('is_canceled')

# Replace month names with their calendar number (January -> 1 ... December -> 12).
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_to_number = {name: number for number, name in enumerate(month_names, start=1)}
X['arrival_date_month'] = X['arrival_date_month'].map(month_to_number)

# Columns routed through the numeric pipeline.
features_num = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
]

# Columns routed through the categorical pipeline.
features_cat = [
    "hotel",
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]

# Numeric pipeline: fill the few missing values with a constant, then standardize.
numeric_imputer = SimpleImputer(strategy="constant")
transformer_num = make_pipeline(numeric_imputer, StandardScaler())

# Categorical pipeline: fill missing values with the literal "NA", then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_imputer = SimpleImputer(strategy="constant", fill_value="NA")
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
transformer_cat = make_pipeline(categorical_imputer, categorical_encoder)

# Apply each pipeline to its own list of columns.
preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

# stratify - make sure classes are evenly represented across splits.
# random_state pins the shuffle so the split is the same on every run,
# matching the seeded split used in the regression section.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    stratify=y,
    train_size=0.75,
    random_state=5,
)

# Fit the preprocessing on the training split only and reuse it for validation.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# Input dimension of the network: number of columns after preprocessing.
input_shape = [X_train.shape[1]]

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Binary classifier: a batch-normalized input followed by two identical
# Dense(256, relu) -> BatchNormalization -> Dropout(0.3) stages and a single
# sigmoid output unit.
model = keras.Sequential()
model.add(layers.BatchNormalization(input_shape=input_shape))
for _ in range(2):
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and binary
# accuracy reported as an extra metric.
compile_options = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)
model.compile(**compile_options)

In [None]:
# Stop once 5 consecutive epochs fail to improve the monitored value by at
# least 0.001, and restore the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    min_delta=0.001,
    patience=5,
)

# Train the classifier, evaluating on the validation split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=200,
    batch_size=512,
    callbacks=[early_stopping],
)

# Per-epoch metrics as a DataFrame, one column per recorded metric.
history_df = pd.DataFrame(history.history)

# Training vs. validation curves: first the loss, then the accuracy.
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")