In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
import math

In [2]:
visibility = pd.read_csv('donetsk_17_21_visibility.csv', sep=';', header=0, parse_dates=True, squeeze=True)

In [3]:
visibility['wind_direction'] = visibility['wind_direction']*10
wv = visibility.pop('wind_speed')

# Convert to radians.
wd_rad = visibility.pop('wind_direction')*np.pi / 180

# Calculate the wind x and y components.
visibility['w_x'] = wv*np.cos(wd_rad)
visibility['w_y'] = wv*np.sin(wd_rad)

In [4]:
date_time = pd.to_datetime(visibility.pop('started_at'), format='%Y-%m-%d %H:%M:%S')
timestamp_s = date_time.map(pd.Timestamp.timestamp)
day = 24*60*60
year = (365.2425)*day

visibility['day_sin'] = np.sin(timestamp_s * (2 * np.pi / day))
visibility['day_cos'] = np.cos(timestamp_s * (2 * np.pi / day))
visibility['year_sin'] = np.sin(timestamp_s * (2 * np.pi / year))
visibility['year_cos'] = np.cos(timestamp_s * (2 * np.pi / year))
visibility.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cloud_height,11360.0,6.929049,2.159598,2.0,5.0,6.0,9.0,10.0
cloud_amount,11360.0,4.674912,3.246734,0.0,0.0,6.0,8.0,9.0
temperature,11360.0,10.70515,10.880943,-20.8,1.2,10.3,19.7,37.8
temperature_dew,11360.0,4.112782,7.817085,-22.0,-1.5,4.2,10.4,21.5
pressure,11360.0,992.6527,7.295076,964.2,987.9,992.0,997.5,1015.3
pressure_tendency,11360.0,4.805722,2.476551,1.0,2.0,4.0,7.0,8.0
pressure_tendency_value,11360.0,0.7352113,0.648641,0.0,0.3,0.6,1.0,7.1
visibility,11360.0,0.04269366,0.202174,0.0,0.0,0.0,0.0,1.0
w_x,11360.0,-0.01415475,2.13474,-10.0,-1.285575,0.0,1.285575,10.0
w_y,11360.0,0.480422,2.820315,-10.0,-0.68404,0.0,1.879385,16.0


In [7]:
train_splits = []
test_splits = []

for _, group_data in visibility.groupby("visibility"):
    random_selection = np.random.rand(len(group_data.index)) <= 0.8
    train_splits.append(group_data[random_selection])
    test_splits.append(group_data[~random_selection])

train_data = pd.concat(train_splits).sample(frac=1).reset_index(drop=True)
test_data = pd.concat(test_splits).sample(frac=1).reset_index(drop=True)

print("Train split size: ",len(train_data.index))
print("Test split size: ",len(test_data.index))

Train split size:  9090
Test split size:  2270


In [8]:
train_data_file = "train_data.csv"
test_data_file = "test_data.csv"

train_data.to_csv(train_data_file, index=False)
test_data.to_csv(test_data_file, index=False)

In [75]:
CSV_HEADER = [
    "cloud_height",
    "cloud_amount",
    "temperature",
    "temperature_dew",
    "pressure",
    "pressure_tendency",
    "pressure_tendency_value",
    "visibility",
    "w_x",
    "w_y",
    "day_sin",
    "day_cos",
    "year_sin",
    "year_cos",
    
]
TARGET_FEATURE_NAME = "visibility"

# TARGET_FEATURE_LABELS = [0, 1, 2, 3, 4, 5, 6, 7, 8]
# INT_FEATURE_NAMES = [
#     "cloud_height",
#     "cloud_amount",
#     "pressure_tendency",
# ]

NUMERIC_FEATURE_NAMES = [
    "cloud_height",
    "cloud_amount",
    "temperature",
    "temperature_dew",
    "pressure",
    "pressure_tendency",
    "pressure_tendency_value",
    "w_x",
    "w_y",
    "day_sin",
    "day_cos",
    "year_sin",
    "year_cos",
]

# CATEGORICAL_FEATURES_WITH_VOCABULARY = {
#     "cloud_height": list(weather["cloud_height"].unique()),
#     "cloud_amount": list(weather["cloud_amount"].unique()),
#     "pressure_tendency": list(weather["pressure_tendency"].unique())
# }

# CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES  #+ CATEGORICAL_FEATURE_NAMES

# COLUMN_DEFAULTS = [
#     [0.0] if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME] else ["NA"]
#     for feature_name in CSV_HEADER
# ]
COLUMN_DEFAULTS = [[0.0], [0],
 [0.0],
 [0.0],
 [0.0],
 [0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0]]
# FEATURE_NAMES

In [76]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(lambda features, target: (features, target))
#     ).map(lambda features, target: (features, target_label_lookup(target)))
    return dataset.cache()

In [77]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.int32
            )
    return inputs

In [78]:
def encode_inputs(inputs):
    encoded_features = []
    for feature_name in inputs:
#         if feature_name in CATEGORICAL_FEATURE_NAMES:
#             vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
#             # Create a lookup to convert a string values to an integer indices.
#             # Since we are not using a mask token, nor expecting any out of vocabulary
#             # (oov) token, we set mask_token to None and num_oov_indices to 0.
#             lookup = StringLookup(
#                 vocabulary=vocabulary, mask_token=None, num_oov_indices=0
#             )
#             # Convert the string input values into integer indices.
#             value_index = lookup(inputs[feature_name])
#             embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
#             # Create an embedding layer with the specified dimensions.
#             embedding = layers.Embedding(
#                 input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
#             )
#             # Convert the index values to embedding representations.
#             encoded_feature = embedding(value_index)
#         else:
            # Use the numerical features as-is.
        encoded_feature = inputs[feature_name]
        if inputs[feature_name].shape[-1] is None:
            encoded_feature = tf.expand_dims(encoded_feature, -1)

        encoded_features.append(encoded_feature)

    encoded_features = layers.concatenate(encoded_features)
    return encoded_features


In [79]:
class NeuralDecisionTree(keras.Model):
    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super(NeuralDecisionTree, self).__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        self.num_classes = num_classes

        # Create a mask for the randomly selected features.
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indicies = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        self.used_features_mask = one_hot[sampled_feature_indicies]

        # Initialize the weights of the classes in leaves.
        self.pi = tf.Variable(
            initial_value=tf.random_normal_initializer()(
                shape=[self.num_leaves, self.num_classes]
            ),
            dtype="float32",
            trainable=True,
        )

        # Initialize the stochastic routing layer.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        batch_size = tf.shape(features)[0]

        # Apply the feature mask to the input features.
        features = tf.matmul(
            features, self.used_features_mask, transpose_b=True
        )  # [batch_size, num_used_features]
        # Compute the routing probabilities.
        decisions = tf.expand_dims(
            self.decision_fn(features), axis=2
        )  # [batch_size, num_leaves, 1]
        # Concatenate the routing probabilities with their complements.
        decisions = layers.concatenate(
            [decisions, 1 - decisions], axis=2
        )  # [batch_size, num_leaves, 2]

        mu = tf.ones([batch_size, 1, 1])

        begin_idx = 1
        end_idx = 2
        # Traverse the tree in breadth-first order.
        for level in range(self.depth):
            mu = tf.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
            mu = tf.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]  # [batch_size, 2 ** level, 2]
            mu = mu * level_decisions  # [batch_size, 2**level, 2]
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        mu = tf.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
        outputs = tf.matmul(mu, probabilities)  # [batch_size, num_classes]
        return outputs

In [80]:
class NeuralDecisionForest(keras.Model):
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super(NeuralDecisionForest, self).__init__()
        self.ensemble = []
        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features to use.
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = tf.shape(inputs)[0]
        outputs = tf.zeros([batch_size, num_classes])

        # Aggregate the outputs of trees in the ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)
        # Divide the outputs by the ensemble size to get the average.
        outputs /= len(self.ensemble)
        return outputs

In [81]:
learning_rate = 0.01
batch_size = 265
num_epochs = 10
hidden_units = [64, 64]


def run_experiment(model):

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    print("Start training the model...")
    train_dataset = get_dataset_from_csv(
        train_data_file, shuffle=True, batch_size=batch_size
    )

    model.fit(train_dataset, epochs=num_epochs)
    print("Model training finished")

    print("Evaluating the model on the test data...")
    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)

    _, accuracy = model.evaluate(test_dataset)
    print("Test accuracy: ", round(accuracy * 100, 2))

In [82]:
num_trees = 10
depth = 10
used_features_rate = 1.0
num_classes = 2 #len(TARGET_LABELS)


def create_tree_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)

    outputs = tree(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


tree_model = create_tree_model()
run_experiment(tree_model)

Start training the model...
Cause: could not parse the source code:

    ).map(lambda features, target: (features, target))

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

    ).map(lambda features, target: (features, target))

This error may be avoided by creating the lambda in a standalone statement.

Epoch 1/10


InvalidArgumentError:  Field 0 in record is not a valid float: cloud_height
	 [[node IteratorGetNext (defined at <ipython-input-81-88cf3d0451ed>:20) ]] [Op:__inference_train_function_11980]

Function call stack:
train_function
