In [2]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# step 2 load data
weather = pd.read_csv('donetsk_17_21_with_storms.csv', sep=';', header=0, parse_dates=True, squeeze=True)
weather.head()

Unnamed: 0,started_at,cloud_height,cloud_amount,wind_direction,wind_speed,temperature,temperature_dew,pressure,pressure_tendency,pressure_tendency_value,code
0,2017-09-01 00:00:00,5,8,0,0,14.4,13.5,989.4,7,1.3,0
1,2017-09-01 03:00:00,5,8,0,0,14.2,13.4,989.3,7,0.1,0
2,2017-09-01 06:00:00,5,8,0,0,15.2,13.2,989.7,3,0.4,0
3,2017-09-01 09:00:00,5,6,25,2,17.7,13.2,988.9,8,0.8,8
4,2017-09-01 12:00:00,5,7,25,4,16.5,13.4,989.1,3,0.2,8


In [3]:
# weather['wind_direction'] = weather['wind_direction']*10
# weather.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cloud_height,11360.0,6.929049,2.159598,2.0,5.0,6.0,9.0,10.0
cloud_amount,11360.0,4.674912,3.246734,0.0,0.0,6.0,8.0,9.0
wind_direction,11360.0,137.132923,117.341025,0.0,20.0,110.0,250.0,360.0
wind_speed,11360.0,2.798063,2.216409,0.0,2.0,2.0,4.0,16.0
temperature,11360.0,10.70515,10.880943,-20.8,1.2,10.3,19.7,37.8
temperature_dew,11360.0,4.112782,7.817085,-22.0,-1.5,4.2,10.4,21.5
pressure,11360.0,992.652729,7.295076,964.2,987.9,992.0,997.5,1015.3
pressure_tendency,11360.0,4.805722,2.476551,1.0,2.0,4.0,7.0,8.0
pressure_tendency_value,11360.0,0.735211,0.648641,0.0,0.3,0.6,1.0,7.1
code,11360.0,0.269366,1.041262,0.0,0.0,0.0,0.0,8.0


In [4]:
weather['wind_direction'] = weather['wind_direction']*10
wv = weather.pop('wind_speed')

# Convert to radians.
wd_rad = weather.pop('wind_direction')*np.pi / 180

# Calculate the wind x and y components.
weather['w_x'] = wv*np.cos(wd_rad)
weather['w_y'] = wv*np.sin(wd_rad)

In [5]:
date_time = pd.to_datetime(weather.pop('started_at'), format='%Y-%m-%d %H:%M:%S')
timestamp_s = date_time.map(pd.Timestamp.timestamp)

In [6]:
day = 24*60*60
year = (365.2425)*day

weather['day_sin'] = np.sin(timestamp_s * (2 * np.pi / day))
weather['day_cos'] = np.cos(timestamp_s * (2 * np.pi / day))
weather['year_sin'] = np.sin(timestamp_s * (2 * np.pi / year))
weather['year_cos'] = np.cos(timestamp_s * (2 * np.pi / year))

In [17]:
# weather.head()
weather.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cloud_height,11360.0,6.929049,2.159598,2.0,5.0,6.0,9.0,10.0
cloud_amount,11360.0,4.674912,3.246734,0.0,0.0,6.0,8.0,9.0
temperature,11360.0,10.70515,10.880943,-20.8,1.2,10.3,19.7,37.8
temperature_dew,11360.0,4.112782,7.817085,-22.0,-1.5,4.2,10.4,21.5
pressure,11360.0,992.6527,7.295076,964.2,987.9,992.0,997.5,1015.3
pressure_tendency,11360.0,4.805722,2.476551,1.0,2.0,4.0,7.0,8.0
pressure_tendency_value,11360.0,0.7352113,0.648641,0.0,0.3,0.6,1.0,7.1
code,11360.0,0.2693662,1.041262,0.0,0.0,0.0,0.0,8.0
w_x,11360.0,-0.01415475,2.13474,-10.0,-1.285575,0.0,1.285575,10.0
w_y,11360.0,0.480422,2.820315,-10.0,-0.68404,0.0,1.879385,16.0


In [7]:
train_splits = []
test_splits = []

for _, group_data in weather.groupby("code"):
    random_selection = np.random.rand(len(group_data.index)) <= 0.8
    train_splits.append(group_data[random_selection])
    test_splits.append(group_data[~random_selection])

train_data = pd.concat(train_splits).sample(frac=1).reset_index(drop=True)
test_data = pd.concat(test_splits).sample(frac=1).reset_index(drop=True)

print("Train split size: ",len(train_data.index))
print("Test split size: ",len(test_data.index))


Train split size:  9158
Test split size:  2202


In [8]:
train_data_file = "train_data.csv"
test_data_file = "test_data.csv"

train_data.to_csv(train_data_file, index=False)
test_data.to_csv(test_data_file, index=False)

In [75]:
CSV_HEADER = [
    "cloud_height",
    "cloud_amount",
    "temperature",
    "temperature_dew",
    "pressure",
    "pressure_tendency",
    "pressure_tendency_value",
    "w_x",
    "w_y",
    "day_sin",
    "day_cos",
    "year_sin",
    "year_cos",
    "code",
]
TARGET_FEATURE_NAME = "code"

TARGET_FEATURE_LABELS = [0, 1, 2, 3, 4, 5, 6, 7, 8]

NUMERIC_FEATURE_NAMES = [
    "cloud_height",
    "cloud_amount",
#     "temperature",
#     "temperature_dew",
#     "pressure",
#     "pressure_tendency",
#     "pressure_tendency_value",
#     "w_x",
#     "w_y",
#     "day_sin",
#     "day_cos",
#     "year_sin",
#     "year_cos",
]

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
#     "cloud_height": list(weather["cloud_height"].unique()),
#     "cloud_amount": list(weather["cloud_amount"].unique()),
#     "pressure_tendency": list(weather["pressure_tendency"].unique())
}

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME] else ["NA"]
    for feature_name in CSV_HEADER
]

NUM_CLASSES = len(TARGET_FEATURE_LABELS)
# COLUMN_DEFAULTS

In [76]:
def get_dataset_from_csv(csv_file_path, batch_size, shuffle=False):

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=True,
        shuffle=shuffle,
    )
    return dataset.cache()
# NUMERIC_FEATURE_NAMES

In [77]:
learning_rate = 0.001
dropout_rate = 0.1
batch_size = 265
num_epochs = 50

hidden_units = [32, 32]


def run_experiment(model):

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)

    test_dataset = get_dataset_from_csv(test_data_file, batch_size)

    print("Start training the model...")
    history = model.fit(train_dataset, epochs=num_epochs)
    print("Model training finished")

    _, accuracy = model.evaluate(test_dataset, verbose=0)

    print("Test accuracy: ",round(accuracy * 100, 2))


In [78]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

In [79]:
# from tensorflow.keras.layers.experimental.preprocessing import StringLookup

def encode_inputs(inputs, use_embedding=False):
    encoded_features = []
    for feature_name in inputs:
#         if feature_name in CATEGORICAL_FEATURE_NAMES:
#             vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
#             # Create a lookup to convert string values to an integer indices.
#             # Since we are not using a mask token nor expecting any out of vocabulary
#             # (oov) token, we set mask_token to None and  num_oov_indices to 0.
#             lookup = StringLookup(
#                 vocabulary=vocabulary,
#                 mask_token=None,
#                 num_oov_indices=0,
# #                 output_mode="int" if use_embedding else "binary",
#             )
# #             if use_embedding:
#             # Convert the string input values into integer indices.
#             encoded_feature = lookup(inputs[feature_name])
#             embedding_dims = int(math.sqrt(len(vocabulary)))
#             # Create an embedding layer with the specified dimensions.
#             embedding = layers.Embedding(
#                 input_dim=len(vocabulary), output_dim=embedding_dims
#             )
#             # Convert the index values to embedding representations.
#             encoded_feature = embedding(encoded_feature)
# #             else:
#                 # Convert the string input values into a one hot encoding.
# #                 encoded_feature = lookup(tf.expand_dims(inputs[feature_name], -1))
#         else:
            # Use the numerical features as-is.
        encoded_feature = tf.expand_dims(inputs[feature_name], -1)

        encoded_features.append(encoded_feature)

    all_features = layers.concatenate(encoded_features)
    return all_features

In [80]:
def create_baseline_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    
    for units in hidden_units:
        features = layers.Dense(units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.ReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

baseline_model = create_baseline_model()
# keras.utils.plot_model(baseline_model, show_shapes=True, rankdir="LR")

In [81]:
run_experiment(baseline_model)

Start training the model...
Epoch 1/50


  [n for n in tensors.keys() if n not in ref_input_names])


      1/Unknown - 0s 122us/step - loss: 3.0976 - sparse_categorical_accuracy: 0.0000e+00

InvalidArgumentError:  Received a label value of -1 which is outside the valid range of [0, 9).  Label values: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-77-d4399da19dd6>:22) ]] [Op:__inference_train_function_17349]

Function call stack:
train_function
