In [1]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False

2023-02-25 22:47:50.373565: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-25 22:47:53.059035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/prp/anaconda3/lib/
2023-02-25 22:47:53.059129: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/prp/anaconda3/lib/


In [2]:
zip_path = tf.keras.utils.get_file(
    origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
    fname='jena_climate_2009_2016.csv.zip',
    extract=True)
csv_path, _ = os.path.splitext(zip_path) #We load the dataset in a csv_file

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip


In [3]:
df = pd.read_csv(csv_path)

In [4]:
def preprocessing(data):
    
    # Getting rid of outliers
    data.loc[df['wv (m/s)'] == -9999.0, 'wv (m/s)'] = 0.0
    data.loc[df['max. wv (m/s)'] == -9999.0, 'max. wv (m/s)'] = 0.0
    
    # Taking values every hours
    data = data[5::6]# df[start,stop,step]
    
    wv = data.pop('wv (m/s)')
    max_wv = data.pop('max. wv (m/s)')

    # Convert to radians.
    wd_rad = data.pop('wd (deg)')*np.pi / 180

    # Calculate the wind x and y components.
    data['Wx'] = wv*np.cos(wd_rad)
    data['Wy'] = wv*np.sin(wd_rad)

    # Calculate the max wind x and y components.
    data['max Wx'] = max_wv*np.cos(wd_rad)
    data['max Wy'] = max_wv*np.sin(wd_rad)
    
    date_time = pd.to_datetime(data.pop('Date Time'), format='%d.%m.%Y %H:%M:%S')
    timestamp_s = date_time.map(datetime.datetime.timestamp)
    
    day = 24*60*60 # Time is second within a single day
    year = 365.2425*day # Time in second withon a year

    data['Day sin'] = np.sin(timestamp_s * (2*np.pi / day))
    data['Day cos'] = np.cos(timestamp_s * (2*np.pi / day))
    data['Year sin'] = np.sin(timestamp_s * (2*np.pi / year))
    data['Year cos'] = np.cos(timestamp_s * (2*np.pi / year))
    
    return(data)

In [5]:
def split(data):
    
    n = data.shape[0]
    
    train_df = data.iloc[0: n * 70 //100] # "iloc" because we have to select the lines at the indicies 0 to int(n*0.7) compared to "loc"
    val_df = data.iloc[n * 70 //100 : n * 90 //100]
    test_df = data.iloc[n * 90 //100:]
    
    return(train_df, val_df, test_df)

In [6]:
df_processed = preprocessing(df)

train_df, val_df, test_df = split(df_processed)

train_mean = train_df.mean() # returns a one column panda dataframe (serie) containing the mean of every columns
train_std = train_df.std() # same with standard deviation

train_df = (train_df - train_mean)/train_std # As simple as that !
val_df = (val_df - train_mean)/train_std
test_df = (test_df - train_mean)/train_std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Wx'] = wv*np.cos(wd_rad)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Wy'] = wv*np.sin(wd_rad)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['max Wx'] = max_wv*np.cos(wd_rad)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [7]:
type(train_df) 

pandas.core.frame.DataFrame

In [8]:
lookback = 48 # Looking at all features for the past 2 days
delay = 24 # Trying to predict the temperature for the next day
window_length = lookback + delay
batch_size = 32 # Features will be batched 32 by 32.

In [9]:
def create_dataset(X, y, delay=24):
    # X and y should be pandas dataframes
    Xs, ys = [], []
    for i in range(lookback, len(X)-delay):
        v = X.iloc[i-lookback:i].to_numpy() # every one hour, we take the past 48 hours of features
        Xs.append(v)
        w = y.iloc[i+delay] # Every timestep, we take the temperature the next delay (here one day)
        ys.append(w)
    return(np.array(Xs), np.array(ys))

In [11]:
X_train, y_train = create_dataset(train_df, train_df['T (degC)'], delay = delay)
X_val, y_val = create_dataset(val_df, val_df['T (degC)'], delay = delay)

In [12]:
print("X_train shape is {}: ".format(X_train.shape))
print("y_train shape is {}: ".format(y_train.shape))

print("\nX_val shape is {}: ".format(X_val.shape))
print("y_val shape is {}: ".format(y_val.shape))

X_train shape is (48991, 48, 19): 
y_train shape is (48991,): 

X_val shape is (13946, 48, 19): 
y_val shape is (13946,): 


In [13]:
def naive_eval_arr(X, y, lookback, delay):
    batch_maes = []
    for i in range(0, len(X)):
        preds = X[i, -1, 1] #For all elements in the batch, we are saying the prediction of temperature is equal to the last temperature recorded within the 48 hours
        mae = np.mean(np.abs(preds - y[i]))
        batch_maes.append(mae)
    return(np.mean(batch_maes))

naive_loss_arr = naive_eval_arr(X_val, y_val, lookback = lookback, delay = delay)

naive_loss_arr = round(naive_eval_arr(X_val, y_val, lookback = lookback, delay = delay),2) # Round the value
print(naive_loss_arr)

0.32


In [14]:
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.optimizers import RMSprop

In [15]:
# Let's start with a simple Dense model
model = Sequential([
    Flatten(input_shape=(lookback, 19)),
    Dense(32, activation='relu'),
    Dense(1) # We try to predict only one value for now
])

2023-02-25 23:00:08.018070: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-25 23:00:08.130655: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-25 23:00:08.130719: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-25 23:00:08.133214: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, 

In [16]:
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mae')
history = model.fit(X_train, y_train, epochs = 30, validation_data = (X_val, y_val), batch_size = 32)

Epoch 1/30


2023-02-25 23:00:20.485523: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x26f25880 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-02-25 23:00:20.485565: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce GTX 1660 Ti, Compute Capability 7.5
2023-02-25 23:00:20.540407: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-02-25 23:00:21.068097: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-02-25 23:00:21.197181: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
