In [1]:
import azureml.core
import pandas as pd
import numpy as np
import logging

# Record the installed azureml-core SDK version in the notebook output.
print(azureml.core.VERSION)

1.34.0


In [2]:
from azureml.core import Workspace, Experiment

# Resolve the workspace via Workspace.from_config() and open the
# experiment that the training runs are grouped under.
ws = Workspace.from_config()

experiment_name = "cab_training_experiment"
experiment = Experiment(workspace=ws, name=experiment_name)

In [4]:
# Push the combined 2020 CSV to the workspace's default datastore under
# 'dataset/', replacing any previously uploaded copy.
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=['./data/2020/combined_2020.csv'],
    target_path='dataset/',
    overwrite=True,
    show_progress=True,
)

Uploading an estimated of 1 files
Uploading ./data/2020/combined_2020.csv
Uploaded ./data/2020/combined_2020.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_62cfa435f247470291148d0c9697a509

In [5]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

# Workspace coordinates.
# NOTE(review): subscription_id is blank here - presumably redacted; confirm
# it is filled in before running this cell.
subscription_id = ''
resource_group = 'cabResourceGroup'
workspace_name = 'Cabalitics'

workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Look up the dataset by its name and display it as a DataFrame
# (the last expression of the cell is what gets rendered).
dataset = Dataset.get_by_name(workspace, name='cab_dataset_2020')
dataset.to_pandas_dataframe()

Unnamed: 0,year,month,day,weekday,trip_duration,tpep_dropoff_datetime,trip_distance,PULocationID,tip_amount,total_amount
2020-01-01 00:33:03,2020,1,1,3,288,2020-01-01 00:33:03,1.20,238,1.47,11.27
2020-01-01 00:43:04,2020,1,1,3,445,2020-01-01 00:43:04,1.20,239,1.50,12.30
2020-01-01 00:53:52,2020,1,1,3,371,2020-01-01 00:53:52,0.60,238,1.00,10.80
2020-01-01 01:00:14,2020,1,1,3,291,2020-01-01 01:00:14,0.80,238,1.36,8.16
2020-01-01 00:04:16,2020,1,1,3,138,2020-01-01 00:04:16,0.00,193,0.00,4.80
...,...,...,...,...,...,...,...,...,...,...
2020-12-31 23:31:36,2020,12,31,4,1563,2020-12-31 23:31:36,11.30,107,0.00,36.80
2020-12-31 23:05:33,2020,12,31,4,493,2020-12-31 23:05:33,2.18,236,2.56,15.36
2020-12-31 23:48:43,2020,12,31,4,488,2020-12-31 23:48:43,2.52,236,4.00,17.30
2020-12-31 23:57:39,2020,12,31,4,162,2020-12-31 23:57:39,0.59,238,2.08,10.38


In [9]:
# Materialise the dataset as a pandas DataFrame; this `data` frame is what the
# DataGenerator further down is constructed with.
data = dataset.to_pandas_dataframe()

## Data Generator

In [204]:
import numpy as np
import tensorflow as tf
import random

class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that assembles batches around randomly drawn timestamps.

    Every sample ``b`` is anchored at a random timestamp ``t`` drawn from
    ``[start_date, end_date)`` with one-second resolution:

    * ``X[b]`` has shape ``(265, 2, 3)``. Channel 0 holds the number of rows per
      ``PULocationID`` inside the two past windows ``[t-60min, t-30min]`` and
      ``[t-30min, t]`` (axis 2). Channels 1 and 2 hold the weekday and month of
      ``t`` as plain integers (they are not one-hot encoded).
    * ``y[b]`` has shape ``(265, 5)`` and describes the window
      ``[t+5min, t+30min]``: the row count per ``PULocationID`` followed by the
      per-location means of ``trip_duration``, ``trip_distance``,
      ``tip_amount`` and ``total_amount`` (in that order).

    Row ``k - 1`` of both arrays belongs to ``PULocationID == k``; locations
    without any rows in a window stay at zero. All windows include both ends.

    Args:
        data: DataFrame whose index is compared against pandas timestamps and
            which provides the ``PULocationID`` column plus the four target
            columns listed above.
            NOTE(review): assumes every PULocationID lies in 1..265 - confirm.
        batch_size: Number of samples per batch.
        num_batches: Number of batches reported per epoch.
        start_date: Inclusive lower bound for the random anchor timestamps.
        end_date: Exclusive upper bound for the random anchor timestamps.
    """

    NUM_ZONES = 265
    TARGET_COLUMNS = ['trip_duration', 'trip_distance', 'tip_amount', 'total_amount']

    def __init__(self, data, batch_size, num_batches,
                 start_date='2020-01-01', end_date='2021-01-01'):
        super().__init__()
        self.data = data
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.start_date = pd.to_datetime(start_date)
        self.end_date = pd.to_datetime(end_date)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch."""
        return self.num_batches

    def __getitem__(self, index):
        """Return one freshly sampled batch ``(X, y)``.

        ``index`` is accepted for the ``Sequence`` protocol but is not used:
        every call draws a new set of random anchor timestamps.
        """
        return self.__data_generation()

    def on_epoch_end(self):
        """Rebuild ``self.indexes`` as ``0 .. num_batches * batch_size - 1``."""
        self.indexes = np.arange(self.num_batches * self.batch_size)

    def _rows_between(self, start, end):
        """Return the rows of ``self.data`` whose index lies in ``[start, end]``."""
        # Filter on the generator's own frame, never on a notebook-level global.
        timestamps = self.data.index
        return self.data.loc[(timestamps >= start) & (timestamps <= end)]

    def __data_generation(self):
        """Build one ``(x_data, y_data)`` batch from random anchor timestamps."""
        anchors = self.random_dates(self.start_date, self.end_date)

        # Last axis of x_data: [row count, weekday, month].
        x_data = np.zeros((self.batch_size, self.NUM_ZONES, 2, 3), int)
        # Last axis of y_data: [row count, mean trip_duration, mean trip_distance,
        # mean tip_amount, mean total_amount].
        y_data = np.zeros((self.batch_size, self.NUM_ZONES, 5), float)

        # Minutes before the anchor at which each 30-minute past window starts.
        past_offsets = np.arange(60, 0, -30)

        for b, anchor in enumerate(anchors):
            x_data[b, :, :, 1:] = [anchor.weekday(), anchor.month]

            # INPUT (x_data): row counts per pickup location in each past window.
            for d, delta in enumerate(past_offsets):
                window = self._rows_between(
                    anchor - pd.Timedelta(minutes=int(delta)),
                    anchor - pd.Timedelta(minutes=int(delta) - 30),
                )
                demand = window['PULocationID'].value_counts()
                rows = demand.index.to_numpy().astype(int) - 1
                x_data[b, rows, d, 0] = demand.to_numpy()

            # OUTPUT (y_data): row counts and per-location means in the future window.
            future = self._rows_between(
                anchor + pd.Timedelta(minutes=5),
                anchor + pd.Timedelta(minutes=30),
            )
            demand = future['PULocationID'].value_counts()
            y_data[b, demand.index.to_numpy().astype(int) - 1, 0] = demand.to_numpy()

            # Select the target columns before averaging so unrelated columns
            # are never aggregated.
            means = future.groupby('PULocationID')[self.TARGET_COLUMNS].mean()
            y_data[b, means.index.to_numpy().astype(int) - 1, 1:] = means.to_numpy()

        return x_data, y_data

    def random_dates(self, start, end):
        """Draw ``batch_size`` random timestamps in ``[start, end)``.

        Args:
            start: pandas ``Timestamp`` giving the inclusive lower bound.
            end: pandas ``Timestamp`` giving the exclusive upper bound.

        Returns:
            ``DatetimeIndex`` of length ``batch_size`` with one-second resolution.
        """
        start_u = start.value // 10**9
        end_u = end.value // 10**9

        # Explicit 64-bit dtype so the bounds never depend on the platform's
        # default integer width.
        seconds = np.random.randint(start_u, end_u, self.batch_size, dtype=np.int64)
        return pd.to_datetime(seconds, unit='s')

In [233]:
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import InputLayer, Lambda, Dropout, BatchNormalization, Dense, \
                                    Conv2DTranspose, Input, Activation, Conv2D, MaxPool2D, Reshape
from tensorflow.keras.applications.resnet50 import ResNet50

from tensorflow.keras.optimizers import Adam

def fully_connected():
    """Build and compile the dense regression model.

    Input shape is ``(265, 2, 3)``. The first Dense stack acts on the last
    axis only; ``Reshape((265, 40))`` then merges the size-2 axis with the 20
    features, and a second Dense stack narrows down to 5 ReLU outputs,
    giving an output of shape ``(265, 5)``.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam with learning rate
        1e-3, mean-squared-error loss, mean-absolute-error metric).
    """
    model = tf.keras.Sequential()
    model.add(InputLayer(input_shape=(265, 2, 3)))

    # First stage: only the first Dense layer is followed by dropout.
    model.add(Dense(20, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    for units in (40, 80, 80, 40, 20):
        model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())

    # (265, 2, 20) -> (265, 40)
    model.add(Reshape((265, 40)))

    # Second stage: Dense -> BatchNorm -> Dropout.
    for units in (20, 10, 5):
        model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))

    model.add(Dense(5, name='output', activation='relu'))

    # The targets are continuous, so classification accuracy carries no
    # information here; track mean absolute error next to the MSE loss.
    model.compile(optimizer=Adam(1e-3),
                  loss='mean_squared_error',
                  metrics=['mean_absolute_error'])

    return model

def res_based():
    """Build and compile a ResNet50-backed regression model.

    An untrained ResNet50 (``weights=None``, ``include_top=False``) is applied
    to a ``(265, 2, 3)`` input and followed by a 10-unit ReLU Dense head.

    NOTE(review): the output shape of this head has not been checked against
    the ``(265, 5)`` targets used elsewhere in this notebook - confirm before
    training with it. A learning rate of 0.1 is also unusually high for Adam.

    Returns:
        A compiled ``tf.keras.Model`` whose output is the Dense head.
    """
    i = Input(shape=(265, 2, 3))
    backbone = ResNet50(include_top=False, weights=None, input_tensor=i)

    x = backbone.output
    o = Dense(10, activation='relu')(x)
    # Wire the Dense head into the model; using the raw backbone output here
    # would silently drop the head.
    model = Model(inputs=i, outputs=o)

    # The targets are continuous, so track mean absolute error instead of
    # classification accuracy.
    model.compile(optimizer=Adam(1e-1),
                  loss='mean_squared_error',
                  metrics=['mean_absolute_error'])

    return model


# Instantiate the dense model and print its layer-by-layer summary.
model = fully_connected()
model.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_120 (Dense)            (None, 265, 2, 20)        80        
_________________________________________________________________
batch_normalization_105 (Bat (None, 265, 2, 20)        80        
_________________________________________________________________
dropout_48 (Dropout)         (None, 265, 2, 20)        0         
_________________________________________________________________
dense_121 (Dense)            (None, 265, 2, 40)        840       
_________________________________________________________________
batch_normalization_106 (Bat (None, 265, 2, 40)        160       
_________________________________________________________________
dense_122 (Dense)            (None, 265, 2, 80)        3280      
_________________________________________________________________
batch_normalization_107 (Bat (None, 265, 2, 80)      

In [234]:
# 10 batches of 32 randomly sampled timestamps per epoch, for 30 epochs.
generator = DataGenerator(data, batch_size=32, num_batches=10)
model_history = model.fit(generator, epochs=30, verbose=1)

  ...
    to  
  ['...']
Train for 10 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30

KeyboardInterrupt: 