In [1]:
import torch

In [2]:
import tensorflow as tf

import pandas as pd
 
df = pd.DataFrame({'Department1':[78,16,89],
                   'Department2': ['Science','Maths','Biology']})
 
df
new_df_val = tf.data.Dataset.from_tensor_slices(dict(df))
new_df_val 
for i in new_df_val .take(3):
    print(i)

{'Department1': <tf.Tensor: shape=(), dtype=int64, numpy=78>, 'Department2': <tf.Tensor: shape=(), dtype=string, numpy=b'Science'>}
{'Department1': <tf.Tensor: shape=(), dtype=int64, numpy=16>, 'Department2': <tf.Tensor: shape=(), dtype=string, numpy=b'Maths'>}
{'Department1': <tf.Tensor: shape=(), dtype=int64, numpy=89>, 'Department2': <tf.Tensor: shape=(), dtype=string, numpy=b'Biology'>}


In [3]:
df = pd.DataFrame({'Department1':[178,965,156],
                   'Department2': ['Chemistry','Maths','Biology']})
 
df
new_df_val = tf.data.Dataset.from_tensor_slices(dict(df))
new_df_val 
for i in new_df_val .take(3):
    print(i)

{'Department1': <tf.Tensor: shape=(), dtype=int64, numpy=178>, 'Department2': <tf.Tensor: shape=(), dtype=string, numpy=b'Chemistry'>}
{'Department1': <tf.Tensor: shape=(), dtype=int64, numpy=965>, 'Department2': <tf.Tensor: shape=(), dtype=string, numpy=b'Maths'>}
{'Department1': <tf.Tensor: shape=(), dtype=int64, numpy=156>, 'Department2': <tf.Tensor: shape=(), dtype=string, numpy=b'Biology'>}


In [4]:
print(tf.config.list_physical_devices())

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [5]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [6]:
from urllib import request as rq
import pandas as pd
import os
import pyarrow as pa # this is needed for the parquet file
import numpy as np
import ipywidgets
from ipywidgets import widgets
from ipywidgets import interact, interactive, fixed, VBox
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns



In [7]:
# Function to load the Dublin bus gz files 
def load_Files(direc, files, comtype):
    columns = ['Timestamp', 'LineID', 'Direction', 'JourneyPatternID', 'TimeFrame', 'VehicleJourneyID', 'Operator', 'Congestion', 'LonWGS84', 'LatWGS84', 'Delay', 'BlockID', 'VehicleID', 'StopID', 'AtStop']
    for f in files:
        print(f)
        yield pd.read_csv(direc + f, compression=comtype, delimiter=',', header=0, names=columns, parse_dates=True, low_memory=True)

def crackit_open(busFile):
    import zipfile as zip
    # Zip creates its own folders - no need to check for folder existence
    with zip.ZipFile(busFile,  mode='r') as arc: 
        arc.extractall('./Data/Bus/Gz/')  
    files = os.listdir('./Data/Bus/Gz/')
    DBfiles = [f for f in files if f.endswith('.gz')]
    df = pd.concat(load_Files('./Data/Bus/Gz/', DBfiles, 'gzip'), copy = False)
    return df

def shapiro_test(x):
    p_val = stats.shapiro(x)[1]
    status = 'passed'
    color = 'blue'
    if p_val < 0.05:
        status = 'failed'
        color = 'red'
    return status, color, p_val

def custom_scatterplot(df1, col1=''):
    df1 = df1[df1["LineID"]==col1]
    f = plt.figure()
    f, ax = plt.subplots(figsize=(11.5, 11.5))
    ax = f.add_subplot(projection='3d')
    ax.scatter(df1['LonWGS84'], df1['LatWGS84'], df1['Hour'], alpha=0.6, color=df1['Colour'])
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_zlabel('Hour')
    dcol = str(col1)
    plt.savefig('./Images/Img_' + dcol + '_Longitude_Latitude_Hour.svg')
    df1.to_parquet('./Data/Bus/LineID_' + dcol + '.parquet')
    
    
def custom_barplot(df1, col1=''):
    if len(df1[col1]) > 5000: # added this to the function because of warnings about the size of data being used with shapiro test
            sampleSize = 5000
    else:
        sampleSize = len(df1[col1])
    df1 = df1.sample(sampleSize) #shapiro test is unreliable over 5000 https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test and performance reasons
    f, ax = plt.subplots(2,2, figsize=(11.5, 11.5))
    ax = ax.reshape(-1)
    df1[col1].plot(ax=ax[0], kind='hist')
    ax[0].set_title('Histogram of {}'.format(col1))
    df1[col1].plot(ax=ax[1], kind='kde')
    ax[1].set_title('Density Plot of {}'.format(col1))
    ax3 = plt.subplot(223)
    stats.probplot(df[col1], plot=plt)
    ax[2].set_title('QQ Plot of {}'.format(col1))
    df1[col1].plot(ax=ax[3], kind='box')
    ax[3].set_title('Box Plot of {}'.format(col1))
    status, color, p_val = shapiro_test(df1[col1]) 
    f.suptitle('Normality test for {} {} (p_value = {})'.format(col1, status, p_val), color=color, fontsize=12)

def num_missing(x):
    return len(x.index)-x.count()

def num_unique(x):
    return len(np.unique(x))

def load_csv_Files(direc, files):

    for f in files:
        # need to get number of rows to skip 
        temp=pd.read_csv(direc + f,sep='^',header=None,prefix='X')
        temp2=temp.X0.str.split(',',expand=True)
        del temp['X0']
        temp=pd.concat([temp,temp2],axis=1)
        cols = list(range(0,temp.shape[1]))

        print(f)
        yield pd.read_csv(direc + f,  delimiter=',', header=0,  parse_dates=True, low_memory=True, skiprows=14, usecols=cols, na_values='NAN')

In [8]:
if os.path.exists('./Data'):
    print('Data folder exists')

if os.path.exists('./Data/Bus'):
    print('Bus Data folder exists')
else:
    os.makedirs('./Data/Bus')

if os.path.exists('./Data/MetEirrean/'):
    print('Weather data folder exists')
else:
    os.makedirs('./Data/MetEirrean/')

if os.path.exists('./Zips/MetEirrean'):
    print('Weather folder exisits')
else:
    os.makedirs('./Zips/MetEirrean/')

if os.path.exists('./Images/'):
    print('Images folder exists')
else:
    os.makedirs('./Images/')



Data folder exists
Bus Data folder exists
Weather data folder exists
Weather folder exisits
Images folder exists


In [9]:
# Check parquet file existence before downloading - iof starting from afresh this takes a long time
if os.path.exists('./Data/CleanedBusData.parquet'):
    print("Parquest file exists, means the Bus data has been downloaded already ")
    df = pd.read_parquet('./Data/CleanedBusData.parquet')
    if os.path.exists('./Data/WeatherandBusData.parquet'):
        print('Weather and bus data combined exists')
        mdf = pd.read_parquet('./Data/WeatherandBusData.parquet')
elif os.path.exists('./Zips/Bus/DublinBusdata.zip'):
    print("Zip file exists, we have already downloaded the Dublin Bus Zip data, crack it open")
    df = crackit_open('./Zips/Bus/DublinBusdata.zip')
else:
    os.makedirs('./Zips/Bus/', exist_ok = True)
    url = "https://opendata.dublincity.ie/TrafficOpenData/sir010113-310113.zip"
    busFile = rq.urlretrieve(url, './Zips/Bus/DublinBusdata.zip' )  
    df = crackit_open('./Zips/Bus/DublinBusdata.zip')

df = df.sample(20000000)
### Read the Bus data in to a Pandas dataframe - done

Zip file exists, we have already downloaded the Dublin Bus Zip data, crack it open
siri.20130101.csv.gz
siri.20130102.csv.gz
siri.20130103.csv.gz
siri.20130104.csv.gz
siri.20130105.csv.gz
siri.20130106.csv.gz
siri.20130107.csv.gz
siri.20130108.csv.gz
siri.20130109.csv.gz
siri.20130110.csv.gz
siri.20130111.csv.gz
siri.20130112.csv.gz
siri.20130113.csv.gz
siri.20130114.csv.gz
siri.20130115.csv.gz
siri.20130116.csv.gz
siri.20130117.csv.gz
siri.20130118.csv.gz
siri.20130119.csv.gz
siri.20130120.csv.gz
siri.20130121.csv.gz
siri.20130122.csv.gz
siri.20130123.csv.gz
siri.20130124.csv.gz
siri.20130125.csv.gz
siri.20130126.csv.gz
siri.20130127.csv.gz
siri.20130128.csv.gz
siri.20130129.csv.gz
siri.20130130.csv.gz
siri.20130131.csv.gz


In [10]:
if 'Direction' in df.columns:
    df = df.drop(['Direction'], axis='columns')
if 'TimeFrame' in df.columns:
    df = df.drop(['TimeFrame'], axis='columns')
""" if {'VehicleJourneyID', 'JourneyPatternID'}.issubset(df.columns):  
    df['VehicleJourneyID'] = df['JourneyPatternID'] + '_' + df['VehicleJourneyID'].astype('str')
    df = df.drop(['JourneyPatternID','VehicleJourneyID'], axis='columns') """

df = df.dropna() 
df = df.drop_duplicates()

temp_df = df.describe().T
missing_df = pd.DataFrame(df.apply(num_missing, axis=0)) 
missing_df.columns = ['missing']  # type: ignore

display(temp_df)

display(missing_df)



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Timestamp,17007287.0,1358318000000000.0,738370500000.0,1356998000000000.0,1357680000000000.0,1358328000000000.0,1358955000000000.0,1359633000000000.0
LineID,17007287.0,78.41271,119.4481,1.0,18.0,40.0,83.0,747.0
VehicleJourneyID,17007287.0,10273.93,66829.41,1.0,2744.0,4814.0,6861.0,999856.0
Congestion,17007287.0,0.01366826,0.1161096,0.0,0.0,0.0,0.0,1.0
LonWGS84,17007287.0,-6.270799,0.08182937,-6.617517,-6.306787,-6.26167,-6.23214,-6.053033
LatWGS84,17007287.0,53.34426,0.05437022,53.06802,53.31951,53.34611,53.37347,53.60873
Delay,17007287.0,-53.37298,484.8823,-15045.0,-266.0,-21.0,114.0,116122.0
BlockID,17007287.0,102723.3,182311.5,390.0,16014.0,40108.0,83012.0,835002.0
VehicleID,17007287.0,35480.41,3307.243,28047.0,33315.0,33529.0,38030.0,43078.0
StopID,17007287.0,2647.753,2126.668,2.0,810.0,2039.0,4320.0,7552.0


Unnamed: 0,missing
Timestamp,0
LineID,0
JourneyPatternID,0
VehicleJourneyID,0
Operator,0
Congestion,0
LonWGS84,0
LatWGS84,0
Delay,0
BlockID,0


In [11]:
print ('The data has {} Rows and {} columns'.format(df.shape[0],df.shape[1]))
print("The types of columns are:")
display(df.dtypes)

The data has 17007287 Rows and 13 columns
The types of columns are:


Timestamp             int64
LineID              float64
JourneyPatternID     object
VehicleJourneyID      int64
Operator             object
Congestion            int64
LonWGS84            float64
LatWGS84            float64
Delay                 int64
BlockID               int64
VehicleID             int64
StopID              float64
AtStop                int64
dtype: object

In [12]:
new_df_val = tf.data.Dataset.from_tensor_slices(dict(df))

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.