Install neccessary packages 

In [1]:
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [1]:
from urllib import request as rq
import pandas as pd
import os
import pyarrow as pa # not sure will i use this 
import numpy as np
import ipywidgets
from ipywidgets import widgets
from ipywidgets import interact, interactive, fixed, VBox
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

Download the Dublin Bus data from https://data.gov.ie/organization/dublin-city-council?tags=Transport+and+Infrastructure

In [3]:
# Check folder existence before downloading
if os.path.exists('./Zips/Bus'):
    print("Zips folder exists")
else:
    os.makedirs('./Zips/Bus/', exist_ok = True)

url = "https://opendata.dublincity.ie/TrafficOpenData/sir010113-310113.zip"
busFile = rq.urlretrieve(url, './Zips/Bus/DublinBusdata.zip' )    

Zips folder exists


In [4]:
import zipfile as zip
# Zip creates its own folders - no need to check for folder existence
with zip.ZipFile(busFile[0],  mode='r') as arc: 
    arc.extractall('./Data/Bus/Gz/')

Use a generator to load the data into pandas data frame https://pandasninja.com/2019/04/how-to-read-lots-of-csv-files-easily-into-pandas/#:~:text=How%20to%20read%20lots%20of%20csv%20files%20easily,need%20...%204%204.%20Leverage%20regular%20expressions%20


In [28]:

def load_Files(files):
    columns = ['Timestamp', 'LineID', 'Direction', 'JourneyPatternID', 'TimeFrame', 'VehicleJourneyID', 'Operator', 'Congestion', 'LonWGS84', 'LatWGS84', 'Delay', 'BlockID', 'VehicleID', 'StopID', 'AtStop']
    for f in files:
        print(f)
        yield pd.read_csv('./Data/Bus/Gz/' + f, compression='gzip', delimiter=',', header=0, names=columns, parse_dates=True, low_memory=True)

files = os.listdir('./Data/Bus/Gz/')
DBfiles = [f for f in files if f.endswith('.gz')]

if os.path.exists('./Data/Bus/CleanedBusData.parquet'):
    df = pd.read_parquet('./Data/Bus/CleanedBusData.parquet')
else:
    df = pd.concat(load_Files(DBfiles))

Download the weather station details https://cli.fusio.net/cli/climate_data/webdata/StationDetails.csv

In [29]:
# Getting the list of categorical columns
CategoricalColumns = list(set(list(df.select_dtypes(exclude=[np.number]).columns)))
NumericalColumns = list(set(list(df.select_dtypes([np.number]).columns)))

print(CategoricalColumns)
print(NumericalColumns)

ColumnsToExclude = ['JourneyPatternID', 'StopID']
CategoricalColumns = list(set(CategoricalColumns) - set(ColumnsToExclude))
NumericalColumns = list(set(NumericalColumns)-set(ColumnsToExclude))




['TimeFrame', 'Operator']
['VehicleJourneyID', 'Timestamp', 'Direction', 'LatWGS84', 'Delay', 'BlockID', 'Congestion', 'VehicleID', 'AtStop', 'LineID', 'LonWGS84']


In [None]:
df = df.drop(ColumnsToExclude, axis=1)
df = df.dropna()


In [30]:
df.head()

Unnamed: 0,Timestamp,LineID,Direction,TimeFrame,VehicleJourneyID,Operator,Congestion,LonWGS84,LatWGS84,Delay,BlockID,VehicleID,AtStop
0,1356998405000000,27.0,0,2012-12-31,3883,RD,0,-6.233417,53.342232,0,27017,33521,0
1,1356998407000000,40.0,0,2012-12-31,2226,HN,0,-6.27825,53.416683,0,40206,33142,0
2,1356998407000000,7.0,0,2012-12-31,6106,D1,0,-6.231633,53.317768,0,7019,43004,1
3,1356998411000000,747.0,0,2012-12-31,3531,SL,0,-6.254617,53.355484,-454,747007,40039,0
4,1356998411000000,56.0,0,2012-12-31,1830,RD,0,-6.233183,53.342201,0,56001,33488,0


In [31]:
df['datetime'] = pd.to_datetime(df['Timestamp'], unit='us')

In [32]:
df.head

<bound method NDFrame.head of                Timestamp  LineID  Direction   TimeFrame  VehicleJourneyID  \
0       1356998405000000    27.0          0  2012-12-31              3883   
1       1356998407000000    40.0          0  2012-12-31              2226   
2       1356998407000000     7.0          0  2012-12-31              6106   
3       1356998411000000   747.0          0  2012-12-31              3531   
4       1356998411000000    56.0          0  2012-12-31              1830   
...                  ...     ...        ...         ...               ...   
603915  1359632694000000    39.0          0  2013-01-31              3247   
603916  1359632694000000     9.0          0  2013-01-31              3649   
603917  1359632694000000    41.0          0  2013-01-31               112   
603918  1359632694000000   145.0          0  2013-01-31              6810   
603919  1359632694000000   145.0          0  2013-01-31              6471   

       Operator  Congestion  LonWGS84   LatWG

In [33]:
print ('The data has {} Rows and {} columns'.format(df.shape[0],df.shape[1]))
print("The types of columns are:")
df.dtypes

The data has 44453044 Rows and 14 columns
The types of columns are:


Timestamp                    int64
LineID                     float64
Direction                    int64
TimeFrame                   object
VehicleJourneyID             int64
Operator                    object
Congestion                   int64
LonWGS84                   float64
LatWGS84                   float64
Delay                        int64
BlockID                      int64
VehicleID                    int64
AtStop                       int64
datetime            datetime64[ns]
dtype: object

In [36]:
def num_missing(x):
    return len(x.index)-x.count()

def num_unique(x):
    return len(np.unique(x))

temp_df = df.describe().T
missing_df = pd.DataFrame(df.apply(num_missing, axis=0)) 
missing_df.columns = ['missing']
unq_df = pd.DataFrame(df.apply(num_unique, axis=0))
unq_df.columns = ['unique']
types_df = pd.DataFrame(df.dtypes)
types_df.columns = ['DataType']

In [42]:
df['datetime'].unique()

array(['2013-01-01T00:00:05.000000000', '2013-01-01T00:00:07.000000000',
       '2013-01-01T00:00:11.000000000', ...,
       '2013-01-31T11:44:29.000000000', '2013-01-31T11:44:40.000000000',
       '2013-01-31T11:44:54.000000000'], dtype='datetime64[ns]')

In [38]:
summary_df = temp_df.join(missing_df).join(unq_df).join(types_df)
summary_df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing,unique,DataType
Timestamp,44453044.0,1358318000000000.0,739496600000.0,1356998000000000.0,1357679000000000.0,1358327000000000.0,1358955000000000.0,1359633000000000.0,0,813102,int64
LineID,44453044.0,77.96879,114.6419,1.0,25.0,40.0,83.0,747.0,0,66,float64
Direction,44453044.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,int64
VehicleJourneyID,44453044.0,9417.338,61599.86,1.0,2536.0,4718.0,6769.0,999856.0,0,18614,int64
Congestion,44453044.0,0.01163315,0.1072279,0.0,0.0,0.0,0.0,1.0,0,2,int64
LonWGS84,44453044.0,-6.272804,0.08389086,-6.617517,-6.308651,-6.2616,-6.233166,-6.052917,0,456755,float64
LatWGS84,44453044.0,53.34513,0.05488335,53.06802,53.32005,53.34645,53.37532,53.60873,0,131327,float64
Delay,44453044.0,-28.16643,472.9787,-15045.0,-209.0,0.0,104.0,116122.0,0,12638,int64
BlockID,44453044.0,109238.0,192118.3,390.0,16020.0,40205.0,84004.0,835002.0,0,1228,int64
VehicleID,44453044.0,35429.29,3281.127,28047.0,33308.0,33525.0,38025.0,43078.0,0,911,int64


In [43]:
col_names = list(types_df.index)
num_cols = len(col_names)
index = range(num_cols)
cat_index = []
for i in index:
    if col_names[i] in CategoricalColumns:
        cat_index.append(i)
summary_df_cat = missing_df.join(unq_df).join(types_df.iloc[cat_index], how='inner') #Only summarize categorical columns
summary_df_cat

Unnamed: 0,missing,unique,DataType
TimeFrame,0,32,object
Operator,0,8,object


In [44]:
df.to_parquet('./Data/Bus/CleanedBusData.parquet')

In [45]:
url = 'https://cli.fusio.net/cli/climate_data/webdata/StationDetails.csv'


Load the weather statio details for Dublin

In [46]:
weatherStations = pd.read_csv(url)
weatherStations = weatherStations['county'].str.contains('Dublin')
weatherStations.head(10)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: county, dtype: bool

Unnamed: 0,county,station name,name,height(m),easting,northing,latitude,longitude,open year,close year
0,Antrim,5880,LH_RATHLIN_WEST,10,309200,451800,55.30083,-6.28028,2000,(null)
1,Carlow,4415,TULLOW (Waterworks),76,284700,173400,52.80528,-6.74306,1985,(null)
2,Carlow,2414,BORRIS G.S.,85,272400,150700,52.60278,-6.93056,1944,1991
3,Carlow,1214,CARLOW (SUGAR FACTORY),58,272200,178400,52.85139,-6.92778,1953,1960
4,Carlow,115,HACKETSTOWN RECTORY,182,297600,180500,52.86667,-6.55,1910,1944
