# The scraper extracts road incident data for Sydney from the *Historical Traffic API | TfNSW Open Data Hub and Developer Portal*
[Link to the API](https://opendata.transport.nsw.gov.au/dataset/historical-traffic-api)

In [1]:
import requests
import json
import pandas as pd
import datetime
import pickle
import os
from pandas_profiling import ProfileReport
import numpy as np
import re

**Below are the mandatory inputs required from the user**

In [3]:
# Define the start date, end date and suburb label for this run.
# These are mandatory: the same three values are passed to every pipeline stage, and the
# output directory is named "{start_date}_{end_date}_{suburb}".
# Dates are strings in YYYY-MM-DD form. The suburb value only appears in the directory name.
start_date = "2012-01-01" # Function default when omitted: "2012-01-01"
end_date = "2022-09-30" # Function default when omitted: today's date
suburb = 'Suburb'

**There are four major functions defined that provide the final dataset:**

1. scraper()
   > extracts the data from the API for the given date range and location <br>
   > writes the extracted data to a pickle file and a JSON file
---
2. process()
   > reads the pickle file created by scrapper() <br>
   > fetches all the required columns <br>
   > writes the processed data to a pickle file and a CSV file
---
3. clean_data()
   > keeps data only inside the bounding box coordinates for Sydney <br>
   > uses regex to clean some columns
---
4. reduce()
   > keeps only the latest record for each incident <br>
   > keeps only the unique incidents

In [11]:
def _bimonthly_windows(start_date, end_date):
    """Split the span from start_date to end_date into two-month query windows.

    Both arguments are "YYYY-MM-DD" strings and are compared as strings. The
    calendar windows are Jan-Feb, Mar-Apr, May-Jun, Jul-Aug, Sep-Oct and
    Nov-Dec. The first window starts at start_date and the last one ends at
    end_date. Returns a list of [window_start, window_end] string pairs, which
    is empty when the start year is after the end year.
    """
    windows = []
    started = False
    for year in range(int(start_date[:4]), int(end_date[:4]) + 1):
        is_leap = year % 4 == 0 and (year % 400 == 0 or year % 100 != 0)
        february_end = 29 if is_leap else 28
        # Last day of the second month of each window: Feb, Apr, Jun, Aug, Oct, Dec.
        for month, last_day in zip((1, 3, 5, 7, 9, 11), (february_end, 30, 30, 31, 31, 31)):
            window_start = f"{year}-{month:02}-01"
            window_end = f"{year}-{month + 1:02}-{last_day}"
            if not started:
                # Skip calendar windows until the one that contains start_date.
                if not (window_start <= start_date <= window_end):
                    continue
                started = True
                window_start = start_date
            if end_date <= window_end:
                windows.append([window_start, end_date])
                break
            windows.append([window_start, window_end])
    return windows


def scraper(start_date="2012-01-01", end_date=datetime.date.today().strftime(format="%Y-%m-%d"), suburb="Suburb", types=('Crash', 'Breakdown', 'Accident')):
    """Download historical traffic incidents and store them as Raw.pkl / Raw.json.

    The requested period is queried in two-month windows. From each response,
    only results whose Hazards -> features -> properties -> mainCategory value
    is in ``types`` are kept.

    Args:
        start_date: First day to query, "YYYY-MM-DD".
        end_date: Last day to query, "YYYY-MM-DD". Dates in the future are
            clamped to today for the API queries. The default is evaluated
            once, when the function is defined.
        suburb: Label used only in the output directory name.
        types: Main categories to keep.

    Side effects:
        Creates the directory "{start_date}_{end_date}_{suburb}" if needed and
        writes Raw.pkl and Raw.json into it. The directory name uses the
        end_date exactly as passed in, so that process(), clean_data() and
        reduce() called with the same arguments find the files even when the
        query end was clamped.
    """
    api_url = "https://api.transport.nsw.gov.au/v1/traffic/historicaldata"
    # NOTE(review): the API key is embedded in source; consider loading it from configuration.
    api_headers = {'Content-Type': 'application/json', 'Authorization': 'apikey ZRCn7V9FANYIdOBKduUwmkbfNDO1FHukUW2Z'}

    # Fixed before clamping so every pipeline stage agrees on the directory name.
    output_dir = f"{start_date}_{end_date}_{suburb}"

    today = datetime.date.today().strftime(format="%Y-%m-%d")
    query_end = today if end_date > today else end_date

    incidents = []
    for window_start, window_end in _bimonthly_windows(start_date, query_end):
        text = ('{ "showHistory": true,"created":"' + window_start + '","end":"' + window_end
                + '","radius": 52,"latitude": -33.81745,"longitude": 150.9068}')
        try:
            response = requests.post(url=api_url, headers=api_headers, data=text)
            results = response.json()['result']
            count = 0
            for result in results:
                if result["Hazards"]["features"]["properties"]["mainCategory"] in types:
                    incidents.append(result)
                    count += 1
        except KeyError as exc:
            # The response (or one of its results) lacks an expected key; move on to the next window.
            print('Skipped', window_start, window_end, '- missing key', exc)
            continue
        except Exception as exc:
            # Best effort: report the failed window and keep going. Catching Exception
            # (not a bare except) leaves Ctrl-C able to stop a long download.
            print('Failed', window_start, window_end, exc)
            continue
        print(response)
        print(text[23:63])  # the 'created":"...","end":"..."' part of the request body
        print(count, '\n')

    os.makedirs(output_dir, exist_ok=True)

    with open(f"{output_dir}/Raw.pkl", 'wb') as f:
        pickle.dump(incidents, f)

    with open(f"{output_dir}/Raw.json", 'w') as f:
        json.dump(incidents, f)


def fetch_values(dic, *keys):
    """Follow ``keys`` one level at a time into a nested structure.

    Args:
        dic: The outermost container (normally a dict).
        *keys: Keys (or list indices) to apply in order.

    Returns:
        The value reached after applying every key, or None as soon as a step
        cannot be taken: the key is missing, the index is out of range, or the
        current value is not subscriptable (for example None). With no keys,
        ``dic`` itself is returned.
    """
    for key in keys:
        try:
            dic = dic[key]
        except (KeyError, IndexError, TypeError):
            # A missing key and a null/scalar intermediate value both mean "no value here".
            return None
    return dic


# Column names of the processed table, in the same order as the values built by _incident_to_record().
_PROCESSED_COLUMNS = [
    'ID', 'Main Category', 'Longitude', 'Latitude', 'Start Time', 'Last Updated',
    'Duration', 'Sub Category A', 'Sub Category B', 'Attending Groups',
    'Display Name', 'Is Major', 'Diversions', 'Advice A', 'Advice B', 'Other Advice',
    'Closure Type', 'Direction', 'Main Street', 'Closed Lanes', 'Number of Lanes',
    'Suburb', 'Traffic Volume', 'Ended', 'Is New Incident',
]


def _incident_to_record(incident):
    """Flatten one raw incident into a list of values ordered like _PROCESSED_COLUMNS.

    Raises whatever the lookups raise when a required part is missing or has an
    unexpected type (for example no coordinates, timestamps or roads entry);
    the caller decides how to handle that.
    """
    features = incident['Hazards']['features']
    props = features['properties']

    coordinates = fetch_values(features, 'geometry', 'coordinates')
    longitude = coordinates[0]
    latitude = coordinates[1]

    # Timestamps are divided by 1000 before conversion (presumably epoch milliseconds).
    # fromtimestamp() returns naive datetimes in the machine's local time zone.
    start_time = datetime.datetime.fromtimestamp(fetch_values(features, 'incidentActualStartDate') / 1000.0)
    last_updated = datetime.datetime.fromtimestamp(fetch_values(props, 'lastUpdated') / 1000.0)
    duration = (last_updated - start_time).total_seconds() / 60  # minutes

    groups = fetch_values(props, 'attendingGroups')
    attending_groups = None if groups is None else ','.join(groups)

    # Only the first road entry is used.
    road = fetch_values(props, 'roads')[0]

    # Reset for every incident: without this, an incident that has no
    # 'impactedLanes' would reuse the lane details of the previous incident.
    closure_type = direction = closed_lanes = number_of_lanes = None
    impacted = fetch_values(road, 'impactedLanes')
    if impacted:
        lane = impacted[0]
        # Lane-level values are the fallback; the first 'periods' entry wins when it has both keys.
        closure_type = fetch_values(lane, 'extent')
        direction = fetch_values(lane, 'affectedDirection')
        periods = fetch_values(props, 'periods')
        if periods:
            try:
                closure_type, direction = periods[0]['closureType'], periods[0]['direction']
            except (KeyError, IndexError, TypeError):
                pass
        closed_lanes = fetch_values(lane, 'closedLanes')
        number_of_lanes = fetch_values(lane, 'numberOfLanes')

    return [
        fetch_values(features, 'id'),
        fetch_values(props, 'mainCategory'),
        longitude,
        latitude,
        start_time,
        last_updated,
        duration,
        fetch_values(props, 'subCategoryA'),
        fetch_values(props, 'subCategoryB'),
        attending_groups,
        fetch_values(props, 'displayName'),
        fetch_values(props, 'isMajor'),
        fetch_values(props, 'diversions'),
        fetch_values(props, 'adviceA'),
        fetch_values(props, 'adviceB'),
        fetch_values(props, 'otherAdvice'),
        closure_type,
        direction,
        fetch_values(road, 'mainStreet'),
        closed_lanes,
        number_of_lanes,
        fetch_values(road, 'suburb'),
        fetch_values(road, 'trafficVolume'),
        fetch_values(props, 'ended'),
        fetch_values(props, 'isNewIncident'),
    ]


def process(start_date="2012-01-01", end_date=datetime.date.today().strftime(format="%Y-%m-%d"), suburb="Suburb"):
    """Turn the raw incidents written by scraper() into a flat table.

    Reads "{start_date}_{end_date}_{suburb}/Raw.pkl", extracts one row per
    incident and writes the resulting DataFrame to Processed.pkl and
    Processed.csv in the same directory.

    An incident that cannot be converted (missing roads entry, coordinates or
    timestamps, unexpected types) is skipped; the error and the incident id
    are printed so the run can continue with the remaining incidents.

    Args:
        start_date, end_date, suburb: Must match the values given to
            scraper(); they only determine the directory name. The end_date
            default is evaluated once, when the function is defined.
    """
    folder = f"{start_date}_{end_date}_{suburb}"
    with open(f"{folder}/Raw.pkl", 'rb') as f:
        incidents = pickle.load(f)

    records = []
    for incident in incidents:
        try:
            records.append(_incident_to_record(incident))
        except Exception as e:
            print(e)
            print(fetch_values(incident, 'Hazards', 'features', 'id'))

    Final_Data = pd.DataFrame.from_records(records, columns=_PROCESSED_COLUMNS)

    with open(f"{folder}/Processed.pkl", 'wb') as f:
        pickle.dump(Final_Data, f)
    Final_Data.to_csv(f"{folder}/Processed.csv")


def clean_data(start_date="2012-01-01", end_date=datetime.date.today().strftime(format="%Y-%m-%d"), suburb="Suburb"):
    """Filter and tidy the table written by process().

    Reads "{start_date}_{end_date}_{suburb}/Processed.pkl" and then:
      * keeps rows inside the bounding box 150.4605..151.3531 longitude and
        -34.1004..-33.5345 latitude, with a Duration of at least 1;
      * lower-cases 'Display Name';
      * sets 'Main Category' to 'Other' when the display name mentions none of
        crash/breakdown/accident, and to 'Crash' when the display name
        mentions 'accident' or the category itself contains 'Accident';
      * strips <...> tags from the string values of 'Diversions' and
        'Other Advice' (non-string values are left as they are);
      * renumbers the index and replaces '' and ' ' with NaN.

    The result is written to Cleaned.pkl and Cleaned.csv in the same directory.

    Args:
        start_date, end_date, suburb: Only determine the directory name. The
            end_date default is evaluated once, when the function is defined.
    """
    folder = f"{start_date}_{end_date}_{suburb}"
    with open(f"{folder}/Processed.pkl", 'rb') as f:
        df = pickle.load(f)

    inside_box = (
        (df.Longitude >= 150.4605) & (df.Longitude <= 151.3531)
        & (df.Latitude >= -34.1004) & (df.Latitude <= -33.5345)
    )
    # Copy the filtered rows so the column assignments below write to an
    # independent frame rather than to a slice of the loaded one.
    df = df[inside_box & (df.Duration >= 1)].copy()

    df['Display Name'] = df['Display Name'].str.lower()
    display_name = df['Display Name']
    df['Main Category'] = np.where(
        ~display_name.str.contains('crash|breakdown|accident', na=False), 'Other', df['Main Category'])
    df['Main Category'] = np.where(display_name.str.contains('accident', na=False), 'Crash',
                                   df['Main Category'])
    df['Main Category'] = np.where(df['Main Category'].str.contains('Accident', na=False), 'Crash',
                                   df['Main Category'])

    tag_pattern = re.compile('<.*?>')
    for column in ['Diversions', 'Other Advice']:
        # Only strings can contain markup; None/NaN and other values pass through untouched.
        df[column] = df[column].map(
            lambda value: tag_pattern.sub('', value) if isinstance(value, str) else value)

    df.reset_index(drop=True, inplace=True)
    df.replace(['', ' '], np.nan, inplace=True)

    with open(f"{folder}/Cleaned.pkl", 'wb') as f:
        pickle.dump(df, f)
    df.to_csv(f"{folder}/Cleaned.csv")


def reduce(start_date="2012-01-01", end_date=datetime.date.today().strftime(format="%Y-%m-%d"),suburb="Suburb"):
    """Keep only the most recently updated record of every incident.

    Reads "{start_date}_{end_date}_{suburb}/Cleaned.pkl" (the file written by
    clean_data()), keeps for each 'ID' the rows whose 'Last Updated' equals
    that ID's maximum, drops exact duplicate rows, renumbers the index and
    writes the result to Reduced.pkl and Reduced.csv in the same directory.
    Rows keep the order they had in the cleaned data.

    Args:
        start_date, end_date, suburb: Only determine the directory name. The
            end_date default is evaluated once, when the function is defined.
    """
    folder = f"{start_date}_{end_date}_{suburb}"
    # clean_data() writes "Cleaned.pkl"; the capital C matters on case-sensitive filesystems.
    with open(f"{folder}/Cleaned.pkl", 'rb') as f:
        df = pickle.load(f)

    # Compare each row with the latest update of its own ID, so a row of another
    # incident that merely shares the timestamp is not pulled in.
    latest_per_id = df.groupby('ID')['Last Updated'].transform('max')
    reduced = df[df['Last Updated'] == latest_per_id].drop_duplicates().reset_index(drop=True)

    with open(f"{folder}/Reduced.pkl", 'wb') as f:
        pickle.dump(reduced, f)
    reduced.to_csv(f"{folder}/Reduced.csv")

In [None]:
# Run the four stages in order; each one reads the files written by the stage before it.
run_args = {'start_date': start_date, 'end_date': end_date, 'suburb': suburb}
for stage in (scraper, process, clean_data, reduce):
    stage(**run_args)

The command below performs a quick exploratory data analysis of the data and writes an HTML report.

In [None]:
# The pipeline functions write their results to disk and return nothing, so no
# `df` exists at notebook level. Load the final dataset written by reduce()
# before profiling it.
with open(f"{start_date}_{end_date}_{suburb}/Reduced.pkl", 'rb') as f:
    df = pickle.load(f)
ProfileReport(df).to_file('Report.html')