# New York Taxi ETL

##### Import Spark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType, DoubleType, IntegerType
from pyspark.sql.functions import lit
from pyspark.sql.functions import current_timestamp
from pyspark.sql import Row
from pyspark.sql import functions as F

### Construct Spark Session

In [None]:
#Start SparkSession with azure hadoop package 
spark = SparkSession.builder.master('local').appName('app').config('spark.jars.packages', 'org.apache.hadoop:hadoop-azure:3.3.1').getOrCreate()        
spark.conf.set("fs.azure.account.key.springboardstorage.blob.core.windows.net",{azure_key})

In [None]:
# Create mount mount to connect to azure blob
# ...Use this once or enter into try / expect block
try:
    dbutils.fs.mount(source = "wasbs://springboardcontainer@springboardstorage.blob.core.windows.net",
    mount_point = "/mnt/taxi_etl",
    extra_configs = {"fs.azure.account.key.springboardstorage.blob.core.windows.net": {azure_key}})
# How to pass in java.lang.IllegalArgumentException?
except Exception as e:
    print(e)

An error occurred while calling o442.mount.
: java.rmi.RemoteException: java.lang.IllegalArgumentException: requirement failed: Directory already mounted: /mnt/taxi_etl; nested exception is: 
	java.lang.IllegalArgumentException: requirement failed: Directory already mounted: /mnt/taxi_etl
	at com.databricks.backend.daemon.data.client.DbfsClient.send0(DbfsClient.scala:135)
	at com.databricks.backend.daemon.data.client.DbfsClient.sendIdempotent(DbfsClient.scala:69)
	at com.databricks.backend.daemon.dbutils.DBUtilsCore.createOrUpdateMount(DBUtilsCore.scala:1025)
	at com.databricks.backend.daemon.dbutils.DBUtilsCore.$anonfun$mount$1(DBUtilsCore.scala:1051)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:555)
	at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:650)
	at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:671)
	at com.databricks.logging.UsageLogging.$ano

In [None]:
# View files in SpringBoard Container
dbutils.fs.ls("/mnt/taxi_etl")

Out[14]: [FileInfo(path='dbfs:/mnt/taxi_etl/__MACOSX/', name='__MACOSX/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/taxi_etl/chromedriver', name='chromedriver', size=14452880, modificationTime=1677008615000),
 FileInfo(path='dbfs:/mnt/taxi_etl/combined_trade_and_quote/', name='combined_trade_and_quote/', size=0, modificationTime=1688771082000),
 FileInfo(path='dbfs:/mnt/taxi_etl/data/', name='data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/taxi_etl/deltalake/', name='deltalake/', size=0, modificationTime=1686166290000),
 FileInfo(path='dbfs:/mnt/taxi_etl/taxi_data_logs/', name='taxi_data_logs/', size=0, modificationTime=1690845652000),
 FileInfo(path='dbfs:/mnt/taxi_etl/test_weather_data/', name='test_weather_data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/taxi_etl/trip_data/', name='trip_data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/taxi_etl/weather_data/', name='weather_data/', size=0, modificationTime=1678395045000),
 FileI

#### Grab Taxi Data

##### Find Dates

In [None]:
# Example URLs
# https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2009-01.parquet
# https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet
# https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2022-01.parquet
# https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2022-01.parquet

import pandas as pd
import requests
import os
import datetime

def get_existing_files():
    # Get filenames for existing files in the blob storage
    file_list = []
    files = dbutils.fs.ls("/mnt/taxi_etl/trip_data/")
    for x in range(len(files)):
        file = files[x][0].split('/')[-1]
        file_list.append(file)
    return file_list

def get_dates():
    # Get date range for last 3 months
    current_date = datetime.datetime.now()
    end_date = current_date.replace(day=1).strftime('%Y-%m-%d')
    last_month = current_date.replace(day=1)-datetime.timedelta(days=1)
    start_date = last_month.replace(month=last_month.month-6,day=1).strftime('%Y-%m-%d')
    yellow_dates = pd.date_range(start_date,end_date,freq='MS').strftime("%Y-%m").to_list()
    return yellow_dates

def remove_existing():
    # Compare potential new files to existing files
    new = get_dates()
    new_files = []
    for x in range(len(new)):
        new_files.append('yellow_'+new[x]+'.parquet')
    existing = get_existing_files()
    # Remove existing items
    to_add = [item for item in new_files if item not in existing]
    dates_to_add = []
    for x in range(len(to_add)):
        dates_to_add.append(to_add[x].split('_')[-1].split('.')[0])
    return dates_to_add

remove_existing()

yellow_dates = remove_existing()
print(yellow_dates)

['2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06', '2023-07', '2023-08']


##### Functions To Download and Store Data

In [None]:
class ScrapeNyTaxi:
    '''
    Functions to loop thru select dates from remove_existing() and download each parquet file in that range for yellow taxi data
    '''
    def grab_yellow():
        # For Logging
        start_time = datetime.datetime.now()
        start_time_str = start_time.isoformat()
        # Download yellow cab parquet files 
        for date in yellow_dates:
            try:
                url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{date}.parquet'
                response = requests.get(url, allow_redirects=True)               
                open(f'/dbfs/mnt/taxi_etl/trip_data/yellow_{date}.parquet','wb').write(response.content)
                # Get Time Following Write
                end_time = datetime.datetime.now()
                end_time_str = end_time.isoformat()
                # Open File and Read Num Rows
                parquet_file_path = f'/mnt/taxi_etl/trip_data/yellow_{date}.parquet'
                # If Bad Data Pull...Remove File with No Data but keep Log
                path = '/dbfs/mnt/taxi_etl/trip_data/'
                onlyfiles = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]                
                for file in onlyfiles:
                    if os.path.getsize(path+file) < 250:
                        os.remove(path+file)
                if os.path.exists(path+f'yellow_{date}.parquet'):
                    df = spark.read.parquet(parquet_file_path)
                    num_rows_ingested = df.count()
                else:
                    num_rows_ingested = 0
                # Logging File Names
                file_name = f'yellow_{date}.parquet'
                file_write_name = f'yellow_{date}.json'
                # Log Data
                data = {"StartTime":start_time_str,"EndTime":end_time_str,"RowsIngested":num_rows_ingested,"FileName":file_name}
                data_json = json.dumps(data)
                # Create the 'taxi_data_logs' directory if it doesn't exist
                logs_dir = '/dbfs/mnt/taxi_etl/taxi_data_logs/'
                if not os.path.exists(logs_dir):
                    os.makedirs(logs_dir)
                log_file_path = f'{logs_dir}{file_write_name}'
                open(log_file_path, 'w').write(data_json)

            except Exception as e:
                print(f'Exception: {e}')
                print(f'Could not write {file_name}')
                continue

In [None]:
ScrapeNyTaxi.grab_yellow()

Exception: An error occurred while calling o13933.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 188.0 failed 4 times, most recent failure: Lost task 0.3 in stage 188.0 (TID 520) (10.139.64.4 executor 0): org.apache.spark.SparkException: Exception thrown in awaitResult: Could not read footer for file: FileStatus{path=dbfs:/mnt/taxi_etl/trip_data/yellow_2023-08.parquet; isDirectory=false; length=255; replication=0; blocksize=0; modification_time=0; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false; hasAcl=false; isEncrypted=false; isErasureCoded=false}.
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:472)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:552)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:890)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInPar

##### Remove Empty files

In [None]:
import os
path = '/dbfs/mnt/taxi_etl/trip_data/'
onlyfiles = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
# Total number of files
print(f'Total: {len(onlyfiles)} files')

Total: 396 files


In [None]:
# Several files are empty, as there was no data to pull from the web
for file in onlyfiles:
    if os.path.getsize(path+file) < 250:
      print(f'{file} has no data')
      os.remove(path+file)

yellow_2023-06.parquet has no data
yellow_2023-07.parquet has no data
yellow_2023-08.parquet has no data


In [None]:
onlyfiles = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
# Total number of files
print(f'Total: {len(onlyfiles)} files')

Total: 396 files


##### Print Size of Files

In [None]:
yellow_bytes = 0
for file in onlyfiles:
    if file.startswith('yellow'):
        yellow_bytes += os.path.getsize(path+file)
print(f'yellow taxi data totals {yellow_bytes/1000000000} gigs')

yellow taxi data totals 18.526986873 gigs


#### Explore Taxi Data

##### Schemas

In [None]:
directory = '/dbfs/mnt/taxi_etl/trip_data/'

yellow_taxi = []
for file in os.listdir(directory):
    if file.startswith('yellow'):
        yellow_taxi.append(file)

all_data = [yellow_taxi]

In [None]:
def get_schema(data):
    # Read schema for each file and write to a .json file
    file_list = []
    schema_list = []

    for files in data:
        df = spark.read.option('inferSchema','true').format('parquet').load(directory[5:]+files)
        file_list.append(files)
        schema_list.append(str(df.dtypes))

    list_zip = zip(file_list,schema_list)
    zipped_list = list(list_zip)

    df_schema = StructType([ \
        StructField("File",StringType(),True), \
        StructField("Schema",StringType(),True),
    ]) 

    df = spark.createDataFrame(zipped_list,schema= df_schema)
    df = df.groupBy("Schema").agg(F.collect_list('File'))
    
    data_str = data[0]
    name = data_str.split(' ')[0]
    
    df.write.json(f'/dbfs/mnt/taxi_etl/trip_data/{name}_schema.json')
    df.show()

In [None]:
# Run schema finder for yellow data
'''
for data in all_data:
    get_schema(data)
'''

Out[24]: '\nfor data in all_data:\n    get_schema(data)\n'

#### Normalize Schemas

##### Define Schemas

In [None]:
# Define Schemas for each type of data 
yellow_schema = StructType([
    StructField('VendorID', LongType(), True),
    StructField('pickup_datetime', TimestampType(), True),
    StructField('dropoff_datetime', TimestampType(), True),
    StructField('passenger_count', StringType(), True),
    StructField('trip_distance', DoubleType(), True),
    StructField('RatecodeID', LongType(), True),
    StructField('store_and_fwd_flag', StringType(), True),
    StructField('PULocationID', LongType(), True),
    StructField('DOLocationID', LongType(), True),
    StructField('payment_type', LongType(), True),
    StructField('fare_amount', DoubleType(), True),
    StructField('extra', DoubleType(), True),
    StructField('mta_tax', DoubleType(), True),
    StructField('tip_amount', DoubleType(), True),
    StructField('tolls_amount', DoubleType(), True),
    StructField('improvement_surcharge', DoubleType(), True),
    StructField('total_amount', DoubleType(), True),
    StructField('congestion_surcharge', DoubleType(), True),
    StructField('airport_fee', IntegerType(), True),
    StructField('taxi_type', StringType(), True),
    ])

##### Cast Each Group's Schema and Union to Itself

In [None]:
# Script to make yellow schema
def make_yellow():

    emptyRDD = spark.sparkContext.emptyRDD()
    yellow_df = spark.createDataFrame(emptyRDD,schema=yellow_schema)

    yellow_list = []

    for file in os.listdir(directory):
        if file.startswith('yellow'):
            yellow_list.append(file)    

    for file in yellow_list:
        df_yellow = spark.read.option('inferSchema','true').parquet(f'{directory[5:]}{file}')
        df_yellow = df_yellow.withColumn('taxi_type',lit('yellow'))
        df_yellow = df_yellow.withColumnRenamed('tpep_pickup_datetime','pickup_datetime')\
            .withColumnRenamed('tpep_dropoff_datetime','dropoff_datetime')

        df_yellow.createOrReplaceTempView('Cast')

        df_yellow = spark.sql("SELECT BIGINT(VendorID),TIMESTAMP(pickup_datetime),\
            TIMESTAMP(dropoff_datetime),DOUBLE(passenger_count),DOUBLE(trip_distance),\
            BIGINT(RatecodeID),STRING(store_and_fwd_flag),BIGINT(PULocationID),BIGINT(DOLocationID),\
            BIGINT(payment_type),DOUBLE(fare_amount),DOUBLE(extra),DOUBLE(mta_tax),DOUBLE(tip_amount),\
            DOUBLE(tolls_amount),DOUBLE(improvement_surcharge),DOUBLE(total_amount),DOUBLE(congestion_surcharge),\
            DOUBLE(airport_fee),STRING(taxi_type) from Cast")

        yellow_df = df_yellow.union(yellow_df)
        print(f'{file} analyzed')

    yellow_df.printSchema()

    return yellow_df


In [None]:
# Test read of dataframe
def read_data():
    # Test read all yellow data
    yellow_df = spark.read.format('parquet').load(directory[5:]+'yellow*.parquet')
    yellow_df.printSchema()
read_data()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



#### Grab New York Weather Data

##### Import Packages

In [None]:
%pip install selenium

Python interpreter will be restarted.
Python interpreter will be restarted.


In [None]:
import datetime
from bs4 import BeautifulSoup as BS
from selenium import webdriver
import pandas as pd
import time



##### Install Chrome and ChromeDriver

In [None]:
dbutils.fs.mkdirs("dbfs:/databricks/scripts/")
dbutils.fs.put("/databricks/scripts/selenium-install.sh","""
#!/bin/bash
%sh
LAST_VERSION="https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2FLAST_CHANGE?alt=media"
VERSION=$(curl -s -S $LAST_VERSION)
if [ -d $VERSION ] ; then
  echo "version already installed"
  exit
fi
 
rm -rf /tmp/chrome/$VERSION
mkdir -p /tmp/chrome/$VERSION
 
URL="https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F$VERSION%2Fchrome-linux.zip?alt=media"
ZIP="${VERSION}-chrome-linux.zip"
 
curl -# $URL > /tmp/chrome/$ZIP
unzip /tmp/chrome/$ZIP -d /tmp/chrome/$VERSION
 
URL="https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F$VERSION%2Fchromedriver_linux64.zip?alt=media"
ZIP="${VERSION}-chromedriver_linux64.zip"
 
curl -# $URL > /tmp/chrome/$ZIP
unzip /tmp/chrome/$ZIP -d /tmp/chrome/$VERSION
 
mkdir -p /tmp/chrome/chrome-user-data-dir
 
rm -f /tmp/chrome/latest
ln -s /tmp/chrome/$VERSION /tmp/chrome/latest
 
# to avoid errors about missing libraries
sudo apt-get update
sudo apt-get install -y libgbm-dev
""", True)
display(dbutils.fs.ls("dbfs:/databricks/scripts/"))

Wrote 1045 bytes.


path,name,size,modificationTime
dbfs:/databricks/scripts/selenium-install.sh,selenium-install.sh,1045,1690915264000


In [None]:
%sh
/dbfs/databricks/scripts/selenium-install.sh

/dbfs/databricks/scripts/selenium-install.sh: line 3: fg: no job control

                                                                           0.0%
#                                                                          2.5%
######                                                                     8.8%
###########                                                               15.5%
###############                                                           22.1%
####################                                                      28.8%
#########################                                                 35.6%
##############################                                            42.5%
###################################                                       49.5%
########################################                                  56.6%
############################################                              62.2%
##############################################

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
s = Service('/tmp/chrome/latest/chromedriver_linux64/chromedriver')
options = webdriver.ChromeOptions()
options.binary_location = "/tmp/chrome/latest/chrome-linux/chrome"
options.add_argument('headless')
options.add_argument('--disable-infobars')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')
options.add_argument('--remote-debugging-port=9222')
options.add_argument('--homedir=/tmp/chrome/chrome-user-data-dir')
options.add_argument('--user-data-dir=/tmp/chrome/chrome-user-data-dir')
prefs = {"download.default_directory":"/tmp/chrome/chrome-user-data-di",
         "download.prompt_for_download":False
}
options.add_experimental_option("prefs",prefs)
# driver = webdriver.Chrome(service=s, options=options)

##### Load Wunderground Data

In [None]:
# Create Log Directory
dbutils.fs.mkdirs("/mnt/taxi_etl/weather_data_logs/test")

Out[5]: True

In [None]:
# Get Last Three Months of Weather Data
def get_existing_files():
    # Get filenames for existing files in the blob storage
    file_list = []
    files = dbutils.fs.ls("/mnt/taxi_etl/weather_data/")
    for x in range(len(files)):
        file = files[x][0].split('/')[-1][10:20]
        file_list.append(file)
    return file_list

def last_three_months():
    from datetime import date, timedelta
    # Get dates for last three months
    current_date = date.today()
    three_months_ago = current_date - timedelta(days=6
                                                *30)  # Approximating 30 days per month
    date_list = []
    while current_date >= three_months_ago:
        date_list.append(current_date.strftime('%Y-%m-%d'))
        current_date -= timedelta(days=1)
    return date_list

def get_dates():
    existing = get_existing_files()
    new_dates = last_three_months()
    to_add = [item for item in new_dates if item not in existing]
    return to_add

In [None]:
# function to load wunderground data (without this it has no records to show)
def render_page(url):
    driver = webdriver.Chrome(service=s, options=options)    
    driver.get(url)
    time.sleep(3)
    r = driver.page_source
    driver.quit()
    return r

In [None]:
def list_transpose(data_list):
    res_list = [[item.replace('%', '') for item in lst] for lst in data_list]
    res_list = [[item.replace(u'\xa0', u'') for item in lst] for lst in res_list]
    res_list = [[item.replace('°F','') for item in lst] for lst in res_list]
    res_list = [[item.replace('°in','') for item in lst] for lst in res_list]
    res_list = [[item.replace('°%','') for item in lst] for lst in res_list]
    res_list = [[item.replace('°mph','') for item in lst] for lst in res_list]
    final_list = [[item.replace('°','') for item in lst] for lst in res_list]
    return final_list

In [None]:
def set_schema(df):
    # To Interger
    df[["Temperature","Dew_Point", "Humidity","Wind_Speed","Wind_Gust"]] = df[["Temperature","Dew_Point", "Humidity","Wind_Speed","Wind_Gust"]].apply(pd.to_numeric)
    df[['Pressure','Precipitation']] = df[['Pressure','Precipitation']].apply(pd.to_numeric)
    # To DateTime
    df['datetime'] = df['datetime'].apply(pd.to_datetime)
    # To String
    df[['Wind','Condition']] = df[['Wind','Condition']].applymap(str)
    return df

In [None]:
from datetime import datetime
import json

def scraper(page, dates):
    '''function to scrape wunderground'''
    for d in dates:
        start_time = datetime.now()

        url = str(str(page) + str(d))
        print(url)
        r = render_page(url)

        soup = BS(r, "html.parser")
        container = soup.find('lib-city-history-observation')
        check = container.find('tbody')

        data = []
        try:
            for c in check.find_all('tr', class_='ng-star-inserted'):
                for i in c.find_all('td', class_='ng-star-inserted'):
                    trial = i.text
                    trial = trial.strip('  ')
                    data.append(trial)
            
            df_daily = []
            cols = ['Time','Temperature','Dew_Point','Humidity','Wind','Wind_Speed','Wind_Gust','Pressure','Precipitation','Condition','Date']
            for i in range(0,len(data),10):
                snip_data = []
                snip_data.append(data[i:i+10])
                # Strip of Weird Characters
                snip_data = list_transpose(snip_data)
                snip_data[0].append(d)
                df = pd.DataFrame(snip_data,columns=cols)
                df['datetime'] = df['Date'] + ' ' + df['Time']
                df = df.drop(['Date','Time'],axis=1) 
                # Set Schema
                df = set_schema(df)
                df_daily.append(df)

            df_daily = pd.concat(df_daily)
            num_rows_ingested = len(df_daily.index)
            df_daily.to_parquet(f'/dbfs/mnt/taxi_etl/weather_data/NY_Weather{d}.parquet')

            end_time = datetime.now()

            # Logging JSON
            file_name = f'NY_Weather{d}.parquet'
            file_write_name = f'NY_Weather{d}.json'
            start_time_str = start_time.isoformat()
            end_time_str = end_time.isoformat()
            data = {"StartTime":start_time_str,"EndTime":end_time_str,"RowsIngested":num_rows_ingested,"FileName":file_name}
            data_json = json.dumps(data)
            open(f'/dbfs/mnt/taxi_etl/weather_data_logs/{file_write_name}','w').write(data_json)
            
        except AttributeError:
            continue

# Call Functions
dates = get_dates()
page = 'https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/'

scraper(page, dates)

https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-27
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-26
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-25
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-24
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-23
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-22
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-21
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-20
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-19
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-18
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/2023-02-17
https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/202

In [None]:
%sh du -h /dbfs/mnt/taxi_etl/weather_data/

In [None]:
%sh ls /dbfs/mnt/taxi_etl/weather_data/ | wc -l

In [None]:
# Make Directory
dbutils.fs.mkdirs("/mnt/taxi_etl/weather_data/combined_df")

In [None]:
# Combine weather data to check that schema is readable for all files
def combine_weather_dfs():
    path = '/mnt/taxi_etl/weather_data/*.parquet'
    df = spark.read.option('inferSchema','true').parquet(path)
    df = df.drop('__index_level_0__')
    df = df.withColumnRenamed("Temperature", "temp(f)")\
       .withColumnRenamed("Dew_Point", "dew_point(f)")\
       .withColumnRenamed("Humidity", "humidity(%)")\
       .withColumnRenamed("Wind", "wind_direction")\
       .withColumnRenamed("Wind_Speed", "wind_speed(mph)")\
       .withColumnRenamed("Wind_Gust", "wind_gust(mph)")\
       .withColumnRenamed("Pressure", "pressure(inHg)")\
       .withColumnRenamed("Precipitation", "precipitation(in)")\
       .withColumnRenamed("Condition", "condition")
    df.printSchema()
    df.show()
combine_weather_dfs()

#### DeltaLake Analysis

##### Convert parquet to Delta

In [None]:
# Read All Parquet Files from deltalake/parquet/
path = '/mnt/taxi_etl/deltalake/parquet/*.parquet'
df = spark.read.option('inferSchema','true').parquet(path)

In [None]:
# Print Schema
df.printSchema()

In [None]:
# Save dataframe to delta
delta_path = '/mnt/taxi_etl/deltalake/delta'
df.coalesce(1).write.format('delta').mode('overwrite').save(delta_path)

In [None]:
# See files
dbutils.fs.ls(delta_path)

In [None]:
# View each log individually
log_directory = '/mnt/taxi_etl/deltalake/delta/_delta_log/'
files = dbutils.fs.ls(log_directory)
filenames = [file.name for file in files if file.name.endswith('.json')]
for filename in filenames:
    display(spark.read.text(log_directory+filename))

##### Analyze Taxi Data

In [None]:
# Key metrics can be visualized and data can be examined in the Data Profile. 

In [None]:
%sql
SELECT *
FROM delta.`/mnt/taxi_etl/deltalake/delta/`

In [None]:
# More summary statistics can be found by using describe()

In [None]:
file_path = '/mnt/taxi_etl/deltalake/delta/'
df = spark.read.format("delta").load(file_path)
df.describe().show()