#### Download files from url

- Source: NYC TLC trip record data page — https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
- Download the 2023 Yellow Taxi trip records (one parquet file per month)

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re

import pandas as pd

import matplotlib.pyplot as plt

# Suppress scientific notation in pandas output: render floats with two decimals
pd.set_option('display.float_format', '{:.2f}'.format)

# Work from the project root and echo the resulting working directory
project_path = '/Users/reshma/AI/MLOPS Project/taxi_demand_predictor/'
os.chdir(project_path)
os.getcwd()

'/Users/reshma/AI/MLOPS Project/taxi_demand_predictor'

## 1. Data collection

In [2]:
def download_files_raw(url,raw_path,year,month):
    """Download Yellow Taxi trip-record files linked from the page at `url`.

    The page is scanned for anchors titled "Yellow Taxi Trip Records" whose
    href contains `year`. Each selected file is saved into `raw_path` with
    'yellow_tripdata' replaced by 'rides' in its name
    (e.g. 'rides_2023-01.parquet').

    Parameters
    ----------
    url : str
        Page that lists the trip-record download links.
    raw_path : str
        Directory the files are written to (created if it does not exist).
        The process working directory is left untouched.
    year : str or int
        Year to look for in the download links.
    month : "all" or int or str
        "all" downloads every month of `year`; otherwise only the given
        month is downloaded (1, "1" and "01" all mean January).

    Returns
    -------
    list of str, or str
        Names of the files written, in page order. If the listing page does
        not answer with HTTP 200 the string "Website not found" is returned
        instead of a list.
    """
    resp_url = requests.get(url)
    if resp_url.status_code != 200:
        # Sentinel kept for backward compatibility with existing callers
        return "Website not found"

    # None means "no month filter"; otherwise compare months as integers
    wanted_month = None if month == "all" else int(month)

    soup = BeautifulSoup(resp_url.text,"html.parser")
    yellow_a_tags = soup.find_all(title="Yellow Taxi Trip Records",href=re.compile(str(year)))

    if raw_path:
        os.makedirs(raw_path, exist_ok=True)

    files = []
    for link in yellow_a_tags:
        # The anchor on the website carries a trailing space, which would
        # otherwise show up as an extra %20 at the end of the file name
        yellow_href = link.get('href').strip()

        # Last path segment of the link is the file name
        file_name = yellow_href.rstrip('/').split('/')[-1]
        file_name = file_name.replace('yellow_tripdata','rides')

        # Apply the month filter before downloading so unwanted files are never fetched
        if wanted_month is not None:
            month_match = re.search(r'-(\d{1,2})\.', file_name)
            if month_match is None or int(month_match.group(1)) != wanted_month:
                continue

        resp_file = requests.get(yellow_href)
        if resp_file.status_code != 200:
            # Do not store an error page under a .parquet name
            print("Skipping",file_name,": download failed with status",resp_file.status_code)
            continue

        with open(os.path.join(raw_path,file_name),'wb') as f:
            f.write(resp_file.content)
        files.append(file_name)
    return files

In [3]:
# Landing page that links to the monthly TLC trip-record files
url = 'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'
# Raw downloads are stored under <project>/data/raw
raw_path = os.path.join(project_path, 'data/raw')
raw_files = download_files_raw(url, raw_path, year="2023", month="all")
raw_files

['rides_2023-01.parquet',
 'rides_2023-02.parquet',
 'rides_2023-03.parquet',
 'rides_2023-04.parquet',
 'rides_2023-05.parquet',
 'rides_2023-06.parquet',
 'rides_2023-07.parquet',
 'rides_2023-08.parquet',
 'rides_2023-09.parquet']

## 2. Data Validation

In [4]:
def validate_data_files(raw_files,raw_path,validated_path):
    """Filter each raw rides file to its own month and save the validated copy.

    For every file name in `raw_files` (expected form 'rides_YYYY-MM.parquet')
    the file is read from `raw_path`, reduced to the pickup timestamp and
    pickup location columns (renamed to 'pickup_time' and 'pickup_location'),
    and only rows whose pickup year and month match the year and month in the
    file name are kept. The result is written under the same file name in
    `validated_path`. Shapes before and after, plus a summary of the retained
    pickup times, are shown for each file.

    Once every file has been processed, the raw files are deleted.

    Parameters
    ----------
    raw_files : list of str
        File names to validate.
    raw_path : str
        Directory holding the raw files.
    validated_path : str
        Directory to write validated files to (with or without a trailing
        separator; created if it does not exist).

    Returns
    -------
    list of str
        Names of the validated files, in input order.
    """
    if validated_path:
        os.makedirs(validated_path, exist_ok=True)

    validated_files = []
    for f in raw_files :
        print("### Original file = ",f)
        f_path = os.path.join(raw_path, f)
        # 'rides_2023-01.parquet' -> (2023, 1)
        f_year, f_month = (int(val) for val in f.split('_')[1].split('.')[0].split('-'))
        df = pd.read_parquet(path = f_path)
        print("Original Data : ",df.shape)

        # Retain only rows that have correct year and month as per data file.
        # The mask is built from the source column so no helper columns are
        # assigned onto a slice (avoids SettingWithCopyWarning).
        pickup_raw = df['tpep_pickup_datetime']
        in_file_month = (pickup_raw.dt.year == f_year) & (pickup_raw.dt.month == f_month)
        df = (
            df.loc[in_file_month, ['tpep_pickup_datetime','PULocationID']]
              .rename(columns={'tpep_pickup_datetime':'pickup_time',
                               'PULocationID':'pickup_location'})
        )
        print("Validated Data : ",df.shape)

        try:
            # pandas < 2.0 needs the flag to summarise datetimes numerically
            pickup_summary = df['pickup_time'].describe(datetime_is_numeric=True)
        except TypeError:
            # pandas >= 2.0 removed the flag; datetimes are always numeric there
            pickup_summary = df['pickup_time'].describe()
        display(pickup_summary)

        # os.path.join gives the same path whether or not validated_path ends with a separator
        val_path = os.path.join(validated_path, f)
        df.to_parquet(path=val_path) #compression='snappy', index=None
        validated_files.append(f)

    #Delete raw files
    for f in raw_files:
        os.remove(os.path.join(raw_path, f))

    return validated_files

In [5]:
# Validated copies are stored under <project>/data/validated/
validated_path = os.path.join(project_path, 'data/validated/')
validated_files = validate_data_files(raw_files, raw_path, validated_path)
validated_files

### Original file =  rides_2023-01.parquet
Original Data :  (3066766, 19)
Validated Data :  (3066718, 2)


count                          3066718
mean     2023-01-17 00:27:48.391111680
min                2023-01-01 00:00:00
25%      2023-01-09 16:22:12.249999872
50%         2023-01-17 08:42:40.500000
75%                2023-01-24 16:26:28
max                2023-01-31 23:59:59
Name: pickup_time, dtype: object

### Original file =  rides_2023-02.parquet
Original Data :  (2913955, 19)
Validated Data :  (2913900, 2)


count                          2913900
mean     2023-02-15 00:50:29.105288448
min                2023-02-01 00:00:00
25%                2023-02-08 08:43:12
50%                2023-02-14 22:33:13
75%                2023-02-22 08:33:59
max                2023-02-28 23:59:59
Name: pickup_time, dtype: object

### Original file =  rides_2023-03.parquet
Original Data :  (3403766, 19)
Validated Data :  (3403577, 2)


count                          3403577
mean     2023-03-16 12:04:09.711691776
min                2023-03-01 00:00:00
25%                2023-03-08 18:48:21
50%                2023-03-16 12:18:37
75%                2023-03-24 08:55:02
max                2023-03-31 23:59:59
Name: pickup_time, dtype: object

### Original file =  rides_2023-04.parquet
Original Data :  (3288250, 19)
Validated Data :  (3288155, 2)


count                       3288155
mean     2023-04-16 08:02:08.040672
min             2023-04-01 00:00:00
25%             2023-04-08 18:29:11
50%             2023-04-16 10:47:44
75%             2023-04-23 16:24:33
max             2023-04-30 23:59:57
Name: pickup_time, dtype: object

### Original file =  rides_2023-05.parquet
Original Data :  (3513649, 19)
Validated Data :  (3513599, 2)


count                          3513599
mean     2023-05-16 05:26:10.975884544
min                2023-05-01 00:00:06
25%         2023-05-08 18:57:59.500000
50%                2023-05-16 09:27:21
75%         2023-05-23 14:57:14.500000
max                2023-05-31 23:59:56
Name: pickup_time, dtype: object

### Original file =  rides_2023-06.parquet
Original Data :  (3307234, 19)
Validated Data :  (3307193, 2)


count                          3307193
mean     2023-06-15 18:27:04.859210496
min                2023-06-01 00:00:00
25%                2023-06-08 09:12:52
50%                2023-06-15 14:15:27
75%                2023-06-23 02:25:03
max                2023-06-30 23:59:59
Name: pickup_time, dtype: object

### Original file =  rides_2023-07.parquet
Original Data :  (2907108, 19)
Validated Data :  (2907049, 2)


count                          2907049
mean     2023-07-17 07:12:31.895691520
min                2023-07-01 00:00:01
25%                2023-07-10 09:39:56
50%                2023-07-17 15:12:29
75%                2023-07-24 18:17:25
max                2023-07-31 23:59:55
Name: pickup_time, dtype: object

### Original file =  rides_2023-08.parquet
Original Data :  (2824209, 19)
Validated Data :  (2824175, 2)


count                          2824175
mean     2023-08-16 08:30:48.118145024
min                2023-08-01 00:00:00
25%                2023-08-08 16:26:17
50%                2023-08-16 09:53:25
75%         2023-08-24 07:07:35.500000
max                2023-08-31 23:59:59
Name: pickup_time, dtype: object

### Original file =  rides_2023-09.parquet
Original Data :  (2846722, 19)
Validated Data :  (2846690, 2)


count                          2846690
mean     2023-09-15 14:06:49.598458880
min                2023-09-01 00:00:00
25%                2023-09-08 12:29:51
50%         2023-09-14 15:46:29.500000
75%      2023-09-21 08:22:15.750000128
max                2023-09-30 23:59:59
Name: pickup_time, dtype: object

['rides_2023-01.parquet',
 'rides_2023-02.parquet',
 'rides_2023-03.parquet',
 'rides_2023-04.parquet',
 'rides_2023-05.parquet',
 'rides_2023-06.parquet',
 'rides_2023-07.parquet',
 'rides_2023-08.parquet',
 'rides_2023-09.parquet']