# 1. Download data

In [1]:
import requests
from pathlib import Path

def dowload_one_file_of_raw_data(year:int, month: int) -> Path:
    URL = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet'
    response = requests.get(URL)
    
    if response.status_code == 200:
        path = f'../data/raw/rides_{year}-{month:02d}.parquet'
        open(path,'wb').write(response.content)
        return path
    else:
        raise Exception(f'{URL} is not avaliable')

In [2]:
dowload_one_file_of_raw_data(2022,1)

'../data/raw/rides_2022-01.parquet'

# 2. Load Data

In [5]:
import pandas as pd

data = pd.read_parquet('../data/raw/rides_2022-01.parquet')
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [None]:
rides = data[['tpep_pickup_datetime','PULocationID']]
rides.head()

Unnamed: 0,tpep_pickup_datetime,PULocationID
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68


In [None]:
rides.rename(columns={'tpep_pickup_datetime':'pickup_datetime',
                      'PULocationID':'pickup_location_id'},inplace=True)
rides

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides.rename(columns={'tpep_pickup_datetime':'pickup_datetime',


Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68
...,...,...
2463926,2022-01-31 23:36:53,90
2463927,2022-01-31 23:44:22,107
2463928,2022-01-31 23:39:00,113
2463929,2022-01-31 23:36:42,148


# 3. Validation Data

In [10]:
rides['pickup_datetime'].describe()

  rides['pickup_datetime'].describe()


count                 2463931
unique                1423522
top       2022-01-26 07:57:00
freq                       12
first     2008-12-31 22:23:09
last      2022-05-18 20:41:57
Name: pickup_datetime, dtype: object

In [11]:
## Remove date out the ranges
rides = rides[rides['pickup_datetime']>= '2022-01-01']
rides = rides[rides['pickup_datetime']< '2022-02-01']
rides['pickup_datetime'].describe()

  rides['pickup_datetime'].describe()


count                 2463879
unique                1423471
top       2022-01-26 07:57:00
freq                       12
first     2022-01-01 00:00:08
last      2022-01-31 23:59:58
Name: pickup_datetime, dtype: object

In [12]:
rides.to_parquet('../data/transformed/validated_rides_2022_01.parquet')