In [2]:
import os
import urllib.request

import pandas as pd



## **Descarga de datos**

In [2]:
def download_data(url, filename):
    """Download the resource at ``url`` and save it to ``filename``.

    Progress and failures are reported with ``print``.

    Parameters
    ----------
    url : str
        Location of the resource to fetch.
    filename : str or os.PathLike
        Destination path. Its parent folder is created when missing.

    Raises
    ------
    Exception
        Any error raised while preparing the folder or downloading is
        printed and then re-raised unchanged.
    """
    print(f"Downloading {url} to {filename}...")
    try:
        # Do not rely on another cell having created the target folder.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        urllib.request.urlretrieve(url, filename)
        print(f"✅ Downloaded {filename}")
    except Exception as e:
        print(f"❌ Error downloading {filename}: {e}")
        raise

Datos de duración de viajes entre puntos en NYC (archivos `green_tripdata` de enero y febrero de 2023).

In [3]:
# Source URLs of the January and February 2023 `green_tripdata` parquet files.
jan_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet"
feb_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet"

In [4]:
# Folder that receives the processed parquet files.
output_path = "data/processed"
# Folder where the downloaded parquet files are stored.
data_path = "data"

In [5]:
# Create the output folder; because it lives under `data`, this also creates
# the download folder. `exist_ok=True` keeps the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

In [6]:
# Fetch each monthly file into the download folder, January first.
for source_url, local_name in ((jan_url, "jan.parquet"), (feb_url, "feb.parquet")):
    download_data(source_url, os.path.join(data_path, local_name))

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet to data\jan.parquet...
✅ Downloaded data\jan.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet to data\feb.parquet...
✅ Downloaded data\feb.parquet


## **Carga de datos en pandas**

In [7]:
# Read the two downloaded monthly files into DataFrames.
jan_file = os.path.join(data_path, "jan.parquet")
feb_file = os.path.join(data_path, "feb.parquet")
df_jan, df_feb = pd.read_parquet(jan_file), pd.read_parquet(feb_file)

In [8]:
# Preview the first rows of the January frame as loaded.
df_jan.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.9,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.7,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.0
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.0,7.2,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.0
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.3,6.5,0.5,1.5,1.7,0.0,,1.0,10.2,1.0,1.0,0.0
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.1,6.0,0.5,1.5,0.0,0.0,,1.0,8.0,1.0,1.0,0.0


In [9]:
def duration_trip(df, min_minutes=1, max_minutes=60):
    """Add a trip ``duration`` column (in minutes) and keep plausible trips.

    The duration is ``lpep_dropoff_datetime - lpep_pickup_datetime``
    converted to minutes. Rows whose duration falls outside
    ``[min_minutes, max_minutes]`` (both bounds inclusive) are dropped, as
    are rows whose duration cannot be computed (missing timestamps).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with datetime columns ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime``. It is not modified.
    min_minutes, max_minutes : float, optional
        Inclusive bounds of the durations to keep. Default to 1 and 60.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame containing the kept rows (original index
        labels preserved) plus the ``duration`` column.
    """
    elapsed = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    # Vectorised conversion instead of a Python-level apply over every row.
    minutes = elapsed.dt.total_seconds() / 60
    # NaN durations compare as False and are therefore excluded.
    keep = minutes.between(min_minutes, max_minutes).to_numpy()
    # Explicit copy: the caller's frame stays untouched and the result can be
    # edited later without writing into a slice of the input.
    trips = df.loc[keep].copy()
    trips["duration"] = minutes.to_numpy()[keep]
    return trips

In [10]:
# Compute trip durations and filter both months with the same rule.
df_jan, df_feb = (duration_trip(monthly) for monthly in (df_jan, df_feb))

In [11]:
# Preview the January frame again, now with the `duration` column appended.
df_jan.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.9,...,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75,11.016667
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.7,...,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.0,6.766667
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.0,7.2,...,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.0,6.333333
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.3,6.5,...,1.5,1.7,0.0,,1.0,10.2,1.0,1.0,0.0,5.816667
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.1,6.0,...,1.5,0.0,0.0,,1.0,8.0,1.0,1.0,0.0,5.966667


In [12]:
# List the column names available after adding `duration`.
df_jan.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'duration'],
      dtype='object')

In [13]:
# Show the distinct values of `trip_type`, including missing ones.
df_jan.trip_type.unique()

array([ 1.,  2., nan])

In [14]:
# Columns removed from both months before the processed frames are saved.
# Reassigning (instead of `inplace=True`) avoids mutating frames that came out
# of a filter and keeps the column list in a single place.
columns_to_drop = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'VendorID']
df_jan = df_jan.drop(columns=columns_to_drop)
df_feb = df_feb.drop(columns=columns_to_drop)

In [15]:
# Save the processed frames into `output_path` (the folder created earlier),
# rather than repeating its literal value, so the two cannot drift apart.
df_jan.to_parquet(os.path.join(output_path, "jan.parquet"))
df_feb.to_parquet(os.path.join(output_path, "feb.parquet"))