## Ingestion code & exploration for green taxi data

### First, try reading a single CSV file from the green taxi data [URL](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green)

In [28]:
# Sample only the first 100 rows of the October 2020 file to inspect its layout.
url = (
    "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/"
    "green_tripdata_2020-10.csv.gz"
)
df = pd.read_csv(filepath_or_buffer=url, nrows=100)

In [29]:
# Show the first five rows of the sample.
df.head(5)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2020-10-01 00:31:19,2020-10-01 00:34:55,N,1,7,7,1,0.79,5.0,0.5,0.5,1.58,0.0,,0.3,7.88,1,1,0.0
1,2,2020-10-01 00:42:12,2020-10-01 00:43:51,N,1,179,7,1,0.5,4.0,0.5,0.5,0.0,0.0,,0.3,5.3,2,1,0.0
2,2,2020-10-01 00:53:09,2020-10-01 00:55:39,N,1,179,223,1,0.6,4.0,0.5,0.5,1.06,0.0,,0.3,6.36,1,1,0.0
3,1,2020-10-01 00:12:29,2020-10-01 00:20:08,N,1,134,216,2,4.4,13.5,0.5,0.5,0.0,0.0,,0.3,14.8,2,1,0.0
4,1,2020-10-01 00:32:38,2020-10-01 00:43:02,N,1,82,7,1,2.9,10.5,0.5,0.5,0.0,0.0,,0.3,11.8,2,1,0.0


### Now create a function that reads the files for a given list of months

In [62]:
def download_and_store_dfs(input_url, months, *, nrows=100):
    """Read one gzipped CSV per month and return the DataFrames as a list.

    Each file URL is built as ``<input_url>-<month>.csv.gz``.

    Args:
        input_url: URL prefix shared by all files, i.e. everything that
            precedes the ``-<month>.csv.gz`` suffix.
        months: List or tuple of month strings, inserted verbatim into the
            URL (e.g. ``["10", "11", "12"]``).
        nrows: Maximum number of rows to read from each file. Defaults to
            100; ``None`` is passed through to ``pd.read_csv`` unchanged.

    Returns:
        A list with one DataFrame per entry in ``months``, in the same order.

    Raises:
        TypeError: If ``input_url`` is not a string, ``months`` is not a list
            or tuple, or any entry of ``months`` is not a string.
    """
    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``.
    if not isinstance(input_url, str):
        raise TypeError(
            f"input_url must be a str, got {type(input_url).__name__}"
        )
    if not isinstance(months, (list, tuple)):
        raise TypeError(
            f"months must be a list or tuple of str, got {type(months).__name__}"
        )
    # Validate every month up front so a bad entry fails before any download.
    for month in months:
        if not isinstance(month, str):
            raise TypeError(
                f"each month must be a str, got {type(month).__name__}"
            )

    return [
        # Read the CSV file directly from the URL into a DataFrame.
        pd.read_csv(f"{input_url}-{month}.csv.gz", compression="gzip", nrows=nrows)
        for month in months
    ]

In [65]:
# Test: read the October-December 2020 files
url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020"
dfs = download_and_store_dfs(input_url=url, months=["10", "11", "12"])

# Concatenate into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(objs=dfs, ignore_index=True)

# Display the combined result
combined_df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2020-10-01 00:31:19,2020-10-01 00:34:55,N,1,7,7,1,0.79,5.0,0.5,0.5,1.58,0.0,,0.3,7.88,1,1,0.00
1,2,2020-10-01 00:42:12,2020-10-01 00:43:51,N,1,179,7,1,0.50,4.0,0.5,0.5,0.00,0.0,,0.3,5.30,2,1,0.00
2,2,2020-10-01 00:53:09,2020-10-01 00:55:39,N,1,179,223,1,0.60,4.0,0.5,0.5,1.06,0.0,,0.3,6.36,1,1,0.00
3,1,2020-10-01 00:12:29,2020-10-01 00:20:08,N,1,134,216,2,4.40,13.5,0.5,0.5,0.00,0.0,,0.3,14.80,2,1,0.00
4,1,2020-10-01 00:32:38,2020-10-01 00:43:02,N,1,82,7,1,2.90,10.5,0.5,0.5,0.00,0.0,,0.3,11.80,2,1,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,2020-12-01 06:30:01,2020-12-01 06:32:32,N,1,152,152,1,0.50,4.0,0.0,0.5,0.00,0.0,,0.3,4.80,2,1,0.00
296,2,2020-12-01 06:28:27,2020-12-01 07:34:12,N,1,19,197,1,7.41,43.0,0.0,0.5,2.75,0.0,,0.3,46.55,1,1,0.00
297,2,2020-12-01 06:57:47,2020-12-01 07:07:56,N,1,42,168,1,2.02,9.5,0.0,0.5,0.00,0.0,,0.3,10.30,1,1,0.00
298,2,2020-12-01 06:49:51,2020-12-01 06:59:34,N,1,130,191,1,2.23,9.5,0.0,0.5,0.00,0.0,,0.3,10.30,2,1,0.00
