In [3]:
import io
import pandas as pd
import requests

def load_data_from_api(year, month):
    """
    Load one month of NYC green-taxi trip data from the DataTalksClub
    GitHub release into a DataFrame.

    Args:
        year: Calendar year of the file (e.g. 2020); interpolated into the
            URL as-is.
        month: Calendar month, 1-12, as an int or a numeric string. It is
            zero-padded to two digits so that e.g. ``1`` becomes ``01``;
            without padding the URL for months before October would not
            match the ``YYYY-MM`` file names.

    Returns:
        pandas.DataFrame read from the gzip-compressed CSV, with the two
        pickup/dropoff columns parsed as datetimes and the remaining columns
        typed according to ``taxi_dtypes``.

    Raises:
        ValueError: if ``month`` is not a number in the range 1-12.
    """
    month_number = int(month)
    if not 1 <= month_number <= 12:
        raise ValueError(f'month must be between 1 and 12, got {month!r}')

    url = (
        'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/'
        f'green_tripdata_{year}-{month_number:02d}.csv.gz'
    )
    print(url)

    # Nullable Int64 is used for integer-like columns so that missing values
    # do not force the whole column to float.
    taxi_dtypes = {
        'VendorID': pd.Int64Dtype(),
        'store_and_fwd_flag': str,
        'RatecodeID': pd.Int64Dtype(),
        'PULocationID': pd.Int64Dtype(),
        'DOLocationID': pd.Int64Dtype(),
        'passenger_count': pd.Int64Dtype(),
        'trip_distance': float,
        'fare_amount': float,
        'extra': float,
        'mta_tax': float,
        'tip_amount': float,
        'tolls_amount': float,
        'ehail_fee': float,
        'improvement_surcharge': float,
        'total_amount': float,
        'payment_type': pd.Int64Dtype(),
        'trip_type': pd.Int64Dtype(),
        'congestion_surcharge': float,
    }

    # Let pandas parse the timestamp columns while reading.
    parse_dates = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']

    return pd.read_csv(
        url, sep=',', compression='gzip', dtype=taxi_dtypes, parse_dates=parse_dates
    )


def download_data(year=2020, months=(10, 11, 12)):
    """
    Load several monthly green-taxi files and stack them into one DataFrame.

    Args:
        year: Year passed to ``load_data_from_api`` for every month.
            Defaults to 2020.
        months: Iterable of month numbers to load, in order. Defaults to
            October-December, so calling ``download_data()`` with no
            arguments loads the same three files as before.

    Returns:
        pandas.DataFrame with the monthly frames concatenated in the order
        given and a fresh 0..n-1 index.

    Raises:
        ValueError: from ``pd.concat`` if ``months`` is empty.
    """
    monthly_frames = [
        load_data_from_api(month=month, year=year) for month in months
    ]

    return pd.concat(monthly_frames, ignore_index=True)

In [4]:
# Load the monthly files through download_data() and preview the first rows.
df = download_data()
df.head()

https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-10.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-11.csv.gz
https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-12.csv.gz


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2020-10-01 00:31:19,2020-10-01 00:34:55,N,1,7,7,1,0.79,5.0,0.5,0.5,1.58,0.0,,0.3,7.88,1,1,0.0
1,2,2020-10-01 00:42:12,2020-10-01 00:43:51,N,1,179,7,1,0.5,4.0,0.5,0.5,0.0,0.0,,0.3,5.3,2,1,0.0
2,2,2020-10-01 00:53:09,2020-10-01 00:55:39,N,1,179,223,1,0.6,4.0,0.5,0.5,1.06,0.0,,0.3,6.36,1,1,0.0
3,1,2020-10-01 00:12:29,2020-10-01 00:20:08,N,1,134,216,2,4.4,13.5,0.5,0.5,0.0,0.0,,0.3,14.8,2,1,0.0
4,1,2020-10-01 00:32:38,2020-10-01 00:43:02,N,1,82,7,1,2.9,10.5,0.5,0.5,0.0,0.0,,0.3,11.8,2,1,0.0


In [4]:
import requests
import gzip
import io
import pandas as pd
from io import BytesIO

dfs = []
# Download each monthly gzip-compressed CSV (October-December 2020)
for month in [10, 11, 12]:
    url = f'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-{month}.csv.gz'

    # Send an HTTP GET request to the URL. The timeout (seconds) keeps the
    # cell from hanging indefinitely if the server stops responding.
    response = requests.get(url, timeout=60)

    # Check if the request was successful
    if response.status_code == 200:
        # Decompress the gzip payload held in memory
        with gzip.open(BytesIO(response.content)) as gzip_file:
            # low_memory=False makes pandas infer each column's dtype from
            # the whole file instead of chunk by chunk, which avoids the
            # warning this read emitted (presumably a mixed-type DtypeWarning).
            df = pd.read_csv(gzip_file, low_memory=False)
            # Append the DataFrame to the list
            dfs.append(df)
    else:
        # Best effort: report the failed month and continue with the rest
        print(f'Failed to download the file from {url}')


# Concatenate all DataFrames in the list into one DataFrame
final_df = pd.concat(dfs, ignore_index=True)

# Now you can work with the final DataFrame 'final_df'
print(final_df.head())

  df = pd.read_csv(gzip_file)
  df = pd.read_csv(gzip_file)


   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0       2.0  2020-10-01 00:31:19   2020-10-01 00:34:55                  N   
1       2.0  2020-10-01 00:42:12   2020-10-01 00:43:51                  N   
2       2.0  2020-10-01 00:53:09   2020-10-01 00:55:39                  N   
3       1.0  2020-10-01 00:12:29   2020-10-01 00:20:08                  N   
4       1.0  2020-10-01 00:32:38   2020-10-01 00:43:02                  N   

   RatecodeID  PULocationID  DOLocationID  passenger_count  trip_distance  \
0         1.0             7             7              1.0           0.79   
1         1.0           179             7              1.0           0.50   
2         1.0           179           223              1.0           0.60   
3         1.0           134           216              2.0           4.40   
4         1.0            82             7              1.0           2.90   

   fare_amount  extra  mta_tax  tip_amount  tolls_amount  ehail_fee  \
0  

  df = pd.read_csv(gzip_file)


In [10]:
# Work on a copy so that columns added to df do not modify final_df.
df = final_df.copy()

In [6]:
# Show the dtype of every column in the concatenated frame.
print(final_df.dtypes)

VendorID                 float64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID               float64
PULocationID               int64
DOLocationID               int64
passenger_count          float64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type             float64
trip_type                float64
congestion_surcharge     float64
dtype: object


In [11]:
# Convert the pickup timestamps once, then store both the datetime column
# and a date-only column derived from it.
pickup_timestamps = pd.to_datetime(df['lpep_pickup_datetime'])
df['lpep_pickup_datetime'] = pickup_timestamps
df['lpep_pickup_date'] = pickup_timestamps.dt.date

In [12]:
# Preview two rows to confirm the new lpep_pickup_date column is present.
df.head(2)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,lpep_pickup_date
0,2.0,2020-10-01 00:31:19,2020-10-01 00:34:55,N,1.0,7,7,1.0,0.79,5.0,...,0.5,1.58,0.0,,0.3,7.88,1.0,1.0,0.0,2020-10-01
1,2.0,2020-10-01 00:42:12,2020-10-01 00:43:51,N,1.0,179,7,1.0,0.5,4.0,...,0.5,0.0,0.0,,0.3,5.3,2.0,1.0,0.0,2020-10-01


In [17]:
# Count distinct pickup dates. NOTE(review): October-December has only 92
# calendar days, so a larger count presumably means some rows carry pickup
# timestamps outside the quarter - confirm before relying on this number.
df['lpep_pickup_date'].nunique()

96