In [1]:
import pandas as pd
import os
import requests
from tqdm import tqdm

def download_nyc_taxi_data(year: int, month: int, types=None, timeout: float = 60.0):
    """Download one month of NYC taxi trip data and load it into a single DataFrame.

    For every requested trip type the matching parquet file is fetched from the
    CloudFront bucket into the local ``nyc_taxi_all`` directory (relative to the
    current working directory) unless a file with that name is already there.
    Every file that can be read is tagged with a ``source_type`` column and all
    of them are concatenated.

    Args:
        year: Year used in the file name, e.g. ``2022``.
        month: Month number; formatted as two digits in the file name.
        types: Iterable of trip-type prefixes. Defaults to
            ``['yellow', 'green', 'fhv', 'fhvhv']`` when ``None``.
        timeout: Timeout in seconds passed to ``requests.get`` so that a
            stalled connection cannot hang the call forever.

    Returns:
        A ``pandas.DataFrame`` with the rows of all successfully loaded files
        (index reset), or an empty DataFrame when nothing could be loaded.

    Notes:
        A trip type is skipped (with a printed message) when the server answers
        with a non-200 status, when the transfer fails, or when the local file
        cannot be read. Downloads are written to a ``.part`` file first and only
        renamed to the final name once complete, so an interrupted transfer
        never leaves a truncated file that a later call would treat as cached.
    """
    if types is None:
        types = ['yellow', 'green', 'fhv', 'fhvhv']

    base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    local_dir = "nyc_taxi_all"
    os.makedirs(local_dir, exist_ok=True)
    dfs = []

    for t in types:
        fname = f"{t}_tripdata_{year}-{month:02d}.parquet"
        url = base_url + fname
        local_path = os.path.join(local_dir, fname)

        # Download only when the file is not cached locally yet.
        if not os.path.exists(local_path):
            print(f"Скачиваем {fname}...")
            # Stream into a temporary sibling file; it is promoted to the real
            # name only after the whole body has been written.
            part_path = local_path + ".part"
            try:
                # The context manager releases the streamed connection on
                # every exit path, including the non-200 one.
                with requests.get(url, stream=True, timeout=timeout) as response:
                    if response.status_code != 200:
                        print(f"[!] Пропущен {t}: HTTP {response.status_code}")
                        continue
                    with open(part_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(part_path, local_path)
            except (requests.RequestException, OSError) as e:
                # Treat transport/disk failures like an HTTP error: skip this
                # type and keep going with the remaining ones.
                print(f"[!] Пропущен {t}: {e}")
                continue
            finally:
                # Drop whatever partial data is left after a failed transfer.
                if os.path.exists(part_path):
                    os.remove(part_path)

        # Load the (cached or freshly downloaded) file.
        try:
            df = pd.read_parquet(local_path)
            df['source_type'] = t  # remember which dataset each row came from
            dfs.append(df)
        except Exception as e:
            print(f"[!] Ошибка чтения {fname}: {e}")

    if dfs:
        all_df = pd.concat(dfs, ignore_index=True)
        print(f"Всего строк загружено: {len(all_df):,}")
        return all_df
    else:
        print("Данные не загружены.")
        return pd.DataFrame()

In [2]:
# Fetch every default trip type for January 2022 and preview the first rows.
df_all = download_nyc_taxi_data(year=2022, month=1)
df_all.head()

Скачиваем yellow_tripdata_2022-01.parquet...
Скачиваем green_tripdata_2022-01.parquet...
Скачиваем fhv_tripdata_2022-01.parquet...
Скачиваем fhvhv_tripdata_2022-01.parquet...
Всего строк загружено: 18,421,708


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tolls,bcf,sales_tax,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,1.0,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142.0,236.0,1.0,...,,,,,,,,,,
1,1.0,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236.0,42.0,1.0,...,,,,,,,,,,
2,2.0,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166.0,166.0,1.0,...,,,,,,,,,,
3,2.0,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114.0,68.0,2.0,...,,,,,,,,,,
4,2.0,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68.0,163.0,1.0,...,,,,,,,,,,


In [1]:
!pip install kaggle


Collecting kaggle
  Downloading kaggle-1.7.4.5-py3-none-any.whl.metadata (16 kB)
Collecting bleach (from kaggle)
  Downloading bleach-6.3.0-py3-none-any.whl.metadata (31 kB)
Collecting protobuf (from kaggle)
  Downloading protobuf-6.33.2-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting python-slugify (from kaggle)
  Using cached python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode (from kaggle)
  Using cached text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Collecting webencodings (from kaggle)
  Using cached webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading kaggle-1.7.4.5-py3-none-any.whl (181 kB)
Downloading bleach-6.3.0-py3-none-any.whl (164 kB)
Downloading protobuf-6.33.2-cp39-abi3-macosx_10_9_universal2.whl (427 kB)
Using cached python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Using cached text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Using cached webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
In

In [4]:

# Download the required dataset (historical hourly weather) with the Kaggle CLI.
# NOTE(review): the recorded output of this cell is "kaggle: not found", i.e. the
# kaggle executable was not on the PATH of the shell that ran it — confirm the CLI
# is installed in the environment this cell's shell uses.
# Presumably -p / --unzip place the extracted files under weather_data/ — verify.
!kaggle datasets download -d selfishgene/historical-hourly-weather-data -p weather_data --unzip

/usr/bin/sh: 1: kaggle: not found
