## **Advanced Analyitcs and Applications - Data Collection Strategy**

Desciption 

#### Team: 
- Robin Reiners
- Saied Farham Nia

##### **Table of Contents**

0. [Notebook Setup](#Notebook-Set-Up-and-Imports)
1. [Introduction](#Introduction)

7. [References](#References)

##### **Notebook Set Up and Imports**

In [1]:
%%html
<style>
.dataframe th {
    font-family: "JetBrainsMono Nerd Font";
}
.dataframe td {
    font-family: "JetBrainsMono Nerd Font";
}
</style>

In [2]:
import importlib
import os
import pickle
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import yaml

In [3]:
sys.path.append(str(Path.cwd().parent))
from src.utils.notebook_setup import load_files, setup_notebook

style_manager = setup_notebook()

if str(Path().resolve()).split("/")[-1] == "AAA":
    print("already set repo root")
else:
    notebooks_dir = Path().resolve()
    repo_root = notebooks_dir.parent
    config_dir = repo_root / "config"
    data_dir = repo_root / "data"
    results_dir = data_dir / "results"
    raw_data_dir = data_dir / "raw"
    processed_data_dir = data_dir / "processed"

    with open(config_dir / "config.yaml", "r") as file:
        config = yaml.safe_load(file)
    
    os.chdir(repo_root)

## Introduction
[Back to Table of Contents](#Table-of-Contents)

In [4]:
from src.api import taxi
importlib.reload(taxi)

<module 'src.api.taxi' from '/workspaces/AAA/src/api/taxi.py'>

In [11]:
from src.api.taxi import ChicagoTaxiAPI

api = ChicagoTaxiAPI(
    app_token=config["CHICAGO_TAXI"].get("APP_TOKEN", None)
)

df_sample = api.fetch_data(
    select = (
        "trip_id, taxi_id, trip_start_timestamp, trip_end_timestamp, trip_seconds, "
        "trip_miles, pickup_census_tract, dropoff_census_tract, pickup_community_area, "
        "dropoff_community_area, fare, tips, tolls, extras, trip_total, payment_type, "
        "company, pickup_centroid_location, dropoff_centroid_location"
    ),
    where=(
        "pickup_centroid_location IS NOT NULL "
        "AND dropoff_centroid_location IS NOT NULL "
        "AND trip_start_timestamp >= trip_end_timestamp "
        "AND trip_start_timestamp IS NOT NULL"
    ), 
    order="trip_start_timestamp DESC",
    limit=300_000,
)

api.close()

In [None]:
df_batch_sample = api.fetch_batch_data(
    select = (
        "trip_id, taxi_id, trip_start_timestamp, trip_end_timestamp, trip_seconds, "
        "trip_miles, pickup_census_tract, dropoff_census_tract, pickup_community_area, "
        "dropoff_community_area, fare, tips, tolls, extras, trip_total, payment_type, "
        "company, pickup_centroid_location, dropoff_centroid_location"
    ),
    where=(
        "pickup_centroid_location IS NOT NULL "
        "AND dropoff_centroid_location IS NOT NULL "
        "AND trip_start_timestamp >= trip_end_timestamp "
        "AND trip_start_timestamp IS NOT NULL"
    ), 
    output_dir=raw_data_dir
)

Estimated total records to fetch: 1917889


Fetching all data:  89%|████████▊ | 1700000/1917889 [04:52<00:36, 5960.09rec/s]

In [None]:
from src.api.weather import ChicagoWeatherAPI
from datetime import date

weather_api = ChicagoWeatherAPI()

try:
    # --- Example 1: Get historical weather for a few days ---
    print("\n--- Example 1: Fetching historical weather for last week ---")
    today = date.today()
    seven_days_ago = today - pd.Timedelta(days=7)
    one_day_ago = today - pd.Timedelta(days=1)

    historical_df = weather_api.get_historical_weather(
        start_date=seven_days_ago.strftime("%Y-%m-%d"),
        end_date=one_day_ago.strftime("%Y-%m-%d"), # Open-Meteo historical is up to 2 days ago
        hourly_vars=["temperature_2m", "precipitation", "weather_code", "wind_speed_10m"],
        daily_vars=["temperature_2m_max", "temperature_2m_min", "precipitation_sum"] # request both
    )

    if historical_df is not None:
        print(f"Fetched {len(historical_df)} hourly historical records:")
        print(historical_df.head())
        # If you want to see the daily data, you'd need to adapt the method to return it
        # or make a separate call specifically for daily data if the API structures it separately
        # For Open-Meteo, if you request both hourly and daily, the hourly data is typically primary in the response structure.
        # The current implementation prioritizes hourly.
        # Let's try fetching only daily for a clearer example:
        daily_historical_df = weather_api.get_historical_weather(
            start_date=seven_days_ago.strftime("%Y-%m-%d"),
            end_date=one_day_ago.strftime("%Y-%m-%d"),
            hourly_vars=[], # No hourly
            daily_vars=["temperature_2m_max", "temperature_2m_min", "precipitation_sum", "weather_code"]
        )
        if daily_historical_df is not None:
            print(f"\nFetched {len(daily_historical_df)} daily historical records:")
            print(daily_historical_df.head())


    else:
        print("Failed to fetch historical weather data.")

    # --- Example 2: Get 3-day weather forecast ---
    print("\n--- Example 2: Fetching 3-day weather forecast ---")
    forecast_df = weather_api.get_forecast_weather(
        days=3,
        hourly_vars=["temperature_2m", "apparent_temperature", "precipitation_probability"],
        daily_vars=["sunrise", "sunset", "uv_index_max"]
    )
    if forecast_df is not None:
        print(f"Fetched {len(forecast_df)} hourly forecast records:")
        print(forecast_df.head())
        # Similar to historical, current implementation prioritizes hourly for return.
        # You could modify it to return both daily and hourly if needed, e.g., as a dict of DataFrames.
    else:
        print("Failed to fetch forecast weather data.")
        
    # --- Example 3: Get specific historical variable (e.g., only daily max temp) ---
    print("\n--- Example 3: Fetching only daily max temperature for 5 days ---")
    five_days_ago = today - pd.Timedelta(days=5)
    max_temp_df = weather_api.get_historical_weather(
        start_date=five_days_ago.strftime("%Y-%m-%d"),
        end_date=one_day_ago.strftime("%Y-%m-%d"),
        hourly_vars=[], # Explicitly no hourly
        daily_vars=["temperature_2m_max"]
    )
    if max_temp_df is not None:
        print(f"Fetched daily max temperatures:")
        print(max_temp_df)
    else:
        print("Failed to fetch daily max temperatures.")


finally:
    weather_api.close()