In [1]:

pip install faker 


Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.9 MB ? eta -:--:--
   ---------------- ----------------------- 0.8/1.9 MB 2.7 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/1.9 MB 2.8 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 2.5 MB/s  0:00:00
Installing collected packages: faker
Successfully installed faker-37.5.3
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta


# Fixed seed so that every Restart & Run All regenerates the same synthetic dataset.
SEED = 42
random.seed(SEED)  # seasonal_value() and the generation loop draw from the global `random` module

fake = Faker()
Faker.seed(SEED)  # keeps any values drawn from `fake` reproducible as well


# Cities to simulate; the coordinates are copied onto every generated row.
cities = [
    {"city": "Washington", "lat": 38.9072, "lon": -77.0369},
    {"city": "Los Angeles", "lat": 34.0522, "lon": -118.2437},
    {"city": "Chicago", "lat": 41.8781, "lon": -87.6298},
    {"city": "Miami", "lat": 25.7617, "lon": -80.1918},
    {"city": "Phoenix", "lat": 33.4484, "lon": -112.0740},
]

# Simulation window (both end points are included by pd.date_range).
start_date = datetime(2019, 1, 1)
end_date = datetime(2023, 12, 31)


# One timestamp per calendar day in the window.
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

def seasonal_value(base, variation, month):
    """Simulate one reading that follows a yearly sinusoidal cycle plus uniform noise.

    Args:
        base: Baseline level that the seasonal factor scales.
        variation: Half-width of the uniform noise added to the scaled baseline.
        month: Calendar month number; month 1 puts the sine term at zero.

    Returns:
        The noisy seasonal value, floored at 0 and rounded to 2 decimals.
    """
    phase = (month - 1) / 12 * 2 * np.pi
    seasonal_factor = 1 + 0.2 * np.sin(phase)  # stays within 0.8x .. 1.2x of base
    noise = random.uniform(-variation, variation)
    noisy_level = base * seasonal_factor + noise
    floored = np.clip(noisy_level, 0, None)  # negative results are clamped to zero
    return round(floored, 2)


# (column name, base level, noise half-width) per pollutant, listed in output column order.
pollutant_specs = (
    ("pm25", 12, 5),
    ("pm10", 25, 10),
    ("o3", 30, 12),
    ("no2", 20, 8),
    ("so2", 5, 2),
    ("co", 0.5, 0.2),
)

records = []

for city in cities:
    for date in date_range:
        month = date.month

        # Pollutants are drawn in spec order, one seasonal_value() call each.
        readings = {
            name: seasonal_value(base, spread, month)
            for name, base, spread in pollutant_specs
        }

        # Simplified AQI proxy: the largest of PM2.5, half of PM10, O3 and NO2,
        # plus uniform jitter, truncated to an int and kept within 0..500.
        dominant = max(
            readings["pm25"],
            readings["pm10"] / 2,
            readings["o3"],
            readings["no2"],
        )
        aqi = np.clip(int(dominant + random.uniform(-5, 5)), 0, 500)

        # Weather covariates; only temperature goes through the seasonal cycle.
        temp = seasonal_value(15, 10, month)  # °C
        humidity = round(random.uniform(30, 90), 1)  # %
        wind_speed = round(random.uniform(0.5, 10), 1)  # m/s

        records.append({
            "date": date.strftime("%Y-%m-%d"),
            "city": city["city"],
            "latitude": city["lat"],
            "longitude": city["lon"],
            **readings,
            "aqi": aqi,
            "temperature_c": temp,
            "humidity_percent": humidity,
            "wind_speed_mps": wind_speed,
            "month": month,
            "day": date.day,
        })


# Materialise all rows at once and persist them next to the notebook's working directory.
df = pd.DataFrame(records)
df.to_csv("Air_Quality_Data.csv", index=False)

print(f"Dataset generated: {df.shape[0]} rows, {df.shape[1]} columns")
df.head()



Dataset generated: 9130 rows, 16 columns


Unnamed: 0,date,city,latitude,longitude,pm25,pm10,o3,no2,so2,co,aqi,temperature_c,humidity_percent,wind_speed_mps,month,day
0,2019-01-01,Washington,38.9072,-77.0369,14.78,21.95,18.96,17.32,3.16,0.42,16,19.87,73.6,4.4,1,1
1,2019-01-02,Washington,38.9072,-77.0369,16.93,19.94,23.33,20.76,4.36,0.38,21,23.59,85.3,1.9,1,2
2,2019-01-03,Washington,38.9072,-77.0369,10.49,28.68,35.7,12.64,4.97,0.38,34,10.47,81.5,3.4,1,3
3,2019-01-04,Washington,38.9072,-77.0369,13.26,26.76,37.11,13.01,5.45,0.57,41,20.34,63.2,4.8,1,4
4,2019-01-05,Washington,38.9072,-77.0369,15.19,26.45,18.18,14.92,6.13,0.45,20,15.64,77.4,5.6,1,5


In [8]:
import os 

In [9]:
%pwd

'd:\\SAMITH\\Github\\Air-Quality-Health-Alert-System\\research'

In [10]:
# Step up to the project root only while the kernel is still inside research/,
# so re-running this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [11]:
%pwd

'd:\\SAMITH\\Github\\Air-Quality-Health-Alert-System'

In [27]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion step."""

    root_dir: Path  # directory that ConfigurationManager creates for this step
    source_URL: str  # URL that DataIngestion.download_file fetches
    local_data_file: Path  # destination path of the downloaded file
   


In [28]:
from Air_Quality_Health_Alert_System.constants import  *
from Air_Quality_Health_Alert_System.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """Reads the project's YAML config and params files and builds per-stage settings."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the YAML configuration file.
            params_filepath: Location of the YAML parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion root directory and return the ingestion settings.

        Returns:
            A DataIngestionConfig filled from the ``data_ingestion`` section of the config.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
        )

In [30]:
import os
import urllib.request as request
from Air_Quality_Health_Alert_System import logger
from Air_Quality_Health_Alert_System.utils.common import get_size

In [31]:
class DataIngestion:
    """Fetches the raw dataset described by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings (source URL and local target path)."""
        self.config = config

    def download_file(self) -> str:
        """Download the source file unless it is already present locally.

        The payload is written to a sibling ``<name>.part`` file first and moved onto
        ``local_data_file`` only after the transfer has finished. An interrupted
        download therefore cannot leave a truncated file that a later run would
        mistake for a complete dataset and skip.

        Returns:
            The local file path as a string.

        Raises:
            Exception: Whatever the download or the final move raised; it is logged
                and re-raised unchanged.
        """
        local_path = Path(self.config.local_data_file)

        # Ensure the parent directory exists
        local_path.parent.mkdir(parents=True, exist_ok=True)

        if local_path.exists():
            logger.info(f"File already exists: {local_path}")
            return str(local_path)

        partial_path = local_path.with_name(local_path.name + ".part")
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=str(partial_path)
            )
            # Publish the file only once the whole payload is on disk.
            partial_path.replace(local_path)
        except Exception as e:
            logger.error(f"Failed to download file: {e}")
            # Remove the incomplete temp file so nothing stale is left behind.
            if partial_path.exists():
                partial_path.unlink()
            raise

        filename = str(local_path)
        logger.info(f"{filename} downloaded with headers: \n{headers}")
        return filename

In [32]:
# Run the data-ingestion stage: load the configuration, build the stage settings and
# download the file. Any exception propagates to the notebook with its original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
# Assigned to a name so the returned path is kept and the cell shows no extra output.
downloaded_file = data_ingestion.download_file()

[2025-08-14 23:14:54,333: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-08-14 23:14:54,337: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-14 23:14:54,342: INFO: common: created directory at: artifacts]
[2025-08-14 23:14:54,344: INFO: common: created directory at: artifacts/data_ingestion]
[2025-08-14 23:14:54,347: INFO: 3398899755: File already exists: artifacts\data_ingestion\data\Air_Quality_Data.csv]
