In [2]:
import pandas as pd
import sqlite3
import logging
from datetime import datetime
import requests

In [3]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
def extract_data(file_path='Program3_Traffic_Volume.csv'):
    """Read the raw traffic CSV into a DataFrame.

    Args:
        file_path: Path of the CSV file to read. Defaults to the file the
            notebook was originally written against, so existing
            ``extract_data()`` calls behave as before.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` if reading failed for any
        reason (the error is logged rather than raised so the pipeline cell
        can keep running).
    """
    try:
        df = pd.read_csv(file_path)
        logging.info("Data extracted successfully")
        return df
    except Exception as e:
        # Best-effort by design: downstream stages treat None as "no data".
        logging.error(f"Extraction failed: {e}")
        return None
    

In [5]:
def transform_data(df):
    """Rename, clean and enrich the raw traffic frame ahead of loading.

    Steps, in order:
      1. Check that every source column used below is present.
      2. Rename source columns to the target schema (``sensor_id``,
         ``vehicle_count``, ``speed_kmh``, ``timestamp``, ``location``).
      3. Drop rows whose ``sensor_id`` is null.
      4. Replace speeds outside the 0-200 range with the median speed.
      5. Add ``speed_mph``, fill missing timestamps, title-case ``location``
         and stamp ``processed_at``.

    Args:
        df: Raw DataFrame as returned by the extract step, or ``None``.

    Returns:
        A new, transformed DataFrame (the input is not modified), or ``None``
        when ``df`` is ``None``, a required column is missing, or any step
        raises.
    """
    if df is None:
        logging.error("No data to transform")
        return None
    try:
        # 'Unnamed: 0' and 'temp' are listed too because the code below reads
        # them (as sensor_id / speed_kmh) unconditionally; checking up front
        # gives a clear message instead of a bare KeyError.
        required_columns = ['Unnamed: 0', 'temp', 'date_time', 'weather_main',
                            'traffic_volume', 'weather_description']
        if not all(col in df.columns for col in required_columns):
            logging.error(f"Required columns {required_columns} not found. Available columns: {list(df.columns)}")
            return None

        # rename() returns a new frame, so the caller's DataFrame is untouched.
        df = df.rename(columns={'Unnamed: 0':'sensor_id',
                            'traffic_volume':'vehicle_count',
                            'temp':'speed_kmh',
                            'date_time':'timestamp',
                            'weather_main':'location'})

        if df['sensor_id'].isnull().any():
            logging.warning('Null sensor id found; dropping rows')
            df = df.dropna(subset=['sensor_id'])

        speeds = df['speed_kmh']
        invalid_speed = (speeds < 0) | (speeds > 200)
        if invalid_speed.any():
            logging.warning(f"Invalid speeds found in {invalid_speed.sum()} rows; setting to median")
            # Take the median from in-range readings only; including the
            # out-of-range values would bias the very number used to fix them.
            replacement = speeds[~invalid_speed].median()
            if pd.isna(replacement):
                # No in-range reading exists: keep the previous behaviour of
                # using the overall median so the column stays populated.
                replacement = speeds.median()
            # mask() upcasts the column if needed (e.g. int -> float).
            df['speed_kmh'] = speeds.mask(invalid_speed, replacement)

        # One clock reading so filled timestamps and processed_at always agree.
        now_text = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        df['speed_mph'] = df['speed_kmh'] * 0.621371  # km/h -> mph
        df['timestamp'] = df['timestamp'].fillna(now_text)
        df['location'] = df['location'].str.title()
        df['processed_at'] = now_text
        logging.info("Data transformed successfully")
        return df
    except Exception as e:
        # Best-effort by design: the load step treats None as "nothing to load".
        logging.error(f"Transformation failed: {e}")
        return None


In [6]:
def load_data(df, db_name="traffic.db", table_name="traffic", if_exists="append"):
    """Write a DataFrame into a SQLite table.

    Args:
        df: DataFrame to persist, or ``None`` (logged and skipped).
        db_name: Path of the SQLite database file.
        table_name: Name of the destination table.
        if_exists: Forwarded to ``DataFrame.to_sql``. The default ``"append"``
            keeps the original behaviour; pass ``"replace"`` to make repeated
            notebook runs idempotent instead of duplicating rows.

    Returns:
        None. Failures are logged, not raised.
    """
    if df is None:
        logging.error("No data to load")
        return
    try:
        conn = sqlite3.connect(db_name)
        try:
            df.to_sql(table_name, conn, if_exists=if_exists, index=False)
        finally:
            # Close even when to_sql raises so the database handle is not leaked.
            conn.close()
        logging.info(f"Data loaded to {db_name} in table {table_name}")
    except Exception as e:
        logging.error(f"Loading failed: {e}")

In [7]:
def verify_data(db_name="traffic.db", table_name="traffic"):
    """Read a whole SQLite table back into a DataFrame.

    Args:
        db_name: Path of the SQLite database file.
        table_name: Table to read. It is interpolated directly into the SQL
            text, so it must come from trusted code, not user input.

    Returns:
        A DataFrame with every row of ``table_name``, or ``None`` if the
        database or table could not be read (the error is logged).
    """
    try:
        conn = sqlite3.connect(db_name)
        try:
            loaded_df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
        finally:
            # Close even when the query fails (e.g. missing table) so the
            # database handle is not leaked.
            conn.close()
        logging.info("Data verification completed")
        return loaded_df
    except Exception as e:
        logging.error(f"Verification failed: {e}")
        return None


In [8]:
# Pipeline driver: extract -> transform -> load -> verify, echoing each stage.
logging.info("Starting ETL pipeline")

# Stage 1 - extract. The result is None when the CSV could not be read.
extracted_df = extract_data()
print("Extracted Data:\n", extracted_df)

# Stage 2 - transform. A None input (or a failed transform) yields None.
transformed_df = transform_data(df=extracted_df)
print("\nTransformed Data:\n", transformed_df)

# Stage 3 - load into the default database/table; a None frame is skipped.
load_data(df=transformed_df)

# Stage 4 - read the table back to see what is actually stored.
verified_df = verify_data()
print("\nLoaded Data from Database:\n", verified_df)

2025-09-11 10:37:53,446 - INFO - Starting ETL pipeline
2025-09-11 10:37:53,460 - INFO - Data extracted successfully
2025-09-11 10:37:53,469 - INFO - Data transformed successfully
2025-09-11 10:37:53,488 - INFO - Data loaded to traffic.db in table traffic
2025-09-11 10:37:53,524 - INFO - Data verification completed


Extracted Data:
       Unnamed: 0        holiday    temp  rain_1h  snow_1h  clouds_all  \
0          40255  New Years Day  249.36      0.0      0.0           1   
1          40256            NaN  249.08      0.0      0.0           1   
2          40257            NaN  248.86      0.0      0.0           1   
3          40258            NaN  248.72      0.0      0.0           1   
4          40259            NaN  248.43      0.0      0.0           1   
...          ...            ...     ...      ...      ...         ...   
7944       48199            NaN  283.45      0.0      0.0          75   
7945       48200            NaN  282.76      0.0      0.0          90   
7946       48201            NaN  282.73      0.0      0.0          90   
7947       48202            NaN  282.09      0.0      0.0          90   
7948       48203            NaN  282.12      0.0      0.0          90   

      weather_main     weather_description            date_time  year  month  \
0            Clear        