In [4]:
# 01_data_quality_check.ipynb
#importing required libraries/modules
import os
import pandas as pd

In [None]:
# Build the path to the raw dataset (relative to the current working directory)
DATA_DIR = os.path.join("..", "data", "raw")
FILE_NAME = "Banglore_traffic_dataset.csv"
file_path = os.path.join(DATA_DIR, FILE_NAME)
print("Loading file from:", file_path)

# Load the CSV, or stop with an explicit message when the path does not exist
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
else:
    raise FileNotFoundError(f"File not found at: {file_path}")

print("Shape (rows, columns):", df.shape)

# Preview of the first rows (rendered as the cell output)
df.head()

In [None]:
# High-level summary & missing values

# Summary statistics for the numeric columns, one row per column.
# A notebook cell only renders its LAST expression automatically, so this
# intermediate table must be printed explicitly or it is never shown.
print(df.describe().T)

# Missing value percentage per column, worst column first.
# isna() yields booleans (True -> 1, False -> 0), so the column mean is the
# fraction of missing entries; it is then scaled to a percentage.
missing_summary = (
    df.isna()
      .mean()
      .sort_values(ascending=False)
      .mul(100)
      .round(2)
      .to_frame(name="missing_pct")
)
missing_summary

In [None]:
# Duplicate checks

# Number of rows flagged by DataFrame.duplicated(), i.e. rows that exactly
# repeat an earlier row across all columns.
total_dupes = df.duplicated().sum()
print(f"Total fully-duplicated rows: {total_dupes}")

# Share of duplicated rows; an empty frame is reported as 0 to avoid a
# division by zero.
n_rows = len(df)
if n_rows > 0:
    dup_pct = total_dupes / n_rows * 100
else:
    dup_pct = 0
print(f"Duplicate row percentage: {dup_pct:.2f}%")

In [None]:
# Basic value sanity (numeric columns)

# "number" selects every numeric dtype (int32, float32, nullable Int64, ...),
# not only int64/float64, so no numeric column is silently skipped.
numeric_cols = df.select_dtypes(include="number").columns.to_list()

# One record per numeric column: dtype, five-number summary and missing count.
dq_numeric = [
    {
        "column": col,
        "dtype": str(series.dtype),
        "min": series.min(),
        "q1": series.quantile(0.25),
        "median": series.median(),
        "q3": series.quantile(0.75),
        "max": series.max(),
        "n_missing": series.isna().sum(),
    }
    for col, series in df[numeric_cols].items()
]

# Explicit column list keeps the table shape stable even when the dataset
# has no numeric columns at all.
pd.DataFrame(
    dq_numeric,
    columns=["column", "dtype", "min", "q1", "median", "q3", "max", "n_missing"],
)

In [None]:
# Column configuration: maps a logical role (timestamp, speed, ...) to the
# corresponding column name(s) of the dataset. "location" holds a list of
# column names; every other role maps to a single name.
COLUMNS = dict(
    timestamp="Date",
    speed="Average Speed",
    volume="Traffic Volume",
    location=["Area Name", "Road/Intersection Name"],
    lat="latitude",
    lon="longitude",
)
COLUMNS

In [None]:
# Parse timestamps
time_col = COLUMNS.get("timestamp")  # None when no timestamp role is configured

if time_col and time_col in df.columns:
    raw_ts = df[time_col]
    # errors="coerce" turns unparseable values into NaT instead of raising.
    # NOTE(review): no explicit format/dayfirst is given, so parsing relies on
    # pandas' inference -- confirm it matches the dataset's date layout.
    parsed_ts = pd.to_datetime(raw_ts, errors="coerce")

    # A genuine parse failure is a value that was present in the raw column
    # but became NaT; values that were already missing are counted separately
    # so they do not inflate the failure count.
    # (If this cell is re-run on an already-parsed column, earlier failures
    # are NaT in the input and therefore show up as "already missing".)
    bad_ts = int((parsed_ts.isna() & raw_ts.notna()).sum())
    n_missing_ts = int(raw_ts.isna().sum())

    df[time_col] = parsed_ts
    print(f"Failed to parse {bad_ts} timestamps out of {len(df)} rows.")
    print(f"Timestamps already missing before parsing: {n_missing_ts}")

    # Extract useful time parts
    df["hour"] = df[time_col].dt.hour
    df["dayofweek"] = df[time_col].dt.dayofweek
    df["date"] = df[time_col].dt.date
else:
    print("No valid timestamp column configured. Update COLUMNS['timestamp'].")

In [None]:
# Speed & volume sanity checks
speed_col = COLUMNS.get("speed")
vol_col = COLUMNS.get("volume")

# --- Speed ---
if not (speed_col and speed_col in df.columns):
    print("Speed column not configured / not found.")
else:
    speeds = df[speed_col]
    print("Speed distribution summary:")
    print(speeds.describe())

    # Anomaly counts: negative speeds, and speeds above 150
    # (assuming >150 km/h is unrealistic)
    n_negative_speed = speeds.lt(0).sum()
    n_too_high_speed = speeds.gt(150).sum()

    print(f"\nNegative speed values: {n_negative_speed}")
    print(f"Speed >150 km/h: {n_too_high_speed}")


# --- Volume ---
if not (vol_col and vol_col in df.columns):
    print("Volume column not configured / not found.")
else:
    volumes = df[vol_col]
    print("\nVolume distribution summary:")
    print(volumes.describe())

    # A traffic count below zero is treated as an anomaly
    n_negative_vol = volumes.lt(0).sum()
    print(f"\nNegative traffic volume values: {n_negative_vol}")

In [None]:
# Geo sanity
lat_col = COLUMNS.get("lat")
lon_col = COLUMNS.get("lon")

# Same guard as the other checks: the role must be configured AND the
# configured column must actually exist in the data.
has_lat = bool(lat_col) and lat_col in df.columns
has_lon = bool(lon_col) and lon_col in df.columns

if has_lat and has_lon:
    print("Latitude summary:")
    print(df[lat_col].describe())

    print("\nLongitude summary:")
    print(df[lon_col].describe())

    # Bengaluru rough bounds (approx)
    lat_min, lat_max = 12.7, 13.2
    lon_min, lon_max = 77.4, 77.9

    # Rows whose coordinates fall outside the bounding box on any side
    out_of_bounds = df[
        (df[lat_col] < lat_min)
        | (df[lat_col] > lat_max)
        | (df[lon_col] < lon_min)
        | (df[lon_col] > lon_max)
    ]
    print(f"\nPoints outside Bengaluru bounding box: {len(out_of_bounds)}")

    # NaN never satisfies < or >, so rows without coordinates are not part of
    # the count above; report them separately instead of hiding them.
    n_missing_coords = (df[lat_col].isna() | df[lon_col].isna()).sum()
    print(f"Rows with missing latitude/longitude: {n_missing_coords}")
else:
    print("Lat/lon columns not configured / not found. Skipping geo sanity checks.")