In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import os

In [9]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [11]:
# Download the urban-noise dataset from Kaggle. The returned value is used
# below as the directory that contains the CSV file.
dataset_handle = "khushikyad001/urban-noise-levels"
path = kagglehub.dataset_download(dataset_handle)

Downloading from https://www.kaggle.com/api/v1/datasets/download/khushikyad001/urban-noise-levels?dataset_version_number=1...


100%|██████████| 170k/170k [00:00<00:00, 186kB/s]

Extracting files...





In [15]:
# Build the location of the CSV file inside the downloaded dataset folder.
csv_filename = "urban_noise_levels.csv"
csv_path = os.path.join(path, csv_filename)

In [16]:
# Load the raw measurements; `df` is kept untouched and later cells work on copies.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [17]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(2000, 26)

In [18]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,id,latitude,longitude,datetime,decibel_level,hour,day_of_week,is_weekend,temperature_c,humidity_%,...,population_density,park_proximity,industrial_zone,vehicle_count,honking_events,public_event,holiday,school_zone,noise_complaints,sensor_id
0,1,40.649816,-74.069147,2023-04-01 18:50:00,78.286408,18,5,1,16.710979,43.054747,...,28384,0,0,24,1,0,0,1,1,31
1,2,40.880286,-74.076511,2023-03-18 04:12:00,68.131845,4,5,1,10.088749,29.676246,...,24206,0,0,15,2,0,0,0,0,40
2,3,40.792798,-73.746873,2023-12-03 05:43:00,58.934966,5,6,1,19.911075,47.932832,...,1894,1,0,15,2,0,0,0,0,41
3,4,40.739463,-74.075227,2023-06-20 00:44:00,69.559042,0,1,0,3.47752,88.59401,...,2265,1,0,24,4,0,0,1,2,50
4,5,40.562407,-74.064025,2023-12-28 12:06:00,60.409097,12,3,0,17.356303,35.811699,...,23807,1,0,25,2,0,0,0,0,8


In [19]:
# Summary of column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  2000 non-null   int64  
 1   latitude            2000 non-null   float64
 2   longitude           2000 non-null   float64
 3   datetime            2000 non-null   object 
 4   decibel_level       2000 non-null   float64
 5   hour                2000 non-null   int64  
 6   day_of_week         2000 non-null   int64  
 7   is_weekend          2000 non-null   int64  
 8   temperature_c       2000 non-null   float64
 9   humidity_%          2000 non-null   float64
 10  wind_speed_kmh      2000 non-null   float64
 11  precipitation_mm    2000 non-null   float64
 12  traffic_density     2000 non-null   int64  
 13  near_airport        2000 non-null   int64  
 14  near_highway        2000 non-null   int64  
 15  near_construction   2000 non-null   int64  
 16  popula

In [21]:
# Number of missing values per column, listed from most to fewest gaps.
null_counts = df.isna().sum()
missing = null_counts.sort_values(ascending=False)
missing

id                    0
latitude              0
longitude             0
datetime              0
decibel_level         0
hour                  0
day_of_week           0
is_weekend            0
temperature_c         0
humidity_%            0
wind_speed_kmh        0
precipitation_mm      0
traffic_density       0
near_airport          0
near_highway          0
near_construction     0
population_density    0
park_proximity        0
industrial_zone       0
vehicle_count         0
honking_events        0
public_event          0
holiday               0
school_zone           0
noise_complaints      0
sensor_id             0
dtype: int64

In [22]:
# Work on an independent copy so the raw frame `df` stays unchanged.
df2 = df.copy(deep=True)

In [None]:
# Convert the timestamp column (when present) to datetimes. errors='coerce'
# turns unparseable values into NaT, so they are counted rather than raising.
if "datetime" in df2.columns:
    parsed_datetime = pd.to_datetime(df2['datetime'], errors='coerce')
    df2['datetime'] = parsed_datetime
    n_unparsed = df2['datetime'].isnull().sum()
    print("datetime nulls after parse:", n_unparsed)

datetime nulls after parse: 0


In [24]:
# Cast the calendar-style columns to pandas' nullable integer dtype.
# Values that cannot be read as numbers become <NA> (errors='coerce').
int_cols = ['hour', 'day_of_week', 'is_weekend']
for c in int_cols:
    if c not in df2.columns:
        continue
    numeric_values = pd.to_numeric(df2[c], errors='coerce')
    df2[c] = numeric_values.astype('Int64')

In [None]:
# Count rows that repeat an earlier row across all columns.
duplicate_flags = df2.duplicated()
dups = duplicate_flags.sum()
print("Duplicate rows (exact):", dups)

In [25]:
# Flag coordinates that are missing or outside the valid geographic ranges
# (latitude in [-90, 90], longitude in [-180, 180]).
if {'latitude', 'longitude'}.issubset(df2.columns):
    lat_values = df2['latitude']
    lon_values = df2['longitude']
    bad_lat = lat_values.isnull() | ~lat_values.between(-90, 90)
    bad_lon = lon_values.isnull() | ~lon_values.between(-180, 180)
    print("Bad latitude count:", bad_lat.sum(), "Bad longitude count:", bad_lon.sum())

Bad latitude count: 0 Bad longitude count: 0


In [26]:
# Report the observed decibel range and count readings that are missing or
# outside the 0-200 window used as the sanity bound here.
if 'decibel_level' in df2.columns:
    decibels = df2['decibel_level']
    print("decibel min/max:", decibels.min(), decibels.max())
    bad_db = decibels.isnull() | ~decibels.between(0, 200)
    print("Bad decibel count:", bad_db.sum())

decibel min/max: 33.23296186803999 97.43092969594731
Bad decibel count: 0


In [27]:
# Inclusive (low, high) sanity bounds for each weather column.
valid_ranges = {
    'temperature_c': (-50, 60),
    'humidity_%': (0, 100),
    'wind_speed_kmh': (0, 200)
}

# Count values outside the bounds; missing values also count, because
# between() is False for them.
for col, (lo, hi) in valid_ranges.items():
    if col not in df2.columns:
        continue
    in_range = df2[col].between(lo, hi)
    print(f"{col} out-of-range count:", (~in_range).sum())

temperature_c out-of-range count: 0
humidity_% out-of-range count: 0
wind_speed_kmh out-of-range count: 0


In [28]:
# A row is "critical" when its timestamp is missing or its coordinates /
# decibel reading fall outside the accepted ranges (missing values included,
# since between() is False for them).
critical_checks = [
    df2['datetime'].isnull(),
    ~df2['latitude'].between(-90, 90),
    ~df2['longitude'].between(-180, 180),
    ~df2['decibel_level'].between(0, 200),
]

# OR the individual checks together, in the order listed above.
critical_mask = critical_checks[0]
for check in critical_checks[1:]:
    critical_mask = critical_mask | check

print("Rows with at least one critical problem:", critical_mask.sum())

Rows with at least one critical problem: 0


In [29]:
# Decide what to do with rows flagged by `critical_mask`.
# Rows are dropped only while they make up less than this share of the data;
# presumably a larger share means the problem needs investigating rather than
# discarding (TODO confirm the intended policy).
MAX_CRITICAL_FRACTION = 0.02

frac_critical = critical_mask.mean()
print("Fraction critical:", frac_critical)
if frac_critical < MAX_CRITICAL_FRACTION:
    df_clean = df2[~critical_mask].copy()
    print("Dropped rows with critical issues -> new shape:", df_clean.shape)
else:
    # Nothing is dropped on this path, so `df_clean` still holds the flagged
    # rows. Say so explicitly instead of passing them on silently.
    df_clean = df2.copy()
    print(
        f"WARNING: kept {int(critical_mask.sum())} rows with critical issues "
        f"(fraction {frac_critical:.4f} >= {MAX_CRITICAL_FRACTION}); "
        "df_clean still contains them."
    )

Fraction critical: 0.0
Dropped rows with critical issues -> new shape: (2000, 26)


In [30]:
# Split the column names by dtype family for the imputation step:
# numeric columns versus object/category columns.
numeric_frame = df_clean.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.tolist()

categorical_frame = df_clean.select_dtypes(include=['object', 'category'])
cat_cols = categorical_frame.columns.tolist()

In [31]:
# Impute remaining gaps: median for numeric columns, the literal "unknown"
# for object/category columns. Columns in neither list (e.g. the parsed
# datetime column) are not touched here.
for c in num_cols:
    if df_clean[c].isnull().any():
        fill_value = df_clean[c].median()
        # Nullable integer columns (the 'Int64' casts made earlier) cannot
        # hold a fractional median such as 11.5; fillna would raise TypeError.
        # Round to a whole number first (Python's round(): ties go to even).
        if pd.api.types.is_integer_dtype(df_clean[c].dtype) and pd.notna(fill_value):
            fill_value = int(round(fill_value))
        df_clean[c] = df_clean[c].fillna(fill_value)

for c in cat_cols:
    if df_clean[c].isnull().any():
        df_clean[c] = df_clean[c].fillna("unknown")

print("Remaining nulls:", df_clean.isnull().sum().sum())

Remaining nulls: 0
