### Import Libraries and Data

In [1]:
# Basic imports

import sys
from pathlib import Path
import os
import warnings

# Silence every warning category for the rest of the session so cell output
# stays compact.
# NOTE(review): this also hides deprecation/dtype warnings raised while
# preprocessing; consider narrowing the filter once the pipeline is stable.
warnings.filterwarnings("ignore")

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upward, that contains the `scripts` package
# imported further down. Falls back to the parent of the working directory,
# which is what this cell used unconditionally before.
# Searching instead of blindly taking `.parent` keeps the cell correct when it
# is re-run after the working directory has been moved to the repository root.
_start_dir = Path().resolve()
project_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "scripts").is_dir()
    ),
    _start_dir.parent,
)

# Make `scripts` importable. Skip the append when the entry is already present
# so re-running the cell does not pile up duplicates on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

/Users/riyanshibohra/Documents/GitHub/metropolitan-climate-profiling/notebooks


In [2]:
# Run the rest of the notebook from the repository root so that relative
# locations (such as the data folder built from the working directory) resolve
# the same way on every machine.
# `project_root` is computed in the first cell; using it avoids baking one
# user's absolute home-directory path into the notebook.
os.chdir(project_root)
print(os.getcwd())  # Verify the change

/Users/riyanshibohra/Documents/GitHub/metropolitan-climate-profiling


In [4]:
# Import necessary modules

from scripts.data_loader import load_raw_dataset, save_dataset
from scripts.preprocessing import (
    convert_to_datetime,
    filter_columns,
    convert_numeric_columns,
    impute_missing_values,
    extract_time_features,
    remove_outliers
)

In [5]:
# Raw CSV locations: one file per city inside <working directory>/data.
data_folder = Path().resolve() / "data"
dallas_path, arlington_path, denton_path = (
    data_folder / f"{city_name}.csv"
    for city_name in ("Dallas", "Arlington", "Denton")
)

In [6]:
# Columns kept from each raw file.
# 'DATE' must stay first: a later cell takes everything after index 0 as the
# list of columns to convert to numeric.
columns_of_interest = [
    'DATE',
    'HourlyDryBulbTemperature',
    'HourlyWetBulbTemperature',
    'HourlyDewPointTemperature',
    'HourlyRelativeHumidity',
    'HourlyPrecipitation',
    'HourlySeaLevelPressure',
    'HourlyStationPressure',
    'HourlyWindSpeed',
    'HourlyWindDirection',
]

In [7]:

# Read the three city files with the same column selection and date column.
print("Loading raw datasets...")
dallas, arlington, denton = [
    load_raw_dataset(csv_path, columns_of_interest, date_column='DATE')
    for csv_path in (dallas_path, arlington_path, denton_path)
]

Loading raw datasets...


### Data Cleaning and Transformation


In [8]:
# Cast every measurement column to a numeric dtype; 'DATE' (index 0) is skipped.
print("Converting numeric columns...")
columns_to_convert = columns_of_interest[1:]
dallas, arlington, denton = [
    convert_numeric_columns(city_frame, columns_to_convert)
    for city_frame in (dallas, arlington, denton)
]

Converting numeric columns...


In [9]:
# Fill gaps in the measurement columns of each city's frame.
print("Imputing missing values...")
dallas, arlington, denton = [
    impute_missing_values(city_frame, columns_to_convert)
    for city_frame in (dallas, arlington, denton)
]

Imputing missing values...


In [10]:
# Derive time-based features from the 'DATE' column of each city's frame.
print("Extracting time-based features...")
dallas, arlington, denton = [
    extract_time_features(city_frame, date_column='DATE')
    for city_frame in (dallas, arlington, denton)
]

Extracting time-based features...


In [11]:

# Drop outliers, judged on the dry-bulb temperature column only.
print("Removing outliers...")
dallas, arlington, denton = [
    remove_outliers(city_frame, column='HourlyDryBulbTemperature')
    for city_frame in (dallas, arlington, denton)
]

Removing outliers...


### Data Export


In [12]:
# Write each cleaned frame next to the raw files, in Dallas, Arlington, Denton order.
print("Saving processed datasets...")
for city_frame, out_name in (
    (dallas, "processed_dallas.csv"),
    (arlington, "processed_arlington.csv"),
    (denton, "processed_denton.csv"),
):
    save_dataset(city_frame, data_folder / out_name)

Saving processed datasets...


### End of Notebook

In [13]:
# Closing status message, printed after the export cell.
print("Preprocessing complete!")

Preprocessing complete!
