In [25]:
# ============================================================
# 01_data_overview.ipynb â€” Load/Clean + Sanity Check EV Dataset
# Goal: Load raw vehicle data, filter to EV-only rows, run quick sanity checks (fuel type/year/target),
#       and produce a modeling-ready dataframe + (X, y) definitions for the training notebook.
#      (validation + filtering)
# ============================================================

import pandas as pd  # imports pandas lib with nickname pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# read the csv file with panda and save it into df(data frame)
# .. means go up one in directory
df = pd.read_csv("../data/raw/vehicles.csv", low_memory=False)

# pandas may raise DtypeWarnings when columns contain mixed data types
# setting low_memory=False forces a full read and resolves the warning
# this does not change the data, only how pandas parses it

df.shape #check to make sure file loaded & dataset size (#rows,#col)

# df.columns -> list of all column names | col = temp variable for the loop
# loops through each column of list, converts to lower case for check
# if keyword "range" if found returns og string and adds it to a list
df = df.drop(columns=["rangeCityA","rangeHwyA","rangeA"])
[col for col in df.columns if "range" in col.lower()]

# range = EPA combined electric range (primary target)
# City/Highway = specific ranges
# we have removed the alternative data info

# (df["range"] > 0)
# .describe() - describes total value count, top = most true or false, frequency of top
# .any() - true or false tell me if any of the columns containing keyword will have a value greater than 0?
# .sum() - adds up all values that > 0 = # of EVs in dataset

# X = clues | y = answer
y = df["range"]
X = df.drop(columns=["range"])
# y stores target vector(labels)=EPA combined elec range for each EV
# X contains the feature DataFrame(inputs). removes target column

X.dtypes # shows data type for each column

# drops not helpful predictors
drop_cols = ["createdOn", "modifiedOn", "mfrCode", "startStop"]
X = X.drop(columns=[c for c in drop_cols if c in X.columns])

X.shape  # returns (rows, columns) for X
X.head() # displays first 5 rows

X.isna().sum().sort_values(ascending=False).head(10)

# identify columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns # numeric features
cat_cols = X.select_dtypes(include=["object", "bool"]).columns
# catagorical/text features

# compute % of missing values per column & drops those missing >40% of data
missing_ratio = X.isna().mean()
cols_to_drop = missing_ratio[missing_ratio > 0.4].index
X = X.drop(columns=cols_to_drop)

# replace missing numeric values with column median
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# keep only existing categorical columns (prevents KeyError)
cat_cols = [c for c in cat_cols if c in X.columns]

# replacing missing values with unknown
X[cat_cols] = X[cat_cols].fillna("Unknown")
X.isna().sum().sum() # check to confrim no missing values reamin
len(num_cols), len(cat_cols) # num of numeric and catagorial feature columns
cat_cols # shows list of column


# prevent leakage with train/test split
X_train, X_test, y_train, y_test = (train_test_split
(
    X, y,
    test_size=0.2,
    random_state=42
))
X_train.shape, X_test.shape

# build preprocessing pipeline
preprocess = (ColumnTransformer
(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop"
))
# handle_unknown="ignore" prevents crashes if test has a category not seen in train.
# sparse_output=False makes it output a normal array

# fit on train & transform train/test
# fit_transform only on train | transform only on test
X_train_model = preprocess.fit_transform(X_train)
X_test_model = preprocess.transform(X_test)

X_train_model.shape, X_test_model.shape


# get final feature names
feature_names = preprocess.get_feature_names_out()
feature_names[:20], len(feature_names)

# check to confirm train/test split sizes (rows, original feature columns)
X_train.shape, X_test.shape
# check after preprocessing/one-hot encoding, features expand to 7590 columns
# (same column count in train/test = preprocessing pipeline is consistent)
X_train_model.shape, X_test_model.shape
# check numb of final feature columns after preprocessing
# should match X_train_model.shape[1]
len(feature_names)

7590