# Creating a pipeline for chosen model

This notebook re-runs everything after the evaluation of the different models, but this time in a more robust way, to ensure a clean, reproducible pipeline.

For this, I got help from Google, other repos and ChatGPT to learn a little bit about SimpleImputer, OneHotEncoder and Pipeline.

In [1]:
from taxipred.utils.constants import CLEANED_DATA

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

### Load clean data

In [2]:
# Load the cleaned taxi-price dataset; CLEANED_DATA comes from the project's
# constants module and is joined to the file name with `/`.
df = pd.read_csv(filepath_or_buffer=CLEANED_DATA / "taxi_prices_cleaned.csv")

### Separate features and target

In [3]:
# The target is the "trip_price" column; every remaining column is a feature.
y = df["trip_price"]
X = df.drop(columns="trip_price")

### Train | Test-split

In [4]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Picking numerical and categorical features

In [5]:
# Route columns by dtype: numeric columns go to the numeric transformer,
# object-dtype columns to the categorical one. The column lists are taken
# from the training split only.
num_features = X_train.select_dtypes(include=["number"]).columns
cat_features = X_train.select_dtypes(include=["object"]).columns

#### Deciding which preprocessing strategy (imputation or "dummy encoding") each of the two feature types should get

In [6]:
# Numeric columns: fill missing values with the column median learned during fit.
num_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: one-hot encode, and encode categories that were not seen
# during fit as an all-zero row instead of raising.
# `drop="first"` is deliberately NOT used together with handle_unknown="ignore":
# the dropped category is also encoded as all zeros, so it would be
# indistinguishable from an unknown category (older scikit-learn releases reject
# that combination, newer ones warn about it). The downstream model is a random
# forest, which does not need a reference category to be dropped.
cat_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

In [None]:
# Combine both transformers: each one is applied only to its own column list.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

In [None]:
# Random forest regressor with library-default hyperparameters; the fixed
# random_state makes training repeatable.
model = RandomForestRegressor(
    random_state=42,
)