<a href="https://colab.research.google.com/github/serega-sergei/SpringBoard_ML/blob/main/Siarhei_Siryk_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mini Project: Build a Machine Learning Model**

## Predict Total Fare on the NYC Taxi Dataset

Welcome to the NYC Taxi Fare Prediction project! In this Colab, we will continue using the NYC Taxi Dataset to predict the fare amount for taxi rides using a subset of available features. We will go through three main stages: building a baseline model, creating a full model, and performing hyperparameter tuning to enhance our predictions.

Now that you've completed exploratory data analysis on this dataset, you should have a good understanding of the feature space.

# **Project Objectives**
The primary objectives of this project are as follows:

Baseline Model: We will start by building a simple baseline model to establish a benchmark for our predictions. This model will serve as a starting point to compare the performance of our subsequent models.

Full Model: Next, we will develop a more comprehensive model that leverages machine learning techniques to improve prediction accuracy. We will use Scikit-Learn's model pipeline to build a framework that enables rapid experimentation.

Hyperparameter Tuning: Lastly, we will optimize our full model by fine-tuning its hyperparameters. By systematically adjusting the parameters that control model behavior, we aim to achieve the best possible performance for our prediction task.

https://github.com/springboard-curriculum/guild-mle-projects/blob/main/Student_MLE_MiniProject_ML.ipynb

In [87]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

In [None]:
# Load the trip records into a pandas DataFrame.
# Data source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# Alternative to the manual upload below (reads one month straight from the CDN):
#   df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
from google.colab import files

# Opens the Colab upload prompt; the returned mapping is keyed by file name.
uploaded = files.upload()

# Only the first uploaded file is read.
file_name = list(uploaded)[0]
df = pd.read_parquet(file_name)

# Rich preview of the first rows.
df.head()

In [None]:
# Plain-text preview of the first five rows (head() defaults to n=5).
print(df.head())

In [None]:
# Keep only fully populated rows: a row is removed if any of its values is missing.
df_clean = df.dropna(axis=0, how='any')

In [None]:
# Create new feature, 'trip_duration': seconds elapsed between pickup and drop-off.
pickup_col = 'tpep_pickup_datetime'
dropoff_col = 'tpep_dropoff_datetime'

# Parse both timestamps up front so the duration is always computed on real
# datetime Series, whatever dtype the columns were loaded with.
pickup_ts = pd.to_datetime(df_clean[pickup_col])
dropoff_ts = pd.to_datetime(df_clean[dropoff_col])

# Build a new frame with assign() instead of writing through
# `df_clean.loc[:, col] = ...`: that form sets values in place and keeps the
# column's existing dtype, so a non-datetime column would stay non-datetime and
# the `.dt` accessor would fail. assign() replaces the two timestamp columns in
# their current position and appends 'trip_duration' as the last column; the
# cell stays idempotent on re-run.
df_clean = df_clean.assign(**{
    pickup_col: pickup_ts,
    dropoff_col: dropoff_ts,
    'trip_duration': (dropoff_ts - pickup_ts).dt.total_seconds(),
})

print(df_clean[[pickup_col, dropoff_col, 'trip_duration']].head())

In [None]:
# feature_col holds the name of every column except the prediction target.
target_col = 'total_amount'

# Boolean mask over the column index: True for every non-target column.
is_feature = df_clean.columns != target_col
feature_col = df_clean.columns[is_feature].tolist()

print(feature_col)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X = df_clean[feature_col]
y = df_clean[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Naive baseline: predict the training-set mean of the target for every test
# row and score that constant prediction with mean absolute error.
target_col = 'total_amount'

# The constant is taken from the training target only, never from the test set.
baseline_pred = y_train.mean()

# One copy of the constant per test row.
baseline_preds = np.full(len(y_test), baseline_pred)

baseline_mae = mean_absolute_error(y_test, baseline_preds)

print(f"Baseline MAE (predicting mean {target_col}): {baseline_mae:.2f}")

In [None]:
# Preprocess continuous and categorical columns independently with a
# ColumnTransformer: continuous columns are standardised, categorical columns
# are one-hot encoded.
numerical_features = [
    'passenger_count',
    'trip_distance',
    'trip_duration',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'congestion_surcharge',
    'airport_fee',
]

categorical_features = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'PULocationID',
    'DOLocationID',
    'payment_type',
]

# One transformer per column family. handle_unknown='ignore' keeps transform()
# from raising on categories that were not seen during fit.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Columns not named in either list are not passed on to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [None]:
# Linear-regression pipeline: the column preprocessing and the regressor are
# chained so that fit/predict apply the same transformations consistently.
# r2_score comes from the notebook's top import cell; previously it was only
# imported by a later cell, so this cell failed on a fresh kernel.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train on the training split only.
pipeline.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = pipeline.predict(X_test)

# Evaluate with the same metrics reported for the random forest models.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Linear Regression MAE: {mae:.2f}")
print(f"Linear Regression R²: {r2:.2f}")

In [None]:
# Build and evaluate a random forest regressor.
# n_jobs=-1 uses all available cores.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Feature set: every column except
#   * the target itself -- leaving 'total_amount' among the features lets the
#     model read the answer directly (target leakage) and makes the scores
#     meaningless;
#   * the raw timestamps -- the forest cannot consume Timestamp values
#     ("float() argument must be a string or a real number, not 'Timestamp'");
#     'trip_duration' carries that information instead.
target_col = 'total_amount'
excluded_cols = {target_col, 'tpep_pickup_datetime', 'tpep_dropoff_datetime'}
feature_col = [col for col in df_clean.columns if col not in excluded_cols]

# Encode store_and_fwd_flag numerically on the derived feature frame rather than
# overwriting the column in df_clean: mapping in place is not idempotent (a
# second run would map the already-numeric values to NaN) and would change the
# data seen by earlier cells when they are re-run.
X = df_clean[feature_col].assign(
    store_and_fwd_flag=lambda frame: frame['store_and_fwd_flag'].map({'N': 0, 'Y': 1})
)
y = df_clean[target_col]

# Same split parameters as the earlier cells so results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_model.fit(X_train, y_train)

# Predict
rf_y_pred = rf_model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, rf_y_pred)
r2 = r2_score(y_test, rf_y_pred)

print(f"Random Forest MAE: {mae:.2f}")
print(f"Random Forest R²: {r2:.2f}")

In [None]:
# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],      # number of trees in the forest
    max_depth=[10, 20, 30, None],     # maximum tree depth (None = no limit)
    min_samples_split=[2, 5, 10],     # minimum samples needed to split an internal node
)

In [None]:
# Perform grid search to find the best hyperparameters. This could take a while:
# every combination in param_grid is fitted once per cross-validation fold.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Score candidates with (negated) mean absolute error so the search optimises
# the same metric the notebook reports; without `scoring`, GridSearchCV falls
# back to the estimator's default score (R² for regressors).
# NOTE(review): both the forest and the search request n_jobs=-1 -- confirm this
# does not oversubscribe CPU or memory on the target machine.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search on the training split only.
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameters and build a fresh, unfitted forest from them.
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# best_params carries the searched keys (n_estimators, max_depth,
# min_samples_split), so it can be unpacked straight into the constructor.
best_rf_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)

In [None]:
# Fit the tuned random forest regressor on the training data.
best_rf_model.fit(X_train, y_train)


In [None]:
# Score the tuned random forest on the held-out test split.
y_pred = best_rf_model.predict(X_test)

# Same two metrics as reported for the untuned forest.
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Random Forest MAE: {mae:.2f}")
print(f"Random Forest R²: {r2:.2f}")