In [5]:
# @title Setup
from google.cloud import bigquery
from google.colab import data_table
import bigframes.pandas as bpd
import pandas as pd

# Google Cloud project and BigQuery location used by the shared client below.
project = 'possible-coast-463614-i9' # Project ID inserted based on the query results selected to explore
location = 'US' # Location inserted based on the query results selected to explore
# Single BigQuery client bound to the project/location above; later cells reuse it
# for both queries and load jobs.
client = bigquery.Client(project=project, location=location)
# Turn on Colab's data-table formatter for DataFrame output.
data_table.enable_dataframe_formatter()

In [6]:
# Function to execute a BigQuery query and return a DataFrame

def query_to_dataframe(query: str) -> pd.DataFrame:
    """
    Run a SQL statement through the module-level BigQuery ``client`` and return
    the rows as a Pandas DataFrame.

    Failures are not propagated: any exception raised while running the query or
    materialising its result is printed, and an empty DataFrame is returned in
    its place.

    Parameters:
    - query (str): The SQL text to execute.

    Return:
    - pd.DataFrame : The query result, or an empty DataFrame if anything failed.
    """
    try:
        query_job = client.query(query)
        df = query_job.to_dataframe()
        print(f"Query executed successfully. Retrieved {df.shape[0]} rows.")
    except Exception as e:
        # Best-effort contract: report the problem and fall back to an empty frame.
        print(f"Error executing query: {e}")
        df = pd.DataFrame()
    return df

In [7]:
# Pull the full trips table. The project ID is taken from `project` (setup cell)
# instead of being repeated here, so the query cannot drift from the client's project.
query_trips_ml_data = f"""
SELECT *
FROM `{project}.ml_dataset.trips_ml_data`
"""
trips_ml_data_df = query_to_dataframe(query_trips_ml_data)
trips_ml_data_df.head()

Query executed successfully. Retrieved 3188447 rows.


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,source_file
0,2,2025-05-29 22:20:42+00:00,2025-05-29 22:20:47+00:00,1.0,0.07,5.0,N,14,14,2,0.01,0.0,0.0,0.0,0.0,1.0,1.01,0.0,0.0,yellow_tripdata_2025-05.parquet
1,2,2025-05-16 18:06:57+00:00,2025-05-16 18:18:03+00:00,1.0,0.01,1.0,N,141,141,2,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,yellow_tripdata_2025-05.parquet
2,2,2025-05-08 00:04:36+00:00,2025-05-08 00:04:46+00:00,1.0,0.11,1.0,N,186,186,2,0.0,0.0,0.0,0.0,0.0,0.0,3.25,2.5,0.0,yellow_tripdata_2025-05.parquet
3,2,2025-05-26 03:59:05+00:00,2025-05-26 04:11:01+00:00,1.0,0.26,1.0,N,230,230,2,0.0,0.0,0.0,0.0,0.0,0.0,3.25,2.5,0.0,yellow_tripdata_2025-05.parquet
4,2,2025-05-26 18:16:42+00:00,2025-05-26 18:16:59+00:00,1.0,0.01,1.0,N,255,255,2,3.0,0.0,0.5,0.0,0.0,1.0,4.5,0.0,0.0,yellow_tripdata_2025-05.parquet


In [8]:
# Column names and dtypes of the raw trips frame.
trips_ml_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3188447 entries, 0 to 3188446
Data columns (total 20 columns):
 #   Column                 Dtype              
---  ------                 -----              
 0   VendorID               Int64              
 1   tpep_pickup_datetime   datetime64[us, UTC]
 2   tpep_dropoff_datetime  datetime64[us, UTC]
 3   passenger_count        float64            
 4   trip_distance          float64            
 5   RatecodeID             float64            
 6   store_and_fwd_flag     object             
 7   PULocationID           Int64              
 8   DOLocationID           Int64              
 9   payment_type           Int64              
 10  fare_amount            float64            
 11  extra                  float64            
 12  mta_tax                float64            
 13  tip_amount             float64            
 14  tolls_amount           float64            
 15  improvement_surcharge  float64            
 16  total_amount      

In [9]:
# Number of missing values in each column of the raw trips frame
trips_ml_data_df.isna().sum()

Unnamed: 0,0
VendorID,0
tpep_pickup_datetime,0
tpep_dropoff_datetime,0
passenger_count,0
trip_distance,0
RatecodeID,0
store_and_fwd_flag,0
PULocationID,0
DOLocationID,0
payment_type,0


In [10]:
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the model feature table from raw trip records.

    The input frame is never modified: the derived columns are assembled in a
    new DataFrame rather than written back onto ``df``. Writing onto ``df``
    would change the caller's data and, when ``df`` is a slice (for example the
    output of ``train_test_split``), can trigger pandas' SettingWithCopyWarning.

    Parameters:
    - df: Raw trips. Must contain ``tpep_pickup_datetime`` and
      ``tpep_dropoff_datetime`` as datetime columns, plus ``PULocationID``,
      ``DOLocationID``, ``passenger_count``, ``trip_distance``,
      ``payment_type`` and ``total_amount``.

    Returns:
    - A new DataFrame on the same index as ``df`` with the columns
      PULocationID, DOLocationID, passenger_count, trip_distance,
      trip_duration (minutes), pickup_dayofweek, pickup_month, pickup_year,
      pickup_hour, is_weekend, is_credit_card, total_amount.

    Raises:
    - KeyError: if a required column is missing from ``df``.
    """
    pickup = df["tpep_pickup_datetime"]
    dropoff = df["tpep_dropoff_datetime"]
    pickup_dayofweek = pickup.dt.dayofweek  # Monday=0, Sunday=6

    # Keys are listed in the output column order.
    return pd.DataFrame(
        {
            "PULocationID": df["PULocationID"],
            "DOLocationID": df["DOLocationID"],
            "passenger_count": df["passenger_count"],
            "trip_distance": df["trip_distance"],
            # Trip duration in minutes
            "trip_duration": (dropoff - pickup).dt.total_seconds() / 60,
            "pickup_dayofweek": pickup_dayofweek,
            "pickup_month": pickup.dt.month,
            "pickup_year": pickup.dt.year,
            "pickup_hour": pickup.dt.hour,
            "is_weekend": pickup_dayofweek.isin([5, 6]).astype(int),  # 5=Saturday, 6=Sunday
            # 1 when payment_type == 1 (credit card); every other payment type maps
            # to 0. No rows are filtered out by payment type.
            "is_credit_card": (df["payment_type"] == 1).astype(int),
            "total_amount": df["total_amount"],
        },
        index=df.index,
    )


In [11]:
from sklearn.model_selection import train_test_split

def split_data(df: pd.DataFrame, train_size: float = 0.7, val_size: float = 0.15,
               test_size: float = 0.15, random_state: int = 42) -> tuple:
    """
    Splits the dataframe into train, validation, and test sets.

    The split is done in two passes with ``train_test_split``: the test set is
    carved off first, then the remainder is divided into train and validation.

    Parameters:
    - df: Pandas DataFrame
    - train_size: Proportion of the dataset for training (default=70%)
    - val_size: Proportion for validation (default=15%)
    - test_size: Proportion for testing (default=15%)
    - random_state: Seed for reproducibility (used for both passes)

    Returns:
    - train_df, val_df, test_df: Split DataFrames

    Raises:
    - AssertionError: if the three proportions do not sum to 1.
    """
    # Compare with a tolerance rather than `== 1`: valid fractions such as
    # 0.6 + 0.3 + 0.1 evaluate to 0.9999999999999999 in binary floating point
    # and would be rejected by an exact comparison.
    total = train_size + val_size + test_size
    assert abs(total - 1.0) < 1e-9, "Split sizes must sum to 1"

    # First, split train + val and test
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Then, split train and validation. val_size is a share of the whole dataset,
    # so rescale it to a share of the remaining train + val portion.
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size / (train_size + val_size),
        random_state=random_state,
    )

    return train_df, val_df, test_df

# Split the raw trips with split_data's defaults (70% train / 15% validation / 15% test, seed 42)
train_df, val_df, test_df = split_data(trips_ml_data_df)

# Display the number of rows in each split
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


Train size: 2231912
Validation size: 478267
Test size: 478268


In [12]:
# Derive the model features for the training split and preview the first rows.
preprocessed_train_df = preprocess_data(train_df)
preprocessed_train_df.head()

Unnamed: 0,PULocationID,DOLocationID,passenger_count,trip_distance,trip_duration,pickup_dayofweek,pickup_month,pickup_year,pickup_hour,is_weekend,is_credit_card,total_amount
1800231,162,211,1.0,2.7,27.183333,3,5,2025,13,0,1,34.5
1755152,90,236,1.0,3.95,34.633333,5,5,2025,12,1,1,41.45
1872054,230,236,1.0,2.65,30.733333,2,5,2025,18,0,1,48.35
1823522,100,255,1.0,4.8,22.116667,5,5,2025,9,1,1,37.09
2186313,236,164,1.0,3.07,25.216667,0,5,2025,17,0,1,36.66


In [13]:
# Columns, dtypes and memory usage of the training features.
preprocessed_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2231912 entries, 1800231 to 1276627
Data columns (total 12 columns):
 #   Column            Dtype  
---  ------            -----  
 0   PULocationID      Int64  
 1   DOLocationID      Int64  
 2   passenger_count   float64
 3   trip_distance     float64
 4   trip_duration     float64
 5   pickup_dayofweek  int32  
 6   pickup_month      int32  
 7   pickup_year       int32  
 8   pickup_hour       int32  
 9   is_weekend        int64  
 10  is_credit_card    int64  
 11  total_amount      float64
dtypes: Int64(2), float64(4), int32(4), int64(2)
memory usage: 191.6 MB


In [14]:
# Load the preprocessed_train_df dataframe into BigQuery

DATASET_ID = "ml_dataset"
TABLE_ID = "preprocessed_train_data"
FULL_TABLE_ID = f"{project}.{DATASET_ID}.{TABLE_ID}"

# Define schema (ensure correct types)
schema = [
    bigquery.SchemaField("PULocationID", "INTEGER"),
    bigquery.SchemaField("DOLocationID", "INTEGER"),
    bigquery.SchemaField("passenger_count", "FLOAT"),
    bigquery.SchemaField("trip_distance", "FLOAT"),
    bigquery.SchemaField("trip_duration", "FLOAT"),
    bigquery.SchemaField("pickup_dayofweek", "INTEGER"),
    bigquery.SchemaField("pickup_month", "INTEGER"),
    bigquery.SchemaField("pickup_year", "INTEGER"),
    bigquery.SchemaField("pickup_hour", "INTEGER"),
    bigquery.SchemaField("is_weekend", "INTEGER"),
    bigquery.SchemaField("is_credit_card", "INTEGER"),
    bigquery.SchemaField("total_amount", "FLOAT"),
]

# WRITE_TRUNCATE replaces the table contents on every run. Load jobs append by
# default, so re-running this cell would otherwise add a second copy of every
# training row to the table.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Load data into BigQuery
job = client.load_table_from_dataframe(
    preprocessed_train_df, FULL_TABLE_ID, job_config=job_config
)

# Wait for the job to complete (raises if the load job failed)
job.result()

print(f"Data successfully uploaded to BigQuery: {FULL_TABLE_ID}")

Data successfully uploaded to BigQuery: possible-coast-463614-i9.ml_dataset.preprocessed_train_data


In [15]:
# Derive the same model features for the held-out test split and preview the first rows.
preprocessed_test_df = preprocess_data(test_df)
preprocessed_test_df.head()

Unnamed: 0,PULocationID,DOLocationID,passenger_count,trip_distance,trip_duration,pickup_dayofweek,pickup_month,pickup_year,pickup_hour,is_weekend,is_credit_card,total_amount
3132906,87,162,1.0,4.79,12.75,2,5,2025,22,0,1,34.56
1413930,114,79,3.0,0.39,2.483333,5,5,2025,3,1,1,10.16
689799,162,230,1.0,0.72,10.016667,4,5,2025,15,0,1,18.44
1033112,137,162,1.0,0.97,2.633333,5,5,2025,5,1,1,12.55
1834491,142,43,3.0,2.51,17.35,6,5,2025,18,1,1,25.2


In [16]:
# Columns, non-null counts, dtypes and memory usage of the test features.
preprocessed_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478268 entries, 3132906 to 1166832
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   PULocationID      478268 non-null  Int64  
 1   DOLocationID      478268 non-null  Int64  
 2   passenger_count   478268 non-null  float64
 3   trip_distance     478268 non-null  float64
 4   trip_duration     478268 non-null  float64
 5   pickup_dayofweek  478268 non-null  int32  
 6   pickup_month      478268 non-null  int32  
 7   pickup_year       478268 non-null  int32  
 8   pickup_hour       478268 non-null  int32  
 9   is_weekend        478268 non-null  int64  
 10  is_credit_card    478268 non-null  int64  
 11  total_amount      478268 non-null  float64
dtypes: Int64(2), float64(4), int32(4), int64(2)
memory usage: 41.1 MB


In [17]:
# (rows, columns) of the test features.
preprocessed_test_df.shape

(478268, 12)

In [18]:
# Load the preprocessed_test_df dataframe into BigQuery

# DATASET_ID and schema come from the training-upload cell: the test table lives in
# the same dataset and has exactly the same columns, so they are reused here rather
# than declared a second time.
TABLE_ID = "preprocessed_test_data"
FULL_TABLE_ID = f"{project}.{DATASET_ID}.{TABLE_ID}"

# WRITE_TRUNCATE replaces the table contents on every run. Load jobs append by
# default, so re-running this cell would otherwise add a second copy of every
# test row to the table.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Load data into BigQuery
job = client.load_table_from_dataframe(
    preprocessed_test_df, FULL_TABLE_ID, job_config=job_config
)

# Wait for the job to complete (raises if the load job failed)
job.result()

print(f"Data successfully uploaded to BigQuery: {FULL_TABLE_ID}")

Data successfully uploaded to BigQuery: possible-coast-463614-i9.ml_dataset.preprocessed_test_data


In [None]:
# You can continue to create a custom model

