In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported project Python modules are picked up when cells are re-run, without
# restarting the kernel.
# NOTE(review): no project-local module is imported in the cells visible here;
# confirm this is still needed.
%load_ext autoreload
%autoreload 2

In [6]:
import pandas as pd
import os
import glob

# Combine all daily January 2025 files into a single DataFrame and save it as
# taxi-rides.parquet. The pattern requires a "2025-01-" prefix, so the combined
# output file written below is never picked up as an input when the cell is re-run.
daily_pattern = "../data/labeled/2025-01-*.taxi-rides.parquet"
all_files = sorted(glob.glob(daily_pattern))  # sorted => deterministic row order

# pd.concat fails with an unhelpful "No objects to concatenate" on an empty
# list, so stop early with a message that names the real problem.
if not all_files:
    raise FileNotFoundError(f"No daily ride files match {daily_pattern!r}")

combined_df = pd.concat([pd.read_parquet(f) for f in all_files], ignore_index=True)

# NOTE(review): the sampling cell further down writes its 20% sample to this
# same path, replacing the full combined file saved here.
combined_df.to_parquet("../data/labeled/taxi-rides.parquet", index=False)

In [7]:
from sklearn.model_selection import train_test_split

# Keep only 20% of the data, stratified by 'outlier'.
# train_test_split is used purely as a stratified sampler here: the 80% "test"
# part is thrown away, and stratifying keeps the outlier / non-outlier ratio of
# the combined data in the retained 20%.
sampled_df, _ = train_test_split(
    combined_df,
    test_size=0.8,
    stratify=combined_df['outlier'],
    random_state=42,  # fixed seed so the same sample is drawn on every run
)

# Renumber the rows 0..n-1. Reassigning instead of using inplace=True avoids
# mutating the object handed back by train_test_split.
sampled_df = sampled_df.reset_index(drop=True)

# This replaces the combined file written earlier with the 20% sample; the
# training cell reads the sample back from this path.
sampled_df.to_parquet("../data/labeled/taxi-rides.parquet", index=False)

# Kept as the last expression of the cell so the class balance of the sample
# is actually displayed (mid-cell expressions are silently discarded).
sampled_df['outlier'].value_counts()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Read the labelled rides back from disk
df = pd.read_parquet("../data/labeled/taxi-rides.parquet")

# Label first, then the two numeric input columns
y = df['outlier']
X = df[['ride_time', 'trip_distance']]

# Hold out 20% of the rows for evaluation. The classes are heavily imbalanced,
# so the split is stratified on the label to keep the same class proportions
# in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# class_weight='balanced' compensates for the class imbalance during training;
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = RandomForestClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = clf.predict(X_test)

# Report: raw confusion counts first, then per-class precision / recall / F1
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred, digits=4), sep="\n")

Confusion Matrix:
[[133402     15]
 [    21     46]]

Classification Report:
              precision    recall  f1-score   support

       False     0.9998    0.9999    0.9999    133417
        True     0.7541    0.6866    0.7188        67

    accuracy                         0.9997    133484
   macro avg     0.8770    0.8432    0.8593    133484
weighted avg     0.9997    0.9997    0.9997    133484



In [9]:
# Run the trained classifier over every ride of a single day (1 January 2025)
# NOTE(review): this reads from ../work1-labelled/, while the other cells use
# ../data/labeled/ -- confirm this is the intended location.
jan1_df = pd.read_parquet("../work1-labelled/2025-01-01.taxi-rides.parquet")

# The model expects the same two feature columns it was fitted on
jan1_features = jan1_df[['ride_time', 'trip_distance']]
jan1_pred = clf.predict(jan1_features)

# Attach the predictions as an extra column
jan1_df = jan1_df.assign(predicted_outlier=jan1_pred)

# Show only the rides that the model flagged as outliers
jan1_df.loc[jan1_df['predicted_outlier'] == True]

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,ride_time,outlier,predicted_outlier
3973,2025-01-01 00:27:08,2025-01-02 00:00:00,5.72,84772.0,False,True
6105,2025-01-01 01:42:12,2025-01-01 01:46:07,0.6,235.0,False,True
9197,2025-01-01 01:04:01,2025-01-01 03:33:22,133.3,8961.0,True,True
10732,2025-01-01 01:45:26,2025-01-01 11:29:42,0.56,35056.0,True,True
14703,2025-01-01 02:35:58,2025-01-02 00:00:00,4.2,77042.0,True,True
15510,2025-01-01 02:26:10,2025-01-01 16:11:00,9.36,49490.0,True,True
20003,2025-01-01 04:34:27,2025-01-01 04:38:22,0.6,235.0,False,True
20005,2025-01-01 03:58:04,2025-01-02 03:00:30,21.2,82946.0,True,True
23787,2025-01-01 06:19:01,2025-01-01 13:47:00,10.92,26879.0,True,True
23912,2025-01-01 07:41:21,2025-01-01 10:05:39,116.9,8658.0,True,True


In [None]:
# Store the model, reload it from the file and predict again, verifying that
# the reloaded copy behaves exactly like the in-memory model.
import pickle

# Single definition of the model file location (relative to the notebook's
# working directory), shared by the save and the load below.
MODEL_PATH = "taxi-ride-outlier-detector.model.pkl"

# Store the trained model using pickle
with open(MODEL_PATH, "wb") as f:
    pickle.dump(clf, f)

# Load the model.
# SECURITY: unpickling can execute arbitrary code. This is only safe because
# the file was written by this notebook a moment ago; never point it at a
# model file from an untrusted source.
with open(MODEL_PATH, "rb") as f:
    loaded_clf = pickle.load(f)

# Predict using the loaded model, on the same features as the original run
jan1_pred_loaded = loaded_clf.predict(jan1_df[['ride_time', 'trip_distance']])

# Fail loudly if the save/load round trip changed the model's behaviour,
# instead of relying on the reader to compare two tables by eye.
assert (jan1_pred_loaded == jan1_pred).all(), (
    "Reloaded model disagrees with the in-memory model on the 1 January rides"
)

# Add predictions to the dataframe
jan1_df['predicted_outlier_loaded'] = jan1_pred_loaded

# Display all predicted outliers from the loaded model
jan1_df[jan1_df['predicted_outlier_loaded'] == True]

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,ride_time,outlier,predicted_outlier,predicted_outlier_loaded
3973,2025-01-01 00:27:08,2025-01-02 00:00:00,5.72,84772.0,False,True,True
6105,2025-01-01 01:42:12,2025-01-01 01:46:07,0.6,235.0,False,True,True
9197,2025-01-01 01:04:01,2025-01-01 03:33:22,133.3,8961.0,True,True,True
10732,2025-01-01 01:45:26,2025-01-01 11:29:42,0.56,35056.0,True,True,True
14703,2025-01-01 02:35:58,2025-01-02 00:00:00,4.2,77042.0,True,True,True
15510,2025-01-01 02:26:10,2025-01-01 16:11:00,9.36,49490.0,True,True,True
20003,2025-01-01 04:34:27,2025-01-01 04:38:22,0.6,235.0,False,True,True
20005,2025-01-01 03:58:04,2025-01-02 03:00:30,21.2,82946.0,True,True,True
23787,2025-01-01 06:19:01,2025-01-01 13:47:00,10.92,26879.0,True,True,True
23912,2025-01-01 07:41:21,2025-01-01 10:05:39,116.9,8658.0,True,True,True
