# NYC Taxi Fare Prediction
**Final Project for Introduction to Data Science**

---

### **Authors**
- Joshua Arroyo
- Samuel Trejo

---

### **Objective**
This project aims to predict NYC taxi fares using machine learning techniques, focusing on data preprocessing, exploratory analysis, and building a neural network model to make accurate predictions.


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Path to the training CSV. The dataset is over 2 GB, so it cannot be uploaded
# to GitHub; change this path to wherever train.csv lives in your environment.
# NOTE(review): the default looks like a Colab Google Drive mount - confirm.
file_path = '/content/drive/My Drive/train.csv'

# Read the whole CSV file into a DataFrame
train_data = pd.read_csv(file_path)

# Print the first rows as a quick sanity check of the loaded columns
print(train_data.head())

In [None]:
# Create function to validate pickup/dropoff within NYC
def filter_rides_within_nyc(data):
    """Return only the rides whose pickup and dropoff both lie inside NYC.

    Args:
        data: DataFrame with 'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude' and 'dropoff_longitude' columns.

    Returns:
        The rows of ``data`` for which all four coordinates fall inside the
        bounding box below (bounds inclusive). The original index is kept.
    """
    # Bounding box used for NYC, as (min, max) pairs
    lat_bounds = (40.4774, 40.9176)
    lon_bounds = (-74.2591, -73.7004)

    # A ride is kept only when every one of its four coordinates is in range
    inside_nyc = (
        data['pickup_latitude'].between(*lat_bounds)
        & data['pickup_longitude'].between(*lon_bounds)
        & data['dropoff_latitude'].between(*lat_bounds)
        & data['dropoff_longitude'].between(*lon_bounds)
    )

    return data[inside_nyc]

# Drop rows with missing values, then keep only rides that start and end inside NYC
train_data = filter_rides_within_nyc(train_data.dropna())

# Parse pickup_datetime into pandas datetimes so calendar fields can be read from it
train_data['pickup_datetime'] = pd.to_datetime(train_data['pickup_datetime'])

# Derive hour, day, weekday and month features from the parsed pickup time
pickup_dt = train_data['pickup_datetime'].dt
train_data['pickup_hour'] = pickup_dt.hour
train_data['pickup_day'] = pickup_dt.day
train_data['pickup_weekday'] = pickup_dt.weekday
train_data['pickup_month'] = pickup_dt.month


In [None]:
#Calculate the distance(km) between pickup and drop-off locations using the Haversine formula
def haversine_distance(data):
    """Add a 'distance_km' column with the Haversine pickup-to-dropoff distance.

    Args:
        data: DataFrame with 'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude' and 'dropoff_longitude' columns, which are
            converted from degrees to radians before use.

    Returns:
        The same ``data`` object; the new column is written onto it in place.
    """
    earth_radius_km = 6371.0

    # The trig functions below work in radians
    lat1, lon1, lat2, lon2 = (
        np.radians(data[column])
        for column in (
            'pickup_latitude',
            'pickup_longitude',
            'dropoff_latitude',
            'dropoff_longitude',
        )
    )

    # Half of the latitude / longitude differences used by the formula
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2

    # Haversine formula: 'hav' is the haversine of the central angle
    hav = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    # Arc length = radius * central angle
    data['distance_km'] = earth_radius_km * central_angle

    return data

# Add the 'distance_km' column to the data. haversine_distance writes the column
# onto the frame it is given and returns that same frame, so the reassignment
# simply rebinds train_data to the object it already refers to.
train_data = haversine_distance(train_data)

In [None]:
# Stack every pickup point followed by every dropoff point into one two-column
# ('latitude', 'longitude') DataFrame with a fresh 0..N-1 index
coords = pd.concat(
    [
        train_data[[f'{endpoint}_latitude', f'{endpoint}_longitude']].rename(
            columns={
                f'{endpoint}_latitude': 'latitude',
                f'{endpoint}_longitude': 'longitude',
            }
        )
        for endpoint in ('pickup', 'dropoff')
    ],
    ignore_index=True,
)

In [None]:
# Import MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans

# Define the number of clusters
n_clusters = 60

# Initialize MiniBatchKMeans.
# random_state is fixed so centroid initialisation and mini-batch sampling are
# repeatable; without it the cluster ids assigned below change on every run and
# the engineered features cannot be reproduced.
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10000, random_state=42)

# Fit the model to the combined coordinates
kmeans.fit(coords)

# Get cluster labels (one per row of 'coords')
cluster_labels = kmeans.labels_

# 'coords' holds all pickup rows followed by all dropoff rows, so it must be
# exactly twice as long as train_data. Fail loudly if the two are out of sync
# (e.g. cells run out of order) instead of writing misaligned labels.
n_samples = len(train_data)
if len(cluster_labels) != 2 * n_samples:
    raise ValueError(
        f"Expected {2 * n_samples} cluster labels (pickup + dropoff), "
        f"got {len(cluster_labels)}; rebuild 'coords' from the current train_data."
    )

# Split the labels back into pickup and dropoff labels
train_data['pickup_cluster'] = cluster_labels[:n_samples]
train_data['dropoff_cluster'] = cluster_labels[n_samples:]

# Now, 'train_data' has the cluster labels added
print(train_data.head())