In [1]:
import warnings
# Suppress every warning for the rest of the session: no category, message or
# module filter is given, so the "ignore" action applies to all of them.
# NOTE(review): this also hides deprecation notices raised by the libraries used
# in later cells — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

# Data Loading and Plotting

In [2]:
import os

# Root folder of the CityLife dataset. The default is the original mount point;
# set the CITYLIFE_DATASET_PATH environment variable to run the notebook against
# a copy of the data stored elsewhere without editing this cell.
DataSet_path = os.environ.get(
    "CITYLIFE_DATASET_PATH",
    "/teamspace/s3_folders/CursusDataSet/CityLife",
)

## Loading Rush Hours

In [None]:
import os
import pandas as pd

# Read rush_hours_empty.csv from the dataset folder and display the resulting frame.
rush_hours_csv = os.path.join(DataSet_path, "rush_hours_empty.csv")
rush_hours_df = pd.read_csv(rush_hours_csv)
rush_hours_df

## Loading Taxi Locations

In [None]:
# Read taxi_locations.csv from the dataset folder and display the resulting frame.
taxi_locations_csv = os.path.join(DataSet_path, "taxi_locations.csv")
taxi_locations_df = pd.read_csv(taxi_locations_csv)
taxi_locations_df

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts) to see
# which columns contain missing values before transforming them.
taxi_locations_df.info()

## Editing, Extracting DateTime Data

In [None]:
# Extracting hour, day of week, and month for temporal analysis.
# The raw columns are parsed with a month/day/year, 12-hour-clock (AM/PM) format;
# values that do not match become NaT because of errors='coerce'.
timestamp_format = "%m/%d/%Y %I:%M:%S %p"
taxi_locations_df['Trip Start Timestamp'] = pd.to_datetime(
    taxi_locations_df['Trip Start Timestamp'], format=timestamp_format, errors='coerce'
)
taxi_locations_df['Trip End Timestamp'] = pd.to_datetime(
    taxi_locations_df['Trip End Timestamp'], format=timestamp_format, errors='coerce'
)
taxi_locations_df['Trip Start Hour'] = taxi_locations_df['Trip Start Timestamp'].dt.hour
taxi_locations_df['Trip Start Day'] = taxi_locations_df['Trip Start Timestamp'].dt.day_name()
taxi_locations_df['Trip Start Month'] = taxi_locations_df['Trip Start Timestamp'].dt.month_name()

# Count of trips by hour.
hourly_trip_counts = taxi_locations_df['Trip Start Hour'].value_counts().sort_index()
# NaT rows turn the hour column into floats; value_counts() has already dropped
# the NaN entries, so the index can safely be cast back to int ("17", not "17.0").
hourly_trip_counts.index = hourly_trip_counts.index.astype(int)

# Count of trips by day of the week, in calendar order. fill_value=0 keeps integer
# counts (instead of NaN) for weekdays that never occur in the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_trip_counts = taxi_locations_df['Trip Start Day'].value_counts().reindex(weekday_order, fill_value=0)

# Count of trips by month, in calendar order. Sorting the index would order the
# month *names* alphabetically (April, August, December, ...), so the observed
# months are re-ordered against an explicit January..December list instead.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
monthly_counts_unordered = taxi_locations_df['Trip Start Month'].value_counts()
monthly_trip_counts = monthly_counts_unordered.reindex(
    [month for month in month_order if month in monthly_counts_unordered.index]
)

taxi_locations_df

## Plotting Number of Trips by DateTime Type

In [7]:
%%capture

# Install seaborn into the kernel's environment; %%capture hides the installer output.
# NOTE(review): the version is not pinned, so a fresh run may pull a newer release.
%pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("dark")
colors = sns.color_palette("Set2", 8)


def plot_trip_counts(ax, trip_counts, title, xlabel, tick_rotation=0, label_offset=5):
    """Draw one annotated bar chart of trip counts on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes the bars and labels are drawn on.
    trip_counts : pandas.Series
        Index values become the bar categories, values become the bar heights.
    title : str
        Title of the panel.
    xlabel : str
        Label of the x axis (the y axis is always 'Number of Trips').
    tick_rotation : int, optional
        Rotation of the x tick labels in degrees.
    label_offset : int or float, optional
        Vertical gap, in data units, between a bar's top and its count label.
    """
    # NOTE(review): recent seaborn releases reportedly deprecate `palette` without
    # `hue`; kept as-is so the figure looks the same — confirm against the
    # installed version before changing it.
    sns.barplot(x=trip_counts.index, y=trip_counts.values, ax=ax, palette=colors)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Number of Trips', fontsize=12)
    ax.tick_params(axis='x', rotation=tick_rotation)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Label every bar with its count; the enumerate() position is used as the
    # bar's x coordinate, matching how the bars are laid out left to right.
    for position, count in enumerate(trip_counts.values):
        ax.text(position, count + label_offset, f'{count}', ha='center', fontsize=10, color='black')


# Create a single figure for all three plots
fig, axes = plt.subplots(3, 1, figsize=(15, 20), gridspec_kw={'hspace': 0.4})

plot_trip_counts(axes[0], hourly_trip_counts, 'Number of Trips by Hour of the Day', 'Hour of the Day')
plot_trip_counts(axes[1], daily_trip_counts, 'Number of Trips by Day of the Week', 'Day of the Week', tick_rotation=45)
plot_trip_counts(axes[2], monthly_trip_counts, 'Number of Trips by Month', 'Month', tick_rotation=45)

# Display the plots
plt.show()

# Most popular areas

## Loading Pickup Points and Drop-offs

In [9]:
# Keep only the rows where both centroid coordinates are present, then renumber
# them from 0. reset_index() is called without drop=True, so the original row
# label is retained in an "index" column of each frame.
pickup_columns = ["Pickup Centroid Latitude", "Pickup Centroid Longitude"]
dropoff_columns = ["Dropoff Centroid Latitude", "Dropoff Centroid Longitude"]

pickpoints_df = taxi_locations_df[pickup_columns].dropna().reset_index()
drop_offs_df = taxi_locations_df[dropoff_columns].dropna().reset_index()

## Clustering Analysis of Pickup Points and Drop-offs

- Conduct a clustering analysis of pickup and drop-off locations based on their
coordinates. The clusters may differ between the two categories (pickup points and
drop-offs).

```
Why FAISS KMeans:
It is fast, GPU-capable, and well suited to very large datasets.
```

In [6]:
%%capture

# Install the GPU build of FAISS, pinned to 1.7.2, from the pytorch conda channel.
# -y answers the confirmation prompt; %%capture hides the installer output.
%conda install -y -c pytorch faiss-gpu=1.7.2

In [None]:
import faiss

# FAISS reports how many GPUs it can see; at least one means the GPU path is usable.
use_gpu = faiss.get_num_gpus() >= 1
print(f"Using GPU: {use_gpu}")