In [3]:
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client
import time

# Define the file paths to the datasets
trips_by_distance_path = '/content/Trips_by_Distance.csv'
trips_full_data_path = '/content/Trips_Full Data.csv'

# Define a function for serial computation using pandas
def compute_serial(file_path):
    # Read the data using pandas
    df = pd.read_csv(file_path)
    # Calculate the total distance by summing the relevant columns
    # Adjust the column names according to your dataset
    df['Total Distance'] = df['Trips <1 Mile'] + df['Trips 1-3 Miles'] + df['Trips 3-5 Miles'] + df['Trips 5-10 Miles']
    # Calculate the average distance
    return df['Total Distance'].mean()

# Define a function for parallel computation using Dask
def compute_parallel(file_path, n_workers):
    # Setup Dask client with specified number of workers
    client = Client(n_workers=n_workers)
    # Read the data using Dask
    df = dd.read_csv(file_path)
    # Perform the same calculation as above, in parallel
    df['Total Distance'] = df['Trips <1 Mile'] + df['Trips 1-3 Miles'] + df['Trips 3-5 Miles'] + df['Trips 5-10 Miles']
    avg_distance = df['Total Distance'].mean().compute()
    # Close the Dask client
    client.close()
    return avg_distance

# Perform serial computation and measure the time taken
start_time = time.time()
avg_distance_serial = compute_serial(trips_full_data_path)
end_time = time.time()
print(f"Average distance traveled (serial): {avg_distance_serial}")
print(f"Time taken (serial): {end_time - start_time:.4f} seconds")

# Perform parallel computations with different number of processors and measure the time taken
processor_counts = [10, 20]
times_taken = {}

for count in processor_counts:
    start_time = time.time()
    avg_distance_parallel = compute_parallel(trips_full_data_path, count)
    end_time = time.time()
    times_taken[count] = end_time - start_time
    print(f"Time taken with {count} processors: {times_taken[count]:.4f} seconds")

# Output the results
print("\nSerial processing time:", end_time - start_time)
for count, time_taken in times_taken.items():
    print(f"Parallel processing time with {count} processors: {time_taken:.4f} seconds")


Average distance traveled (serial): 1110453316.857143
Time taken (serial): 0.0090 seconds


INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:35915
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:43145'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:34443'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:35533'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:46787'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:43225'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:33275'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:43011'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:37797'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:42873'
INFO:distributed.nanny:      

Time taken with 10 processors: 21.1884 seconds


INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:43931'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:43979'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:34343'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:36669'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:34433'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:41675'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:37075'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:44339'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:42707'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:35795'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:35849'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:42455'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:37491'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:41015'
INFO:d

Time taken with 20 processors: 39.0907 seconds

Serial processing time: 39.09074783325195
Parallel processing time with 10 processors: 21.1884 seconds
Parallel processing time with 20 processors: 39.0907 seconds
