In [None]:
#Data Cleaning

import dask.dataframe as dd
import pandas as pd
data1 = pd.read_csv("trips_by_distance.csv")
data2 = pd.read_csv("trips_full_data.csv")


def _fill_nulls_by_dtype(frame, column_types):
    """Fill missing values in ``frame`` in place, picking the filler from each column's type.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are overwritten with their null-filled versions.
    column_types : Mapping or pandas.Series
        Column name -> dtype (or dtype name). Anything exposing ``.items()`` works.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object that was passed in.

    Integer-typed columns are filled with 0 and object/string-typed columns with
    the literal 'NULL'. Columns of any other type are left untouched.
    """
    for column, dtype in column_types.items():
        if pd.api.types.is_integer_dtype(dtype):
            frame[column] = frame[column].fillna(0)
        elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            frame[column] = frame[column].fillna('NULL')
    return frame


# Infer the per-column types from the data itself so this cell runs on a fresh kernel.
# read_csv stores integer columns that contain blanks as float64; convert_dtypes()
# maps whole-number float columns to the nullable 'Int64' type, which lets them be
# recognised as integer columns here. convert_string=False leaves text columns as they are.
new_data1_types = data1.convert_dtypes(convert_string=False).dtypes
new_data2_types = data2.convert_dtypes(convert_string=False).dtypes

# Fill null values according to data types
_fill_nulls_by_dtype(data1, new_data1_types)
_fill_nulls_by_dtype(data2, new_data2_types)

In [None]:
#A

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import time

# Start the wall-clock timer for the serial run
start_time = time.time()

# Load both trip tables with pandas
data1 = pd.read_csv("trips_by_distance.csv")
data2 = pd.read_csv("trips_full_data.csv")

#A1
# Weekly mean of 'Population Staying at Home':
# missing values become 0, values are rounded and stored as int64,
# then the per-'Week' mean is taken and rounded again.
home_col = 'Population Staying at Home'
data1[home_col] = data1[home_col].fillna(0).round().astype('int64')
average_per_week = data1.groupby('Week')[home_col].mean().round()

# Bar chart: one bar per week
axis_label_style = dict(fontsize=14, fontweight='bold', color='black')
tick_style = dict(fontsize=12, color='black')

plt.figure(figsize=(10, 6))
plt.bar(average_per_week.index, average_per_week.values, color='purple', width=0.8)
plt.title('Average Number of People Staying at Home per Week', fontsize=16, fontweight='bold', color='black')
plt.xlabel('Number of Weeks', **axis_label_style)
plt.ylabel('Average Number of People', **axis_label_style)
plt.xticks(rotation=0, **tick_style)
plt.yticks(**tick_style)
plt.grid(axis='y', linestyle='--', alpha=0.5)
# Show plain numbers on the y axis instead of scientific notation
plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
plt.tight_layout()
plt.show()


#A2

# Distance bins in ascending order. 'Trips 250-500 Miles' is included so the
# sequence has no gap between the 100-250 and 500+ bins.
distance_ranges = [
    'Trips <1 Mile', 'Trips 1-3 Miles', 'Trips 3-5 Miles', 'Trips 5-10 Miles', 'Trips 10-25 Miles',
    'Trips 25-50 Miles', 'Trips 50-100 Miles', 'Trips 100-250 Miles', 'Trips 250-500 Miles',
    'Trips 500+ Miles'
]
# Mean of 'People Not Staying at Home' over all rows (not used by the plot below)
total_people_not_staying_home = data2['People Not Staying at Home'].mean()
# For each bin: multiply the bin's column by 'People Not Staying at Home' row by row,
# then average the products over all rows.
# NOTE(review): the chart labels say "Total" but the aggregate is a mean — confirm which is intended.
total_distances = {
    distance_range: (data2[distance_range] * data2['People Not Staying at Home']).mean()
    for distance_range in distance_ranges
}
# Create a DataFrame for plotting, keeping the bins in the order defined above
plot_data = pd.DataFrame({
    'Distance Range (Miles)': distance_ranges,
    'Total Distance Traveled': [total_distances[distance_range] for distance_range in distance_ranges]
})

# Plotting with Seaborn
plt.figure(figsize=(12, 6))
sns.barplot(x='Distance Range (Miles)', y='Total Distance Traveled', data=plot_data, palette='viridis')
plt.title('Total Distance Traveled for Each Type of Trip\n(Weighted by Number of People)', fontsize=16, fontweight='bold', color='black')
plt.xlabel('Distance Range (Miles)', fontsize=14, fontweight='bold', color='black')
plt.ylabel('Total Distance Traveled ', fontsize=14, fontweight='bold', color='black')
plt.xticks(rotation=45, ha='right',fontsize=12, color='black')  # Rotate x-axis labels for better readability
# Show plain numbers on the y axis instead of scientific notation
plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# Record end time
end_time = time.time()
# Calculate and print the time taken
print("Time taken for serial processing to execute the code:", end_time - start_time, "seconds")

In [None]:
#B

import pandas as pd
import plotly.express as px
import time
# Load the trips-by-distance table; this happens before the timer starts, so the read is not timed
data1 = pd.read_csv("trips_by_distance.csv")


# Start the wall-clock timer
start_time = time.time()

# Cut-off shared by both filters
min_trips = 10000000

# Rows where 'Number of Trips 10-25' is above the cut-off
data1_10_25 = data1.loc[data1['Number of Trips 10-25'] > min_trips]

# Rows where 'Number of Trips 50-100' is above the cut-off
data1_50_100 = data1.loc[data1['Number of Trips 50-100'] > min_trips]

# One scatter figure per filtered table, Date on x and the trip count on y.
# NOTE(review): Plotly Express documents `labels` keys as column names, so the
# 'x'/'y' keys below probably have no effect — confirm.
fig_10_25 = px.scatter(
    data1_10_25,
    x='Date',
    y='Number of Trips 10-25',
    labels={'x': 'Date', 'y': 'Number of Trips 10-25'},
    title='Dates with >10,000,000 people conducting 10-25 Trips',
)
fig_50_100 = px.scatter(
    data1_50_100,
    x='Date',
    y='Number of Trips 50-100',
    labels={'x': 'Date', 'y': 'Number of Trips 50-100'},
    title='Dates with >10,000,000 people conducting 50-100 Trips',
)

# Marker styling: same size and black outline, different fill colour per figure
marker_outline = dict(width=1, color='black')
fig_10_25.update_traces(marker=dict(size=8, color='limegreen', line=marker_outline))
fig_50_100.update_traces(marker=dict(size=8, color='coral', line=marker_outline))

# X-axis tick formatting
fig_10_25.update_layout(xaxis=dict(tickformat="%Y-%m-%d", tickmode='auto', nticks=13))
fig_50_100.update_layout(xaxis=dict(tickangle=0, tickformat="%Y-%m-%d", tickmode='auto', nticks=10))

# Render both figures
fig_10_25.show()
fig_50_100.show()

# Stop the timer
end_time = time.time()

# Report the elapsed time
print("Time taken for serial processing to execute the code:", end_time - start_time, "seconds")

In [None]:
#C for a

from dask.distributed import Client, progress
import time
import matplotlib.pyplot as plt
import dask.dataframe as dd
import pandas as pd
import seaborn as sns

# Worker counts to benchmark, and the elapsed seconds recorded for each
n_processors = [10, 20]
n_processors_time = {}

for processor in n_processors:
    print(f"\n\n\nStarting computation with {processor} processors...\n\n\n")
    # The context manager shuts the client down even if a step below raises.
    # NOTE(review): everything inside this block is plain pandas/matplotlib and nothing
    # is submitted to the Dask client, so the worker count may not influence the timing.
    with Client(n_workers=processor) as client:
        # A single timer covers both (a1) and (a2). Previously the timer was restarted
        # before (a2), so the reported figure left out (a1) entirely.
        start_time = time.time()

        # Question (a1)
        data1 = pd.read_csv("trips_by_distance.csv")
        data2 = pd.read_csv("trips_full_data.csv")
        # Calculate the average number of people staying at home per week:
        # nulls -> 0, round and cast to int64, then mean per 'Week', rounded
        data1['Population Staying at Home'] = data1['Population Staying at Home'].fillna(0)
        data1['Population Staying at Home'] = data1['Population Staying at Home'].round().astype('int64')
        average_per_week = data1.groupby('Week')['Population Staying at Home'].mean().round()

        # Plot the data
        plt.figure(figsize=(12, 6))
        plt.bar(average_per_week.index, average_per_week.values, color='purple', width=0.8)
        plt.title('Average Number of People Staying at Home per Week', fontsize=16, fontweight='bold', color='black')
        plt.xlabel('Number of Weeks', fontsize=14, fontweight='bold', color='black')
        plt.ylabel('Average Number of People', fontsize=14, fontweight='bold', color='black')
        plt.xticks(rotation=0, fontsize=12, color='black')
        plt.yticks(fontsize=12, color='black')
        plt.grid(axis='y', linestyle='--', alpha=0.5)
        plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
        plt.show()

        # Question (a2)
        # NOTE(review): besides the fine-grained bins this list holds 'Trips 1-25 Miles',
        # 'Trips 25-100 Miles' and 'Trips 100+ Miles', whose names overlap the finer bins —
        # confirm they are meant to be plotted side by side.
        distance_ranges = [
            'Trips <1 Mile', 'Trips 1-3 Miles', 'Trips 3-5 Miles', 'Trips 5-10 Miles', 'Trips 10-25 Miles',
            'Trips 1-25 Miles', 'Trips 25-50 Miles', 'Trips 50-100 Miles', 'Trips 25-100 Miles',
            'Trips 100-250 Miles', 'Trips 100+ Miles', 'Trips 250-500 Miles', 'Trips 500+ Miles'
        ]
        # Mean of 'People Not Staying at Home' over all rows (not used by the plot below)
        total_people_not_staying_home = data2['People Not Staying at Home'].mean()
        # Per bin: row-wise product with 'People Not Staying at Home', averaged over all rows
        total_distances = {
            distance_range: (data2[distance_range] * data2['People Not Staying at Home']).mean()
            for distance_range in distance_ranges
        }

        plot_data = pd.DataFrame({
            'Distance Range (Miles)': distance_ranges,
            'Total Distance Traveled': [total_distances[distance_range] for distance_range in distance_ranges]
        })

        plt.figure(figsize=(12, 6))
        sns.barplot(x='Distance Range (Miles)', y='Total Distance Traveled', data=plot_data, palette='viridis')
        plt.title('Total Distance Traveled for Each Type of Trip\n(Weighted by Number of People)', fontsize=16, fontweight='bold', color='black')
        plt.xlabel('Distance Range (Miles)', fontsize=14, fontweight='bold', color='black')
        plt.ylabel('Total Distance Traveled ', fontsize=14, fontweight='bold', color='black')
        plt.xticks(rotation=45, ha='right', fontsize=12, color='black')
        plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
        plt.grid(axis='y', linestyle='--', alpha=0.5)
        plt.tight_layout()
        plt.show()

        # Elapsed time for (a1) + (a2), taken before the client is torn down
        dask_time = time.time() - start_time

    n_processors_time[processor] = dask_time
    print(f"\n\n\nTime Taken with {processor} processors: {dask_time} seconds\n\n\n")

# Print computation times, one line per benchmarked worker count
print("\n\n\n")
for processor, elapsed in n_processors_time.items():
    print(f"{processor} Processor:", elapsed, "seconds")
print("\n\n\n")

In [None]:
#C for b

import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
from dask.distributed import Client, progress
import plotly.express as px
import time

# Worker counts to benchmark
n_processors = [10, 20]
# Elapsed seconds per worker count; filled in by perform_computation
n_processors_time = dict()

# Load the trips-by-distance table once with pandas; perform_computation reads it as a global
data1 = pd.read_csv("Trips_By_Distance.csv")
def _threshold_scatter(frame, column, title, marker_color, xaxis_options, threshold=10000000):
    """Build a styled Date-vs-``column`` scatter of the rows where ``column`` exceeds ``threshold``."""
    above_threshold = frame[frame[column] > threshold]
    # NOTE(review): Plotly Express documents `labels` keys as column names, so the
    # 'x'/'y' keys probably have no effect — confirm.
    fig = px.scatter(above_threshold, x='Date', y=column,
                     labels={'x': 'Date', 'y': column},
                     title=title)
    fig.update_traces(marker=dict(size=8, color=marker_color, line=dict(width=1, color='black')))
    fig.update_layout(xaxis=xaxis_options)
    return fig


# Define function to perform computation with different number of processors
def perform_computation(processor):
    """Time the two threshold scatter plots while a Dask client with ``processor`` workers is open.

    Parameters
    ----------
    processor : int
        Passed to ``Client(n_workers=...)``; also the key under which the elapsed
        time is stored in the module-level ``n_processors_time`` dict.

    Returns
    -------
    None
        The elapsed seconds are printed and written to ``n_processors_time[processor]``.

    Reads the module-level ``data1`` table. The client is opened with a context
    manager so it is closed even when filtering or plotting raises.
    NOTE(review): the filtering and plotting are plain pandas/plotly calls and are
    not submitted to the client, so the worker count may not affect the timing.
    """
    print(f"\n\n\nStarting computation with {processor} processors...\n\n\n")
    with Client(n_workers=processor):
        start = time.time()

        # Dates where 'Number of Trips 10-25' is above 10,000,000
        fig_10_25 = _threshold_scatter(
            data1, 'Number of Trips 10-25',
            'Dates with >10,000,000 people conducting 10-25 Trips',
            'limegreen',
            dict(tickformat="%Y-%m-%d", tickmode='auto', nticks=13),
        )

        # Dates where 'Number of Trips 50-100' is above 10,000,000
        fig_50_100 = _threshold_scatter(
            data1, 'Number of Trips 50-100',
            'Dates with >10,000,000 people conducting 50-100 Trips',
            'coral',
            dict(tickangle=0, tickformat="%Y-%m-%d", tickmode='auto', nticks=10),
        )

        # Show the plots
        fig_10_25.show()
        fig_50_100.show()

        computation_time = time.time() - start

        print(f"\n\n\nTime taken with {processor} processors: {computation_time} seconds\n\n\n")

    # Store computation time (the client has been closed by this point)
    n_processors_time[processor] = computation_time


# Perform computation with different number of processors
for processor in n_processors:
    perform_computation(processor)

# Print computation times: one line per entry of n_processors, so the summary
# stays correct when the list of worker counts is changed.
print("\n\n\n")
for processor in n_processors:
    print(f"{processor} Processor:", n_processors_time[processor], "seconds")
print("\n\n\n")


In [None]:
#D

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

# Function to load and preprocess data
def load_and_preprocess_data(file_path, date_col, week_col, target_week):
    """Read a CSV, parse its date column, and keep only the rows of one week.

    Parameters
    ----------
    file_path : path of the CSV file handed to ``pd.read_csv``.
    date_col : name of the column converted with ``pd.to_datetime``; values that
        cannot be parsed become NaT (``errors='coerce'``).
    week_col : name of the column compared against ``target_week``.
    target_week : value a row's ``week_col`` must equal to be kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    return (
        pd.read_csv(file_path)
        .assign(**{date_col: lambda table: pd.to_datetime(table[date_col], errors='coerce')})
        .loc[lambda table: table[week_col] == target_week]
    )

# Function to fit and evaluate models
def fit_and_evaluate_models(features_train, features_test, targets_train, targets_test):
    """Fit a linear, a degree-2 polynomial and a statsmodels OLS regression on one split.

    Parameters
    ----------
    features_train, features_test : feature matrices passed to the estimators.
    targets_train, targets_test : target vectors matching the feature matrices.

    Returns
    -------
    dict
        Keys 'linear', 'polynomial' and 'OLS'. Every entry holds the fitted
        'model' and a 'predictions' dict with 'train' and 'test' arrays.
        'linear' and 'polynomial' also hold 'score_train' and 'score_test'
        (the value returned by the estimator's ``score`` method); 'polynomial'
        additionally holds the fitted 'transformer'; 'OLS' holds 'summary'.
    """
    regression_models = {}

    # Linear Regression
    linear_model = LinearRegression()
    linear_model.fit(features_train, targets_train)
    regression_models['linear'] = {
        'model': linear_model,
        'predictions': {
            'train': linear_model.predict(features_train),
            'test': linear_model.predict(features_test)
        },
        'score_train': linear_model.score(features_train, targets_train),
        'score_test': linear_model.score(features_test, targets_test)
    }

    # Polynomial Regression: the transformer is fitted on the training features
    # only and then re-used, unchanged, on the test features.
    polynomial_transformer = PolynomialFeatures(degree=2, include_bias=False)
    features_train_poly = polynomial_transformer.fit_transform(features_train)
    features_test_poly = polynomial_transformer.transform(features_test)
    polynomial_model = LinearRegression()
    polynomial_model.fit(features_train_poly, targets_train)
    regression_models['polynomial'] = {
        'model': polynomial_model,
        'transformer': polynomial_transformer,
        'predictions': {
            'train': polynomial_model.predict(features_train_poly),
            'test': polynomial_model.predict(features_test_poly)
        },
        'score_train': polynomial_model.score(features_train_poly, targets_train),
        'score_test': polynomial_model.score(features_test_poly, targets_test)
    }

    # OLS model
    # has_constant='add' always prepends the intercept column. The default ('skip')
    # leaves the matrix unchanged when it already contains a constant column, which
    # happens for a one-row or all-equal test split and would give the test design
    # matrix a different width from the one the model was fitted on.
    features_train_ols = sm.add_constant(features_train, has_constant='add')
    ols_model = sm.OLS(targets_train, features_train_ols).fit()
    features_test_ols = sm.add_constant(features_test, has_constant='add')
    regression_models['OLS'] = {
        'model': ols_model,
        'predictions': {
            'train': ols_model.predict(features_train_ols),
            'test': ols_model.predict(features_test_ols)
        },
        'summary': ols_model.summary()
    }

    return regression_models

# Function to create summary table
def create_summary_table(models):
    """Tabulate the training score of the linear and polynomial models.

    Parameters
    ----------
    models : dict
        Must provide ``models['linear']['score_train']`` and
        ``models['polynomial']['score_train']``.

    Returns
    -------
    pandas.DataFrame
        Two rows (linear first, polynomial second) with the columns
        "Model" and "R-squared (Training)".
    """
    labelled_keys = (
        ("Linear Regression", 'linear'),
        ("Polynomial Regression", 'polynomial'),
    )
    rows = [(label, models[key]['score_train']) for label, key in labelled_keys]
    return pd.DataFrame(rows, columns=["Model", "R-squared (Training)"])

# Function to plot results
def plot_regression_results(features_train, targets_train, model, transformer, model_label, plot_title):
    """Scatter the training data and overlay the fitted model's curve.

    Parameters
    ----------
    features_train, targets_train : data drawn as the scatter points.
    model : fitted estimator exposing ``predict``.
    transformer : feature transformer exposing ``transform``; only used when
        ``model_label`` is 'Polynomial'.
    model_label : 'Polynomial' draws a 300-point curve between the smallest and
        largest training feature; any other value draws the model's predictions
        at the training features themselves.
    plot_title : figure title.
    """
    bold_label_style = dict(fontsize=12, fontweight='bold')

    plt.figure(figsize=(10, 6))
    plt.scatter(features_train, targets_train, color='darkred', label='Actual data')
    plt.xlabel('Trips 1-3 Miles', **bold_label_style)
    plt.ylabel('Number of Trips >=500', **bold_label_style)
    plt.title(plot_title, fontsize=14, fontweight='bold')

    # Work out the curve's x/y values and legend label first, then draw it once
    if model_label == 'Polynomial':
        curve_x = np.linspace(features_train.min(), features_train.max(), 300).reshape(-1, 1)
        curve_y = model.predict(transformer.transform(curve_x))
        curve_label = 'Polynomial Fit'
    else:
        curve_x = features_train
        curve_y = model.predict(features_train)
        curve_label = 'Linear Fit'
    plt.plot(curve_x, curve_y, color='limegreen', linewidth=2, label=curve_label)

    plt.legend()
    plt.grid(True)
    plt.show()

# Main script
if __name__ == "__main__":
    # Both tables are restricted to week 32 by the loader
    full_trip_data = load_and_preprocess_data("Trips_Full_Data.csv", 'Date', 'Week of Date', 32)
    distance_trip_data = load_and_preprocess_data("Trips_by_Distance.csv", 'Date', 'Week', 32)

    feature_col = 'Trips 1-3 Miles'
    target_col = 'Number of Trips >=500'

    # Both tables must be non-empty and carry the column the model needs
    inputs_usable = (
        not full_trip_data.empty
        and not distance_trip_data.empty
        and feature_col in full_trip_data.columns
        and target_col in distance_trip_data.columns
    )

    if not inputs_usable:
        print("Check the input data and column names.")
    else:
        # Truncate both tables to the shorter length so features and targets line up.
        # NOTE(review): rows are paired by position, not joined on 'Date' — confirm the
        # two tables are ordered so that row i of one corresponds to row i of the other.
        n_rows = min(len(full_trip_data), len(distance_trip_data))
        trip_features = full_trip_data[feature_col].iloc[:n_rows].values.reshape(-1, 1)
        trip_targets = distance_trip_data[target_col].iloc[:n_rows].values
        features_train, features_test, targets_train, targets_test = train_test_split(
            trip_features, trip_targets, test_size=0.2, random_state=42
        )

        models = fit_and_evaluate_models(features_train, features_test, targets_train, targets_test)
        models_summary = create_summary_table(models)

        print("Model Summary Table:")
        print(models_summary)

        print("\nPredictions from Polynomial Regression (Testing):")
        print(models['polynomial']['predictions']['test'])

        print("\nOLS Regression Results:")
        print(models['OLS']['summary'])

        # One figure per fitted model, drawn over the training data
        plot_regression_results(
            features_train, targets_train,
            models['linear']['model'], None,
            'Linear', 'Linear Regression',
        )
        plot_regression_results(
            features_train, targets_train,
            models['polynomial']['model'], models['polynomial']['transformer'],
            'Polynomial', 'Polynomial Regression',
        )


In [None]:
#E

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the full trips table
data = pd.read_csv("Trips_Full_Data.csv")

# Distance-bin columns, shortest to longest
distance_columns = [
    'Trips <1 Mile',
    'Trips 1-3 Miles',
    'Trips 3-5 Miles',
    'Trips 5-10 Miles',
    'Trips 10-25 Miles',
    'Trips 25-50 Miles',
    'Trips 50-100 Miles',
    'Trips 100-250 Miles',
    'Trips 250-500 Miles',
    'Trips 500+ Miles',
]
# Sum each distance column over all rows of the table
total_travelers_by_distance = data[distance_columns].sum()

# Bar chart with one viridis-coloured bar per distance column
heading_style = dict(fontweight='bold', color='black')

plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
colors = sns.color_palette("viridis", len(total_travelers_by_distance))
total_travelers_by_distance.plot.bar(color=colors)
plt.title('Number of Participants by Distance-Trips', fontsize=16, **heading_style)
plt.xlabel('Distance Category', fontsize=14, **heading_style)
plt.ylabel('Number of Trips', fontsize=14, **heading_style)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.5)
# Show plain numbers on the y axis instead of scientific notation
plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
plt.show()
