In [1]:
import pandas as pd

# Load the datasets with pandas for initial exploration
trips_distance_df = pd.read_csv('Trips_by_Distance.csv')
trips_full_df = pd.read_csv('Trips_Full Data.csv')

# Identifying independent variables (e.g., 'Week', 'Month') and dependent variables (e.g., 'Population Staying at Home')


def impute_missing(frame):
    """Impute missing values in ``frame`` and return the result.

    Strategy:
      1. Numeric columns are filled with their column mean.
      2. Whatever is still missing afterwards (in practice the non-numeric
         columns) is filled with the column mode; on ties the first mode
         row is used.

    A median fill is intentionally NOT chained after the mean fill: once the
    mean has been applied there are no numeric gaps left, so a subsequent
    median fill can never change anything. If the median is preferred for a
    skewed column, it has to replace the mean for that column instead.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The imputed copy and the per-column mode values that were used for
        the second step (an empty Series when ``frame`` has no rows/columns).
    """
    # NOTE(review): every numeric column is mean-filled, which presumably
    # includes identifier-like columns (e.g. FIPS codes, 'Row ID') if they
    # are parsed as numbers -- confirm that is acceptable for the analysis.
    filled = frame.fillna(frame.mean(numeric_only=True))

    modes = filled.mode()
    if modes.empty:
        # An empty frame has no mode row; .iloc[0] would raise IndexError.
        return filled, pd.Series(dtype=object)

    mode_values = modes.iloc[0]
    return filled.fillna(mode_values), mode_values


# Rebind the names to the imputed frames (no in-place mutation, so re-running
# the helper on the same input always gives the same result).
trips_distance_df, mode_values_distance = impute_missing(trips_distance_df)
trips_full_df, mode_values_full = impute_missing(trips_full_df)

# After imputation, checking that no missing values remain
print("Missing values after imputation in Trips by Distance:")
print(trips_distance_df.isnull().sum())
print("Missing values after imputation in Full Trips:")
print(trips_full_df.isnull().sum())



Missing values after imputation in Trips by Distance:
Level                             0
Date                              0
State FIPS                        0
State Postal Code                 0
County FIPS                       0
County Name                       0
Population Staying at Home        0
Population Not Staying at Home    0
Number of Trips                   0
Number of Trips <1                0
Number of Trips 1-3               0
Number of Trips 3-5               0
Number of Trips 5-10              0
Number of Trips 10-25             0
Number of Trips 25-50             0
Number of Trips 50-100            0
Number of Trips 100-250           0
Number of Trips 250-500           0
Number of Trips >=500             0
Row ID                            0
Week                              0
Month                             0
dtype: int64
Missing values after imputation in Full Trips:
Month of Date                 0
Week of Date                  0
Year of Date                  

In [14]:
from dask import dataframe as dd

# Open both CSV files lazily as Dask DataFrames (out-of-core computation).
# `dtypes`, `numeric_cols_distance` and `numeric_cols_full` are not defined in
# this cell; they are expected to come from an earlier cell.
trips_distance_ddf = dd.read_csv('Trips_by_Distance.csv', dtype=dtypes)
trips_full_ddf = dd.read_csv('Trips_Full Data.csv', dtype=dtypes)


def _column_means(ddf, columns):
    """Materialise the per-column mean of ``columns`` from the lazy frame ``ddf``."""
    return ddf[columns].mean().compute()


def _fill_with_means(ddf, columns, means):
    """Replace missing values in each of ``columns`` of ``ddf`` with ``means[column]``.

    The columns are assigned back onto ``ddf`` itself, one at a time.
    """
    for name in columns:
        ddf[name] = ddf[name].fillna(means[name])


# Keep only the candidate numeric columns that really exist in each frame.
existing_cols_distance = [
    name for name in numeric_cols_distance if name in trips_distance_ddf.columns
]
existing_cols_full = [
    name for name in numeric_cols_full if name in trips_full_ddf.columns
]

# First pass: means of the raw data, used as the fill values below.
mean_values_distance = _column_means(trips_distance_ddf, existing_cols_distance)
mean_values_full = _column_means(trips_full_ddf, existing_cols_full)

print("Mean values for Trips by Distance:", mean_values_distance, sep="\n")
print("Mean values for Trips Full Data:", mean_values_full, sep="\n")

# Fill the gaps in the selected numeric columns with those means.
_fill_with_means(trips_distance_ddf, existing_cols_distance, mean_values_distance)
_fill_with_means(trips_full_ddf, existing_cols_full, mean_values_full)

print("Numeric columns selected for 'Trips by Distance':", existing_cols_distance)
print("Numeric columns selected for 'Trips Full Data':", existing_cols_full)

# Second pass: recompute the means on the filled frames. Filling with the mean
# should leave them equal to the first pass, up to floating-point error.
mean_values_distance = _column_means(trips_distance_ddf, existing_cols_distance)
print("Computed mean values for 'Trips by Distance' with Dask:", mean_values_distance)

mean_values_full = _column_means(trips_full_ddf, existing_cols_full)
print("Computed mean values for 'Trips Full Data' with Dask:", mean_values_full)





Mean values for Trips by Distance:
Number of Trips                   2.477794e+06
Number of Trips 1-3               6.191970e+05
Number of Trips 10-25             3.759380e+05
Number of Trips 100-250           1.582453e+04
Number of Trips 25-50             1.208128e+05
Number of Trips 250-500           3.553210e+03
Number of Trips 3-5               3.040894e+05
Number of Trips 5-10              3.843100e+05
Number of Trips 50-100            3.884712e+04
Number of Trips <1                6.120798e+05
Number of Trips >=500             3.142248e+03
Population Not Staying at Home    5.234714e+05
Population Staying at Home        1.462917e+05
dtype: float64
Mean values for Trips Full Data:
Population Staying at Home    6.306337e+07
dtype: float64
Numeric columns selected for 'Trips by Distance': ['Number of Trips', 'Number of Trips 1-3', 'Number of Trips 10-25', 'Number of Trips 100-250', 'Number of Trips 25-50', 'Number of Trips 250-500', 'Number of Trips 3-5', 'Number of Trips 5-10', 'Num