In [4]:
import dask
from dask import dataframe as dd

# Explicit dtypes for the columns named below, passed to both read_csv calls
# so these columns are parsed with a fixed type in every partition.
dtypes = {
    'County Name': 'object',
    'Number of Trips': 'float64',
    'Number of Trips 1-3': 'float64',
    'Number of Trips 10-25': 'float64',
    'Number of Trips 100-250': 'float64',
    'Number of Trips 25-50': 'float64',
    'Number of Trips 250-500': 'float64',
    'Number of Trips 3-5': 'float64',
    'Number of Trips 5-10': 'float64',
    'Number of Trips 50-100': 'float64',
    'Number of Trips <1': 'float64',
    'Number of Trips >=500': 'float64',
    'Population Not Staying at Home': 'float64',
    'Population Staying at Home': 'float64',
    'State Postal Code': 'object',
}


def _impute_missing(ddf):
    """Fill missing values in a Dask DataFrame.

    ``float64``/``int64`` columns are filled with their column mean and
    ``object`` columns with their most frequent value (the first row of
    ``mode()``).

    Args:
        ddf: The Dask DataFrame to impute.

    Returns:
        A ``(filled_ddf, mean_values, mode_values)`` tuple. ``filled_ddf`` is
        still lazy. ``mean_values`` is a pandas Series of column means.
        ``mode_values`` is a pandas Series of column modes, or ``None`` when
        there was no mode to fill with.
    """
    lazy_means = ddf.select_dtypes(include=['float64', 'int64']).mean()
    # NOTE(review): newer Dask releases may load text columns with a string
    # dtype instead of 'object'; confirm this selection still matches them.
    lazy_modes = ddf.select_dtypes(include=['object']).mode()

    # Evaluate both statistics in one call so the source is scanned once
    # instead of once per statistic. The numeric fill does not touch object
    # columns, so taking the mode before that fill gives the same values.
    mean_values, mode_frame = dask.compute(lazy_means, lazy_modes)

    filled = ddf.fillna(mean_values)

    # mode() has no rows when there are no object columns (or they are all
    # NaN); indexing row 0 would then raise IndexError, so skip the fill.
    if len(mode_frame) == 0:
        return filled, mean_values, None

    mode_values = mode_frame.iloc[0]
    filled = filled.fillna(mode_values)
    return filled, mean_values, mode_values


# Load the data lazily with Dask
trips_distance_ddf = dd.read_csv('Trips_by_Distance.csv', dtype=dtypes)
trips_full_ddf = dd.read_csv('Trips_Full Data.csv', dtype=dtypes)

# Impute both tables with the same rules: mean for numeric, mode for object
trips_distance_ddf, mean_values_distance, mode_values_distance = _impute_missing(
    trips_distance_ddf
)
trips_full_ddf, mean_values_full, mode_values_full = _impute_missing(
    trips_full_ddf
)

# Materialise the imputed tables as pandas DataFrames.
# NOTE: this loads each full table into memory.
trips_distance_df = trips_distance_ddf.compute()
trips_full_df = trips_full_ddf.compute()

# Checking again for any remaining missing values
print("Missing values after imputation in Trips by Distance:")
print(trips_distance_df.isnull().sum())
print("Missing values after imputation in Full Trips:")
print(trips_full_df.isnull().sum())


Missing values after imputation in Trips by Distance:
Level                             0
Date                              0
State FIPS                        0
State Postal Code                 0
County FIPS                       0
County Name                       0
Population Staying at Home        0
Population Not Staying at Home    0
Number of Trips                   0
Number of Trips <1                0
Number of Trips 1-3               0
Number of Trips 3-5               0
Number of Trips 5-10              0
Number of Trips 10-25             0
Number of Trips 25-50             0
Number of Trips 50-100            0
Number of Trips 100-250           0
Number of Trips 250-500           0
Number of Trips >=500             0
Row ID                            0
Week                              0
Month                             0
dtype: int64
Missing values after imputation in Full Trips:
Month of Date                 0
Week of Date                  0
Year of Date                  