In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt

# Input CSVs this notebook reads; later cells index into this list by position.
file_paths = [
    '../data/NYC_census_block_loc.csv',
    '../data/NYC_census_tracts.csv',
]

# Report, for each input, whether it is present on disk. A missing file is
# only reported here; nothing is raised.
for file_path in file_paths:
    file = Path(file_path)
    if not file.exists():
        print(f"File not found: {file}. Please ensure the file is downloaded correctly.")
        continue
    print(f"File found: {file}")


File found: ../data/NYC_census_block_loc.csv
File found: ../data/NYC_census_tracts.csv


In [2]:
# Load the census-block location table (first entry of file_paths) and
# preview its first five rows.
census_block_loc_df = pd.read_csv(filepath_or_buffer=file_paths[0])
census_block_loc_df.head(n=5)

Unnamed: 0,Latitude,Longitude,BlockCode,County,State
0,40.48,-74.28,340230076002012,Middlesex,NJ
1,40.48,-74.276834,340230076005000,Middlesex,NJ
2,40.48,-74.273668,340230076003018,Middlesex,NJ
3,40.48,-74.270503,340230076003004,Middlesex,NJ
4,40.48,-74.267337,340230074021000,Middlesex,NJ


In [3]:
# Load the tract-level census table (second entry of file_paths).
nyc_census_tracts_df = pd.read_csv(filepath_or_buffer=file_paths[1])

In [4]:
# Build matching string tract keys on both frames, then attach the block
# locations to every census tract.
# Reference: https://www.kaggle.com/code/muonneutrino/mapping-new-york-city-census-data/notebook

# BlockCode is a 15-digit id whose last 4 digits identify the block within its
# tract (per the Census GEOID layout), so integer division by 10000 leaves the
# 11-digit tract id.
census_block_loc_df['Tract'] = census_block_loc_df.BlockCode // 10000
census_block_loc_df['Tract'] = census_block_loc_df['Tract'].astype(str)
nyc_census_tracts_df['CensusTract'] = nyc_census_tracts_df['CensusTract'].astype(str)

# Left-pad both keys to the full 11-character tract id width. Padding to a
# smaller width never changes an 11-digit key and would not restore a leading
# zero lost when the id was parsed as an integer.
census_block_loc_df['Tract'] = census_block_loc_df['Tract'].str.zfill(11)
nyc_census_tracts_df['CensusTract'] = nyc_census_tracts_df['CensusTract'].str.zfill(11)

# Left join: every tract row is kept. A tract with no matching block location
# comes through with NaN in the columns taken from census_block_loc_df.
merged_df = pd.merge(
    nyc_census_tracts_df,
    census_block_loc_df,
    left_on='CensusTract',
    right_on='Tract',
    how='left'
)

# Display the first few rows to verify
merged_df.head()

Unnamed: 0,CensusTract,County_x,Borough,TotalPop,Men,Women,Hispanic,White,Black,Native,...,OtherTransp,WorkAtHome,MeanCommute,Unemployment,Latitude,Longitude,BlockCode,County_y,State,Tract
0,36005000100,Bronx,Bronx,7703,7133,570,29.9,6.1,60.9,0.2,...,,,,,40.787538,-73.887437,360050000000000.0,Bronx,NY,36005000100
1,36005000100,Bronx,Bronx,7703,7133,570,29.9,6.1,60.9,0.2,...,,,,,40.787538,-73.884271,360050000000000.0,Bronx,NY,36005000100
2,36005000100,Bronx,Bronx,7703,7133,570,29.9,6.1,60.9,0.2,...,,,,,40.787538,-73.881106,360050000000000.0,Bronx,NY,36005000100
3,36005000100,Bronx,Bronx,7703,7133,570,29.9,6.1,60.9,0.2,...,,,,,40.787538,-73.87794,360050000000000.0,Bronx,NY,36005000100
4,36005000100,Bronx,Bronx,7703,7133,570,29.9,6.1,60.9,0.2,...,,,,,40.787538,-73.874774,360050000000000.0,Bronx,NY,36005000100


In [5]:
# Per-column tally of missing values in the merged frame.
nan_counts = merged_df.isna().sum(axis=0)

# Leave the tally as the cell's last expression so it is displayed.
nan_counts

CensusTract        0
County_x           0
Borough            0
TotalPop           0
Men                0
Women              0
Hispanic        5053
White           5053
Black           5053
Native          5053
Asian           5053
Citizen            0
Income          5760
IncomePerCap    5214
Poverty         5222
ChildPoverty    5651
Professional    5275
Service         5275
Office          5275
Construction    5275
Production      5275
Drive           5275
Carpool         5275
Transit         5275
Walk            5275
OtherTransp     5275
WorkAtHome      5275
MeanCommute     6188
Unemployment    5222
Latitude          15
Longitude         15
BlockCode         15
County_y          15
State             15
Tract             15
dtype: int64

In [6]:
# (rows, columns) of the merged frame, before any columns or rows are removed.
merged_df.shape

(18067, 35)

In [7]:
# Everything except the borough label, the headcount columns and the
# coordinates is removed from the merged frame.
columns_to_drop = [
    # tract-level statistics
    'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Income',
    'IncomePerCap', 'Poverty', 'ChildPoverty', 'Professional',
    'Service', 'Office', 'Construction', 'Production', 'Drive',
    'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome',
    'MeanCommute', 'Unemployment',
    # join keys and identifiers
    'County_y', 'State', 'Tract',
    'County_x', 'CensusTract', 'BlockCode',
]

# One pass: drop the columns, collapse exact duplicate rows, then lower-case
# the coordinate column names.
merged_df = (
    merged_df
    .drop(columns=columns_to_drop)
    .drop_duplicates()
    .rename(columns={'Latitude': 'latitude', 'Longitude': 'longitude'})
)

# Preview the slimmed-down frame.
merged_df.head()

Unnamed: 0,Borough,TotalPop,Men,Women,Citizen,latitude,longitude
0,Bronx,7703,7133,570,6476,40.787538,-73.887437
1,Bronx,7703,7133,570,6476,40.787538,-73.884271
2,Bronx,7703,7133,570,6476,40.787538,-73.881106
3,Bronx,7703,7133,570,6476,40.787538,-73.87794
4,Bronx,7703,7133,570,6476,40.787538,-73.874774


In [8]:
# Discard every row that has at least one missing value.
merged_df = merged_df.dropna(axis=0, how='any')

# Re-tally missing values per column; every count should now be 0.
nan_counts = merged_df.isna().sum(axis=0)

# Leave the tally as the cell's last expression so it is displayed.
nan_counts

Borough      0
TotalPop     0
Men          0
Women        0
Citizen      0
latitude     0
longitude    0
dtype: int64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the cleaned frame.
merged_df.describe()

Unnamed: 0,TotalPop,Men,Women,Citizen,latitude,longitude
count,18052.0,18052.0,18052.0,18052.0,18052.0,18052.0
mean,2911.090682,1411.314591,1499.776091,1944.21704,40.663946,-73.938182
std,2963.959357,1433.417834,1570.619628,2086.253957,0.108225,0.130484
min,0.0,0.0,0.0,0.0,40.48,-74.257839
25%,0.0,0.0,0.0,0.0,40.572714,-74.020402
50%,2627.0,1318.0,1346.0,1688.0,40.645075,-73.91593
75%,4586.0,2214.0,2382.0,2970.0,40.746834,-73.836784
max,28926.0,13460.0,15466.0,22905.0,40.916432,-73.700653


In [10]:
# Write the cleaned, merged census frame (merged_df) to CSV without the index column.
merged_df.to_csv(
    path_or_buf='../data/NYC_Census_Data_Merged_Cleaned.csv',
    index=False,
)

print("DataFrame saved to NYC_Census_Data_Merged_Cleaned.csv")

DataFrame saved to NYC_Census_Data_Merged_Cleaned.csv


In [11]:
# Project-local helper used for the MongoDB insert and the radius query below.
from mongo_handler import MongoDBGeoHandler

# Handler for the 'census_data' collection of the 'nyc_data' database;
# TotalPop is passed as the score column name.
census_handler = MongoDBGeoHandler(
    db_name='nyc_data',
    collection_name='census_data',
    score_column_name='TotalPop',
)

# Load the cleaned census rows into the collection.
# NOTE(review): whether re-running this cell inserts the rows a second time
# depends on insert_data, which is not visible here - confirm before re-running.
census_handler.insert_data(merged_df)

# Record count and average score within a 5000-meter radius of a sample point.
# Arguments are presumably (latitude, longitude, radius) - verify against
# MongoDBGeoHandler.count_records_within_radius.
result = census_handler.count_records_within_radius(40.730610, -73.935242, 5000)

print(f"Number of records within the radius: {result['count']}")
print(f"Average TotalPop within the radius: {result['average_score']}")

Data has been successfully inserted into MongoDB in the 'nyc_data' database, 'census_data' collection.
Total documents inserted: 18052
Number of records within the radius: 1166
Average TotalPop within the radius: 3883.8902229845626
