In [1]:
import pandas as pd
import boto3
import io
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Show up to 200 rows and every column (no column limit) when displaying DataFrames
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)

In [2]:
# S3 handles for the project bucket: a resource object and a client object
bucket_name = "capstonehaystacks"
s3 = boto3.client("s3")
s3R = boto3.resource("s3")

# Print the key of every object stored in the bucket
bucket = s3R.Bucket(bucket_name)
for s3_object in bucket.objects.all():
    print(s3_object.key)

ACSDP5Y2021.DP04-Data.csv
ACSST5Y2021.S0101-Data.csv
ACSST5Y2021.S0801-Data.csv
ACSST5Y2021.S1901-Data.csv
GA_LISTINGS_SALES.csv
GA_LISTINGS_SALES_V2.csv
GA_listing.csv
all_zips_grocery_store.json
all_zips_restaurant.json
atlanta-geo.csv
atlanta_cbsa_zip.csv
atlanta_hdma_2021.csv
atlanta_hdma_census_2021.csv
atlanta_hdma_tract_2021.csv
atlanta_listings.csv
census_all.csv
census_all_perCapita.csv
census_data
core_geo_dataset.csv
crime.csv
crime_rating_zipcode.csv
elementary_schools.csv
ga_georgia_zip_codes_geo.min.json
high_schools.csv
hmda_2017_ga_all-records_labels.csv
hmda_2021_ga_all-records_labels.csv
jason_listing.csv
listing-with-poi-distances.csv
listings_with_tract.csv
middle_schools.csv
poi-google-exploded.csv
poi_combined_haystack.csv
poi_combined_haystack_ALL.csv
poi_combined_haystack_ALL_CLEANED.csv
poi_expanded.csv
points-of-interest-google.csv
points-of-interest-google2.csv
points-of-interest-google3.csv
points-of-interest-haystacks.csv
schools.csv
zip_summary.csv
zipcode

In [3]:
# Copy census_all.csv from the bucket into the working directory, then load it
file_name = "census_all.csv"
s3.download_file(Bucket=bucket_name, Key=file_name, Filename=file_name)

# low_memory=False because columns have mixed data types
census = pd.read_csv(file_name, low_memory=False, index_col=False)

In [4]:
# Copy census_all_perCapita.csv from the bucket into the working directory, then load it
file_name = "census_all_perCapita.csv"
s3.download_file(Bucket=bucket_name, Key=file_name, Filename=file_name)

# low_memory=False because columns have mixed data types
census_capita = pd.read_csv(file_name, low_memory=False, index_col=False)

In [5]:
# Copy atlanta_hdma_2021.csv from the bucket into the working directory, then load it
file_name = "atlanta_hdma_2021.csv"
s3.download_file(Bucket=bucket_name, Key=file_name, Filename=file_name)

# low_memory=False because columns have mixed data types
hdma = pd.read_csv(file_name, low_memory=False, index_col=False)

In [6]:
# Dimensions (rows, columns) of the per-capita census table
census_capita.shape

(207, 800)

In [7]:
# Replace every missing value in the frame with 0. Per the original note the gaps are
# in the restaurant above/below 4.7-rating columns.
# NOTE(review): fillna(0) applies to ALL columns — confirm no other column carries
# NaNs that should be handled differently.
census_capita = census_capita.fillna(0)

In [8]:
# Give the first HMDA column the name 'zipcode'
first_hdma_column = hdma.columns[0]
hdma = hdma.rename(columns={first_hdma_column: 'zipcode'})

In [10]:
# Attach each zip code's HMDA approval_percentage to the per-capita census table.
# Left join: every census row is kept; rows without an HMDA match get NaN.
hdma_approval = hdma[['zipcode', 'approval_percentage']]
merged_df = census_capita.merge(
    hdma_approval,
    how='left',
    on='zipcode',
    suffixes=('', '_hdma'),
)



In [12]:
# Move approval_percentage so that it sits at position 1 of the column order.
# The column is located by name rather than by the hard-coded position 800, so
# re-running this cell leaves the order unchanged instead of pulling whichever
# column happens to be last to the front.
cols = list(merged_df.columns)
cols.remove('approval_percentage')
cols.insert(1, 'approval_percentage')

# reindexing
merged_df = merged_df[cols]

In [13]:
# Preview the merged frame to check the new column order
merged_df

Unnamed: 0,zipcode,approval_percentage,restaurant_above_4.7_percentage,total_workers,car_commute,travel_less_10,travel_10_14,travel_15_19,travel_20_24,travel_25_29,travel_30_34,travel_35_44,travel_45_59,travel_more_60,travel_mean,population,percent_male,percent_under_15,percent_teen_15_19,percent_college_20_24,percent_25_39,percent_40-59,percent_over_60,total_units,rental_vacancy_rate,median_homeowner_value,median_rental_value,percent_owner_occupied,percent_after_2019,rent_less_15_percent_income,rent_over_30_percent,rent_15_30_percent,rent_less_999,rent_1000_2500,rent_over_2500,gross_rental_yield,total_households,percent_less_10k,percent_10k_15k,percent_15k_25k,percent_25k_35k,percent_35k_50k,percent_50k_75k,percent_75k_100k,percent_100k_150k,percent_150k_200k,percent_more_200k,household_median_income,household_mean_income,accountant_per_Capita,addiction_treatment_center_per_Capita,advertising_agency_per_Capita,after_school_program_per_Capita,air_conditioning_contractor_per_Capita,air_conditioning_repair_service_per_Capita,air_conditioning_system_supplier_per_Capita,air_duct_cleaning_service_per_Capita,allergist_per_Capita,alternative_fuel_station_per_Capita,alternative_medicine_practitioner_per_Capita,american_restaurant_per_Capita,amusement_center_per_Capita,animal_control_service_per_Capita,animal_hospital_per_Capita,antique_furniture_store_per_Capita,antique_store_per_Capita,apartment_building_per_Capita,apartment_complex_per_Capita,apartment_rental_agency_per_Capita,appliance_repair_service_per_Capita,appliance_store_per_Capita,art_center_per_Capita,art_gallery_per_Capita,asian_fusion_restaurant_per_Capita,asian_restaurant_per_Capita,assisted_living_facility_per_Capita,association_or_organization_per_Capita,atm_per_Capita,attorney_per_Capita,audiologist_per_Capita,auto_air_conditioning_service_per_Capita,auto_body_parts_supplier_per_Capita,auto_body_shop_per_Capita,auto_broker_per_Capita,auto_dent_removal_service_per_Capita,auto_electrical_service_per_Capita,au
to_glass_shop_per_Capita,auto_insurance_agency_per_Capita,auto_parts_store_per_Capita,auto_radiator_repair_service_per_Capita,auto_repair_shop_per_Capita,auto_restoration_service_per_Capita,auto_tune_up_service_per_Capita,auto_wrecker_per_Capita,automobile_storage_facility_per_Capita,baby_clothing_store_per_Capita,baby_store_per_Capita,bagel_shop_per_Capita,bail_bonds_service_per_Capita,bakery_per_Capita,...,taco_restaurant_per_Capita,tailor_per_Capita,takeout_restaurant_per_Capita,tanning_salon_per_Capita,tattoo_shop_per_Capita,tax_consultant_per_Capita,tax_preparation_per_Capita,tax_preparation_service_per_Capita,tea_house_per_Capita,teeth_whitening_service_per_Capita,telecommunications_service_provider_per_Capita,telephone_company_per_Capita,temp_agency_per_Capita,tennis_court_per_Capita,tex-mex_restaurant_per_Capita,thai_restaurant_per_Capita,thrift_store_per_Capita,tile_store_per_Capita,tire_shop_per_Capita,tobacco_shop_per_Capita,tool_rental_service_per_Capita,tool_store_per_Capita,topsoil_supplier_per_Capita,tourist_attraction_per_Capita,towing_service_per_Capita,townhouse_complex_per_Capita,toy_store_per_Capita,traditional_american_restaurant_per_Capita,trailer_dealer_per_Capita,trailer_rental_service_per_Capita,trailer_repair_shop_per_Capita,trailer_supply_store_per_Capita,transmission_shop_per_Capita,transportation_service_per_Capita,travel_agency_per_Capita,tree_service_per_Capita,trial_attorney_per_Capita,truck_accessories_store_per_Capita,truck_dealer_per_Capita,truck_parts_supplier_per_Capita,truck_rental_agency_per_Capita,truck_repair_shop_per_Capita,truck_stop_per_Capita,trucking_company_per_Capita,tuxedo_shop_per_Capita,uniform_store_per_Capita,united_methodist_church_per_Capita,university_per_Capita,upholstery_cleaning_service_per_Capita,upholstery_shop_per_Capita,urgent_care_center_per_Capita,urologist_per_Capita,used_auto_parts_store_per_Capita,used_car_dealer_per_Capita,used_tire_shop_per_Capita,used_truck_dealer_per_Capita,van_rental_agency_per
_Capita,vaporizer_store_per_Capita,variety_store_per_Capita,vegan_restaurant_per_Capita,vegetarian_restaurant_per_Capita,veterans_organization_per_Capita,veterinarian_per_Capita,video_arcade_per_Capita,video_game_store_per_Capita,video_production_service_per_Capita,vietnamese_restaurant_per_Capita,vitamin_&_supplements_store_per_Capita,walk-in_clinic_per_Capita,wallpaper_store_per_Capita,warehouse_per_Capita,waste_management_service_per_Capita,watch_repair_service_per_Capita,watch_store_per_Capita,water_damage_restoration_service_per_Capita,waxing_hair_removal_service_per_Capita,website_designer_per_Capita,wedding_photographer_per_Capita,wedding_planner_per_Capita,wedding_service_per_Capita,wedding_store_per_Capita,wedding_venue_per_Capita,weight_loss_service_per_Capita,wellness_center_per_Capita,wellness_program_per_Capita,wheel_alignment_service_per_Capita,wheel_store_per_Capita,wholesaler_per_Capita,wig_shop_per_Capita,window_installation_service_per_Capita,window_tinting_service_per_Capita,window_treatment_store_per_Capita,wine_bar_per_Capita,wine_store_per_Capita,womens_clothing_store_per_Capita,womens_health_clinic_per_Capita,wood_floor_installation_service_per_Capita,yoga_studio_per_Capita,restaurant_above_4.7_per_Capita,restaurant_below_4.7_per_Capita
0,30002,87.51,11.111111,3221,77.1,5.1,9.6,14.1,15.7,3.6,24.0,10.2,8.8,8.9,29.7,6025,49.0,19.4,1.6,2.1,25.4,27.6,24.1,3149,7.0,432400.0,933.0,56.5,21.0,12.4,64.2,23.5,72.5,26.7,0.8,2.589269,2810,2.1,9.3,6.7,13.5,5.2,10.7,12.8,15.1,11.5,13.2,79091.0,114933.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.331950,0.000000,0.000000,0.165975,0.000000,0.000000,0.331950,0.165975,0.165975,0.000000,0.000000,0.165975,0.165975,0.000000,0.165975,0.000000,0.000000,0.165975,0.000000,0.000000,0.000000,0.000000,0.165975,0.0,0.000000,0.000000,0.000000,0.165975,0.000000,0.000000,0.000000,0.000000,0.000000,0.165975,0.000000,0.000000,0.000000,0.000000,0.165975,0.165975,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.331950,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.165975,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.165975,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.165975,0.165975,0.331950,0.165975,0.0,0.000000,0.165975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.165975,0.000000,0.000000,0.000000,0.000000,0.165975,1.327801
1,30004,90.60,9.523810,34618,75.4,10.6,11.9,14.7,15.3,3.1,9.8,7.9,12.8,13.7,29.9,66315,49.0,22.3,7.3,2.7,18.2,35.3,14.1,24391,4.5,444200.0,1478.0,76.4,10.3,9.7,36.7,53.5,7.9,85.0,7.0,3.992796,23933,3.0,1.6,2.8,2.6,6.6,8.2,14.1,17.9,13.6,29.6,129724.0,176246.0,0.000000,0.000000,0.000000,0.01508,0.090477,0.060318,0.000000,0.030159,0.000000,0.015080,0.000000,0.105557,0.000000,0.030159,0.060318,0.000000,0.000000,0.045239,0.045239,0.000000,0.015080,0.045239,0.000000,0.015080,0.045239,0.150795,0.000000,0.015080,0.211114,0.030159,0.000000,0.030159,0.015080,0.090477,0.0,0.030159,0.015080,0.000000,0.060318,0.180955,0.030159,0.437307,0.015080,0.060318,0.015080,0.015080,0.000000,0.000000,0.045239,0.000000,0.150795,...,0.045239,0.0,0.256352,0.030159,0.0,0.0,0.000000,0.000000,0.000000,0.015080,0.045239,0.075398,0.000000,0.015080,0.060318,0.060318,0.000000,0.045239,0.180955,0.015080,0.030159,0.045239,0.045239,0.015080,0.045239,0.030159,0.045239,0.000000,0.0,0.000000,0.000000,0.030159,0.045239,0.000000,0.000000,0.075398,0.000000,0.060318,0.000000,0.0,0.030159,0.01508,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.180955,0.015080,0.030159,0.000000,0.030159,0.01508,0.030159,0.045239,0.015080,0.075398,0.000000,0.0,0.030159,0.015080,0.045239,0.000000,0.015080,0.030159,0.030159,0.000000,0.0,0.000000,0.045239,0.000000,0.0,0.030159,0.000000,0.000000,0.075398,0.030159,0.015080,0.01508,0.030159,0.030159,0.075398,0.000000,0.000000,0.030159,0.030159,0.045239,0.045239,0.000000,0.015080,0.015080,0.015080,0.060318,0.573023
2,30005,91.29,5.128205,21241,71.2,8.8,15.0,13.6,16.9,4.2,11.1,6.6,10.5,13.3,28.7,40309,51.0,22.1,8.2,4.0,16.9,34.1,14.7,14248,8.8,464100.0,1712.0,71.9,11.8,17.4,36.3,46.2,1.1,91.3,7.6,4.426632,13498,1.7,1.2,2.4,1.8,4.2,7.7,9.4,21.2,13.9,36.6,152326.0,181245.0,0.049617,0.000000,0.049617,0.00000,0.024808,0.000000,0.024808,0.024808,0.099233,0.024808,0.024808,0.297700,0.024808,0.024808,0.024808,0.000000,0.000000,0.223275,0.198467,0.074425,0.000000,0.000000,0.024808,0.000000,0.024808,0.099233,0.024808,0.024808,0.148850,0.124042,0.049617,0.024808,0.024808,0.000000,0.0,0.000000,0.024808,0.000000,0.124042,0.024808,0.000000,0.074425,0.000000,0.024808,0.000000,0.000000,0.000000,0.000000,0.099233,0.000000,0.248084,...,0.049617,0.0,0.396934,0.024808,0.0,0.0,0.049617,0.000000,0.024808,0.074425,0.074425,0.000000,0.074425,0.000000,0.000000,0.024808,0.000000,0.074425,0.049617,0.024808,0.000000,0.000000,0.000000,0.124042,0.000000,0.024808,0.000000,0.049617,0.0,0.000000,0.000000,0.000000,0.000000,0.074425,0.000000,0.000000,0.024808,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.049617,0.0,0.024808,0.00000,0.0,0.000000,0.0,0.0,0.024808,0.024808,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.148850,0.124042,0.000000,0.049617,0.000000,0.0,0.024808,0.000000,0.024808,0.024808,0.024808,0.024808,0.000000,0.024808,0.0,0.024808,0.024808,0.024808,0.0,0.000000,0.024808,0.000000,0.074425,0.049617,0.049617,0.00000,0.049617,0.000000,0.000000,0.000000,0.049617,0.024808,0.024808,0.024808,0.049617,0.024808,0.049617,0.000000,0.000000,0.049617,0.917909
3,30008,85.11,0.000000,15093,83.3,5.6,6.0,13.6,20.3,4.9,22.8,7.9,9.0,10.0,30.5,35023,49.0,21.3,6.1,8.0,23.7,25.8,15.2,12420,2.0,200700.0,1195.0,60.2,8.8,7.5,63.2,29.3,32.7,67.4,0.0,7.144993,11648,6.5,3.3,9.1,9.6,11.2,20.2,12.0,18.3,7.1,2.6,60739.0,74072.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028553,0.256974,0.000000,0.028553,0.028553,0.000000,0.000000,0.171316,0.171316,0.057105,0.000000,0.028553,0.000000,0.000000,0.000000,0.057105,0.028553,0.028553,0.057105,0.057105,0.000000,0.000000,0.028553,0.085658,0.0,0.057105,0.028553,0.028553,0.028553,0.114211,0.000000,0.428290,0.028553,0.028553,0.028553,0.057105,0.000000,0.000000,0.000000,0.142763,0.028553,...,0.000000,0.0,0.142763,0.000000,0.0,0.0,0.000000,0.057105,0.000000,0.000000,0.000000,0.057105,0.000000,0.028553,0.000000,0.000000,0.000000,0.000000,0.228421,0.114211,0.028553,0.000000,0.000000,0.028553,0.057105,0.028553,0.057105,0.000000,0.0,0.028553,0.000000,0.028553,0.114211,0.000000,0.000000,0.000000,0.028553,0.057105,0.000000,0.0,0.028553,0.00000,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.028553,0.0,0.0,0.000000,0.000000,0.057105,0.000000,0.000000,0.057105,0.028553,0.00000,0.000000,0.000000,0.000000,0.057105,0.028553,0.0,0.028553,0.000000,0.028553,0.000000,0.000000,0.000000,0.057105,0.000000,0.0,0.028553,0.000000,0.028553,0.0,0.000000,0.000000,0.000000,0.028553,0.000000,0.000000,0.00000,0.028553,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028553,0.000000,0.000000,0.000000,0.000000,0.000000,0.428290
4,30009,90.17,18.367347,10425,65.5,14.7,16.1,16.4,12.7,4.3,8.9,2.9,12.2,11.8,27.0,19922,52.0,20.3,5.0,7.0,16.8,30.8,20.1,8278,7.1,456300.0,1764.0,63.7,19.3,14.0,40.3,45.7,10.4,66.1,23.5,4.639053,7563,1.8,2.3,4.6,4.6,8.0,8.9,9.6,15.2,14.7,30.3,136384.0,178231.0,0.000000,0.050196,0.000000,0.00000,0.050196,0.000000,0.000000,0.050196,0.050196,0.050196,0.000000,0.702741,0.000000,0.000000,0.100392,0.050196,0.050196,0.100392,0.150587,0.000000,0.050196,0.050196,0.000000,0.100392,0.000000,0.100392,0.000000,0.050196,0.401566,0.050196,0.000000,0.100392,0.000000,0.150587,0.0,0.000000,0.050196,0.000000,0.050196,0.250979,0.050196,0.401566,0.050196,0.050196,0.000000,0.150587,0.050196,0.000000,0.000000,0.000000,0.250979,...,0.000000,0.0,0.351370,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.050196,0.100392,0.050196,0.000000,0.000000,0.100392,0.050196,0.000000,0.000000,0.401566,0.050196,0.100392,0.000000,0.000000,0.150587,0.050196,0.050196,0.000000,0.000000,0.0,0.050196,0.000000,0.050196,0.200783,0.050196,0.050196,0.000000,0.050196,0.100392,0.100392,0.0,0.100392,0.00000,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.451762,0.050196,0.050196,0.100392,0.100392,0.00000,0.150587,0.100392,0.050196,0.050196,0.000000,0.0,0.000000,0.050196,0.050196,0.000000,0.000000,0.000000,0.050196,0.000000,0.0,0.150587,0.351370,0.150587,0.0,0.050196,0.100392,0.050196,0.150587,0.050196,0.301175,0.00000,0.200783,0.050196,0.000000,0.100392,0.050196,0.150587,0.100392,0.150587,0.050196,1.054111,0.000000,0.100392,0.050196,0.451762,2.007831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,31085,85.89,0.000000,445,88.8,9.7,7.0,3.8,10.6,3.2,22.2,5.7,6.6,31.2,43.1,1150,51.0,9.0,6.0,3.7,14.3,29.4,37.5,455,0.0,81800.0,943.0,91.3,1.4,0.0,96.8,3.2,100.0,0.0,0.0,13.833741,414,14.7,2.7,3.1,22.9,12.3,8.0,13.0,15.7,1.9,5.6,37500.0,68342.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
203,31097,82.16,0.000000,519,94.6,11.2,6.4,19.5,24.9,2.4,13.7,2.2,12.7,7.0,28.1,1437,50.0,18.3,3.8,9.0,14.8,22.5,31.5,613,0.0,146900.0,663.0,86.4,2.5,61.1,30.5,8.4,88.9,5.6,5.6,5.415929,478,3.1,1.7,7.5,5.0,10.3,36.8,17.4,11.7,3.1,3.3,59275.0,74733.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.695894,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
204,31816,75.15,0.000000,1846,89.8,25.8,14.5,5.2,2.8,2.6,5.9,16.4,18.9,7.7,30.7,4956,44.0,22.2,11.7,3.4,16.7,23.0,23.0,2577,0.0,110700.0,833.0,58.3,7.5,16.3,65.8,18.0,87.8,12.1,0.0,9.029810,2066,15.4,6.1,11.7,10.3,18.9,14.0,11.7,9.1,1.0,1.7,37280.0,54382.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.403551,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.201776,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.605327,0.201776,0.000000,0.000000,0.000000,0.201776,0.0,0.000000,0.000000,0.000000,0.201776,0.201776,0.000000,0.403551,0.000000,0.000000,0.201776,0.000000,0.000000,0.201776,0.000000,0.000000,0.000000,...,0.000000,0.0,0.201776,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.403551,0.201776,0.000000,0.201776,0.000000,0.000000,0.403551,0.000000,0.403551,0.000000,0.0,0.000000,0.201776,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.201776,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.201776,0.000000,0.000000,0.000000,0.000000,0.403551
205,31822,81.78,21.428571,2502,91.8,11.6,9.4,10.2,13.7,7.4,24.5,4.5,10.3,8.4,28.4,5899,49.0,16.6,6.1,4.9,17.4,24.0,31.0,2756,1.8,165900.0,794.0,79.2,6.6,18.8,39.6,41.7,88.0,8.7,3.3,5.743219,2316,9.0,4.7,10.0,9.6,16.3,14.6,9.6,18.0,1.2,6.9,52813.0,76390.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.508561,0.000000,0.000000,0.000000,0.169520,0.678081,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.339041,0.000000,0.000000,0.000000,0.000000,0.508561,0.000000,0.000000,0.000000,0.000000,0.169520,0.0,0.000000,0.000000,0.000000,0.000000,0.339041,0.000000,0.169520,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.169520,...,0.000000,0.0,0.169520,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.339041,0.000000,0.000000,0.169520,0.000000,0.169520,0.000000,1.695203,0.000000,0.000000,0.169520,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.16952,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.16952,0.000000,0.000000,0.000000,0.169520,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.339041,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.169520,0.000000,0.000000,0.000000,0.000000,0.000000,0.508561,1.864723


In [None]:
# Print each column's positional index next to its name

for position in range(len(merged_df.columns)):
    print(f"Column {position}: {merged_df.columns[position]}")

In [None]:
# Drop the columns that are not kept as percentage features: raw counts (workers,
# population, units, households), dollar amounts (home value, rent, income) and
# the restaurant_above_4.7_percentage column.
# These are the columns previously addressed as positions
# [2, 3, 15, 23, 25, 26, 36, 47, 48]. Naming them makes the cell fail loudly
# (KeyError) on a second run or after an upstream reorder, instead of silently
# dropping whatever now occupies those positions.
columns_to_drop = [
    'restaurant_above_4.7_percentage',
    'total_workers',
    'population',
    'total_units',
    'median_homeowner_value',
    'median_rental_value',
    'total_households',
    'household_median_income',
    'household_mean_income',
]
merged_df = merged_df.drop(columns=columns_to_drop)

In [None]:
# Use the zip code as the row index. Selecting the column by name (instead of
# merged_df.columns[0] with inplace=True) means a second run raises a KeyError
# rather than silently promoting the next column to the index.
merged_df = merged_df.set_index('zipcode')


In [None]:
# Summary statistics (count, mean, std, quartiles) for the merged frame
merged_df.describe()


In [None]:
# Row display limit. Swap in the commented line below to show every row instead.
# pd.set_option("display.max_rows", None)
pd.set_option("display.max_rows", 200)

In [None]:
# Correlation of every column with gross_rental_yield, strongest positive first
corr_matrix = merged_df.corr()
correlation = corr_matrix.loc[:, 'gross_rental_yield']
correlation.sort_values(ascending=False)

In [None]:
# Correlation of every column with approval_percentage, strongest positive first
corr_matrix = merged_df.corr()
correlation = corr_matrix.loc[:, 'approval_percentage']
correlation.sort_values(ascending=False)


## Transforming data

In [None]:
# Divide the first 39 columns (positions 0-38) by 100, turning percent points into fractions.
# NOTE(review): not idempotent — running this cell twice divides by 100 twice.
# NOTE(review): positions 0-38 appear to include travel_mean, which does not look
# like a percentage — confirm it is meant to be rescaled as well.
columns_to_divide = list(range(0, 39))
merged_df.iloc[:, columns_to_divide] = merged_df.iloc[:, columns_to_divide] / 100


In [None]:
# Row-wise sum over the POI columns (positions 39-790), stored as total_poi_per_1000
poi_values = merged_df.iloc[:, 39:791]
merged_df['total_poi_per_1000'] = poi_values.sum(axis=1)


In [None]:
# Convert each POI column (positions 39-790) into that row's share of its POI total.

columns_to_convert = list(range(39, 791))
total_sum = merged_df.iloc[:, columns_to_convert].sum(axis=1)

# A row whose POI values are all 0 has a total of 0, and 0 / 0 evaluates to NaN.
# Store 0 for those rows so NaNs do not propagate into the scaling / KNN cells.
poi_share = merged_df.iloc[:, columns_to_convert].div(total_sum, axis=0).fillna(0)
merged_df.iloc[:, columns_to_convert] = poi_share


In [None]:
# Remove the helper column total_poi_per_1000 again. Dropping it by name (rather than
# "whatever column is last") keeps a second run from deleting a real POI column;
# a re-run now raises a KeyError instead.
merged_df = merged_df.drop(columns='total_poi_per_1000')


In [None]:
# Cap displayed rows at 200, then list the dtype of every column
pd.set_option("display.max_rows", 200)
merged_df.dtypes

In [None]:
# How well does each census column, on its own, predict approval_percentage?

target = merged_df['approval_percentage']

# The census block occupies positions 0-38. The previous slice 0:40 also pulled in
# position 39 (the first POI column) and kept approval_percentage itself, which
# trivially scores R-squared = 1.0 when regressed on its own values. Exclude both.
features = merged_df.iloc[:, 0:39].drop(columns='approval_percentage')

# Fit a one-variable linear regression per column and report its held-out R-squared
for col in features.columns:
    X_train, X_test, y_train, y_test = train_test_split(
        features[col].values.reshape(-1, 1), target, test_size=0.2, random_state=42
    )

    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    score = regression_model.score(X_test, y_test)
    print(f"Column {col} - R-squared: {score}")


In [None]:
# Names of the census / approval columns, used to separate them from the POI columns.
# Order matters: it is the order in which these columns are reported later on.

target_list = (
    # loan approval and commuting mode
    ["approval_percentage", "car_commute"]
    # travel-time-to-work bands and mean
    + ["travel_less_10", "travel_10_14", "travel_15_19", "travel_20_24", "travel_25_29"]
    + ["travel_30_34", "travel_35_44", "travel_45_59", "travel_more_60", "travel_mean"]
    # sex and age structure
    + ["percent_male", "percent_under_15", "percent_teen_15_19", "percent_college_20_24"]
    + ["percent_25_39", "percent_40-59", "percent_over_60"]
    # housing and rent
    + ["rental_vacancy_rate", "percent_owner_occupied", "percent_after_2019"]
    + ["rent_less_15_percent_income", "rent_over_30_percent", "rent_15_30_percent"]
    + ["rent_less_999", "rent_1000_2500", "rent_over_2500", "gross_rental_yield"]
    # household income bands
    + ["percent_less_10k", "percent_10k_15k", "percent_15k_25k", "percent_25k_35k"]
    + ["percent_35k_50k", "percent_50k_75k", "percent_75k_100k", "percent_100k_150k"]
    + ["percent_150k_200k", "percent_more_200k"]
)


## Looking into correlations of POI data with target_list

In [None]:

# Columns that are not in target_list
other_columns = [column for column in merged_df.columns if column not in target_list]

# Correlation of every target_list column (rows) with every other column (columns),
# computed in one pass instead of one Series.corr call per pair.
cross_correlation = merged_df.corr().loc[target_list, other_columns]

# Nested mapping {target_column: {other_column: correlation}}
correlation_results = cross_correlation.T.to_dict()

# Highest and lowest correlated partner for each column in target_list.
# Undefined (NaN) correlations are dropped first: max()/min() over values that
# include NaN can return the NaN entry, because every comparison with NaN is False.
highest_correlation = []
lowest_correlation = []
for target_column, row in cross_correlation.iterrows():
    defined = row.dropna()
    if defined.empty:
        # no defined correlation for this target; nothing to report
        continue
    highest_correlation.append((target_column, defined.idxmax(), defined.max()))
    lowest_correlation.append((target_column, defined.idxmin(), defined.min()))

# sorting (both lists in descending order of correlation)
highest_correlation.sort(key=lambda pair: pair[2], reverse=True)
lowest_correlation.sort(key=lambda pair: pair[2], reverse=True)

# printing results
print("Columns in target_list with highest correlation:")
for pair in highest_correlation:
    print(pair[0], "to", pair[1], "- Correlation:", pair[2])

print("\nColumns in target_list with lowest correlation:")
for pair in lowest_correlation:
    print(pair[0], "to", pair[1], "- Correlation:", pair[2])


In [None]:


# Only correlations whose absolute value exceeds this threshold are reported.
# (The earlier comment in this cell said 0.6, but the code has always used 0.5.)
CORRELATION_THRESHOLD = 0.5

# list of columns not in target_list
other_columns = [column for column in merged_df.columns if column not in target_list]

# correlation between columns in target_list (rows) and other_columns (columns)
cross_correlation = merged_df.corr().loc[target_list, other_columns]

# {target_column: {other_column: correlation}} restricted to the strong pairs;
# NaN correlations never pass the comparison and are therefore left out
correlation_results = {}
for target_column, row in cross_correlation.iterrows():
    strong = row[row.abs() > CORRELATION_THRESHOLD]
    correlation_results[target_column] = strong.to_dict()

# printing the columns with correlations above 0.5 or below -0.5
for target_column, strong_pairs in correlation_results.items():
    print("Correlations for", target_column)
    for other_column, correlation in strong_pairs.items():
        print(target_column, "to", other_column, "- Correlation:", correlation)


## Separating census data from POI data

In [None]:
# Split merged_df column-wise: the target_list columns go into target_df,
# all remaining columns into features_df
target_df = merged_df.loc[:, target_list]
features_df = merged_df.drop(columns=target_list)

# KNN

In [None]:
# Look at the distribution of approval_percentage before binning it into categories

fig, ax = plt.subplots()
ax.hist(target_df['approval_percentage'], bins=10, edgecolor='black')

# axis labels and title
ax.set_xlabel('Approval Percentage')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Approval Percentage')

plt.show()


In [None]:

# Bin edges and the label for each of the four resulting bands
category_ranges = [0, 0.8, 0.85, 0.9, 1]
category_names = ['approv_below_80', 'approv_80_85', 'approv_85_90', 'approve_above_90']

# assign() returns a new frame, so target_df itself is left untouched
approval_bands = pd.cut(target_df['approval_percentage'], bins=category_ranges, labels=category_names)
target_df_copy = target_df.assign(approval_category=approval_bands)


In [None]:
# KNN: classify each row's approval band from the feature columns

X = features_df
y = target_df_copy['approval_category']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Standardise with statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier fitted on the scaled training data
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=1: metrics that would divide by zero are reported as 1
report = classification_report(y_test, y_pred, zero_division=1)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


In [None]:
# Remove the row display limit so long outputs are shown in full
pd.set_option("display.max_rows", None)

In [None]:
# Look at the distribution of gross_rental_yield before binning it into categories

fig, ax = plt.subplots()
ax.hist(target_df['gross_rental_yield'], bins=10, edgecolor='black')

# axis labels and title
ax.set_xlabel('Gross Rental Yield')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Gross Rental Yield')

plt.show()



In [None]:
# Turn the continuous gross_rental_yield into four labelled bands

# bin edges (the last band is open-ended) and the label for each band
category_ranges = [0, 0.04, 0.06, 0.08, float('inf')]
category_names = ['yield_below_0.04', 'yield_0.04_0.06', 'yield_0.06_0.08', 'yield_above_0.08']

# assign() returns a new frame, so target_df itself is left untouched
yield_bands = pd.cut(target_df['gross_rental_yield'], bins=category_ranges, labels=category_names)
target_df_copy = target_df.assign(yield_category=yield_bands)


In [None]:
# KNN: classify each row's rental-yield band from the feature columns
X = features_df
y = target_df_copy['yield_category']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Standardise with statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3-nearest-neighbour classifier fitted on the scaled training data
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

# zero_division=1: metrics that would divide by zero are reported as 1
report = classification_report(y_test, y_pred, zero_division=1)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


In [None]:
# car_commute: the cells below look at the distribution of the car_commute column
# and bin it. (Previously a bare `car_commute` expression, which raises NameError
# because no variable of that name is defined.)

In [None]:
# Distribution of the car_commute column.
# `plt` comes from the import cell at the top of the notebook, so it is not
# re-imported here.
fig, ax = plt.subplots()
ax.hist(target_df['car_commute'], bins=10, edgecolor='black')

# Axis labels and title
ax.set_xlabel('Car Commute')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Percentage Car Commuting')

# Display the histogram
plt.show()


In [None]:

# Creating equal-width bins to turn car_commute into a categorical column
num_bins = 4

# num_bins + 1 evenly spaced edges spanning the observed min..max of the column
bin_boundaries = np.linspace(
    target_df_copy['car_commute'].min(),
    target_df_copy['car_commute'].max(),
    num_bins + 1,
)

# One label per bin, built from that bin's lower and upper edge
category_names = [f'commute_{bin_boundaries[i]:.2f}_{bin_boundaries[i+1]:.2f}' for i in range(num_bins)]

# Assign each row its bin label (added to the existing target_df_copy; no new copy is made).
# The first edge IS the column minimum and pd.cut intervals are left-open by default, so
# without include_lowest=True the row(s) holding the minimum fall outside every bin and
# come back as NaN.
target_df_copy['car_commute_category'] = pd.cut(
    target_df_copy['car_commute'],
    bins=bin_boundaries,
    labels=category_names,
    include_lowest=True,
)


In [None]:
# Filling in any car_commute_category value that is still NaN with the lowest bin.
# The bin labels are generated from the column's min/max, so the label is read from the
# categorical itself (its first category) instead of being typed in by hand as
# 'commute_0.22_0.42'; a hand-typed label stops being a valid category as soon as the
# data, and therefore the bin edges, change.
# NOTE(review): assumes the hand-typed label was the first (lowest) bin and that the
# unfilled row is the one equal to the column minimum - confirm.
lowest_commute_bin = target_df_copy['car_commute_category'].cat.categories[0]
target_df_copy['car_commute_category'] = target_df_copy['car_commute_category'].fillna(lowest_commute_bin)


# I think below is what we want

In [None]:
# from sklearn.neighbors import NearestNeighbors
# from sklearn.preprocessing import StandardScaler

# X = features_df  # Your feature columns
# y = target_df_copy['car_commute_category']  # The new categorical target column

# # Assuming you have a single target row for which you want to find similar rows
# target_row = X.iloc[0].values  # Assuming the first row as the target and extracting the values

# # Scale the data
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Calculate distances to find similar rows
# neigh = NearestNeighbors(n_neighbors=5, metric='euclidean')  # Choose the value of K and distance metric
# neigh.fit(X_scaled)
# distances, indices = neigh.kneighbors(scaler.transform([target_row]))  # Pass target_row as a list

# similar_rows = X.iloc[indices[0]]

# print("Target Row:")
# print(target_row)
# print("\nSimilar Rows:")
# print(similar_rows)



In [None]:
# NOTE(review): the only nearby assignment of `target_row` is inside the fully
# commented-out NearestNeighbors cell, so this presumably raises NameError on a fresh
# kernel unless it is defined elsewhere - confirm, then re-enable that cell or drop this one.
target_row

In [None]:
# KNN classification of the binned car-commute share from the feature columns
X = features_df  # feature columns
y = target_df_copy['car_commute_category']  # categorical target column

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then reuse it for the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 scores an undefined precision/recall as 0 without emitting a warning.
# The previous value of 1 reported such classes as perfect and inflated the averages.
report = classification_report(y_test, y_pred, zero_division=0)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

In [None]:
# Histogram of the percent_more_200k column, drawn on an explicit axes
fig, ax = plt.subplots()
ax.hist(target_df['percent_more_200k'], bins=10, edgecolor='black')

# Labels and title set in one call
ax.set(
    xlabel='percent_more_200k',
    ylabel='Frequency',
    title='Distribution of percent_more_200k',
)

plt.show()

In [None]:


# Bin edges and one label per interval for percent_more_200k
category_ranges = [0.0, 0.0911, 0.211, 1]
category_names = ['below_.09', 'between.09_.2',  'above_0.2']

# Creating a new categorical column based on the 'percent_more_200k' column.
# NOTE(review): re-copying target_df here discards the yield_category and
# car_commute_category columns added to the earlier copy - confirm this reset is intended.
target_df_copy = target_df.copy()
# pd.cut intervals are left-open by default, so a value of exactly 0.0 would fall outside
# the first bin and become NaN. include_lowest=True closes the first interval on the
# left so those rows are labelled 'below_.09'.
target_df_copy['percent_more_200k_category'] = pd.cut(
    target_df_copy['percent_more_200k'],
    bins=category_ranges,
    labels=category_names,
    include_lowest=True,
)


In [None]:
# Filling the remaining missing category values with the lowest band
filled_200k_category = target_df_copy['percent_more_200k_category'].fillna('below_.09')
target_df_copy['percent_more_200k_category'] = filled_200k_category


In [None]:
# KNN classification of the binned percent_more_200k share from the feature columns
X = features_df
y = target_df_copy['percent_more_200k_category']

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then reuse it for the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 scores an undefined precision/recall as 0 without emitting a warning.
# The previous value of 1 reported such classes as perfect and inflated the averages.
report = classification_report(y_test, y_pred, zero_division=0)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

In [None]:
# Per-column count of missing values in the working copy
target_df_copy.isnull().sum()