In [1]:
import boto3
import pandas as pd
import numpy as np
from io import StringIO
import json


Loading the CSVs from AWS S3

In [3]:
# S3 client and the bucket that holds the project data
s3 = boto3.client('s3')
bucket_name = 'capstonehaystacks'

# CSV objects to fetch; each object key doubles as the local file name
csv_files = [
    'atlanta_cbsa_zip.csv',
    'core_geo_dataset.csv',
    'crime_rating_zipcode.csv',
    'elementary_schools.csv',
    'GA_LISTINGS_SALES_V2.csv',
    'GA_LISTINGS_SALES.csv',
    'high_schools.csv',
    'middle_schools.csv'
]

# Download every CSV into the working directory and parse it.
# Frames are keyed by the file name with its 4-character '.csv' suffix sliced off.
# index_col=False only stops pandas from using the first column as the index;
# the index column saved inside the files still loads as 'Unnamed: 0' and is
# dropped in a later cell.
dataframes = {}
for csv_name in csv_files:
    frame_key = csv_name[:-4]
    s3.download_file(Bucket=bucket_name, Key=csv_name, Filename=csv_name)
    dataframes[frame_key] = pd.read_csv(csv_name, index_col=False)

# Short aliases for the loaded frames (same objects as the dict values)
atlanta_cbsa_zip_df     = dataframes['atlanta_cbsa_zip']
core_geo_dataset_df     = dataframes['core_geo_dataset']
crime_rating_zipcode_df = dataframes['crime_rating_zipcode']
elementary_schools_df   = dataframes['elementary_schools']
middle_schools_df       = dataframes['middle_schools']
high_schools_df         = dataframes['high_schools']
GA_LISTINGS_SALES_df    = dataframes['GA_LISTINGS_SALES']
GA_LISTINGS_SALES_V2_df = dataframes['GA_LISTINGS_SALES_V2']



In [4]:
# Remove the index column that was saved into each CSV (pandas loads it as 'Unnamed: 0').
# - inplace=True is deliberate: the *_df aliases created when the CSVs were loaded
#   refer to these same DataFrame objects, so the drop must modify them in place
#   for the aliases to see it.
# - errors='ignore' makes the cell safe to re-run; without it a second run raises
#   KeyError because the column is already gone.
for frame in dataframes.values():
    frame.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')


In [5]:
# Heading / frame pairs, in display order.  Every heading after the first
# starts with a newline so the reports are separated by a blank line.
column_reports = [
    ("Columns in atlanta_cbsa_zip_df:", atlanta_cbsa_zip_df),
    ("\nColumns in core_geo_dataset_df:", core_geo_dataset_df),
    ("\nColumns in crime_rating_zipcode_df:", crime_rating_zipcode_df),
    ("\nColumns in elementary_schools_df:", elementary_schools_df),
    ("\nColumns in GA_LISTINGS_SALES_V2_df:", GA_LISTINGS_SALES_V2_df),
    ("\nColumns in GA_LISTINGS_SALES_df:", GA_LISTINGS_SALES_df),
    ("\nColumns in high_schools_df:", high_schools_df),
    ("\nColumns in middle_schools_df:", middle_schools_df),
]

# Print each heading followed by that frame's column index
for heading, frame in column_reports:
    print(heading)
    print(frame.columns)


Columns in atlanta_cbsa_zip_df:
Index(['census_cbsa_geoid', 'census_cbsa_name', 'census_cbsa_lsad',
       'census_cbsa_lat', 'census_cbsa_lon', 'census_zcta5_geoid',
       'census_zcta5_lat', 'census_zcta5_lon'],
      dtype='object')

Columns in core_geo_dataset_df:
Index(['census_zcta5_geoid', 'census_zcta5_lat', 'census_zcta5_lon',
       'census_tract_geoid', 'census_tract_lat', 'census_tract_lon'],
      dtype='object')

Columns in crime_rating_zipcode_df:
Index(['census_state_abbr', 'census_zcta5_geoid', 'census_cbsa_geoid_count',
       'census_cbsa_geoid_list', 'overall_crime_grade', 'violent_crime_grade',
       'violent_crime_total_rate', 'violent_crime_assault_rate',
       'violent_crime_robbery_rate', 'violent_crime_rape_rate',
       'violent_crime_murder_rate', 'property_crime_grade',
       'property_crime_total_rate', 'property_crime_theft_rate',
       'property_crime_vehicle_theft_rate', 'property_crime_burglary_rate',
       'property_crime_arson_rate', 'other_cri

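`index_col=False` didn't work (it only stops pandas from using the first column as the index; it does not remove that column), so I'm going to drop the first column from each dataframe.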

Looking into the GA_listings_sales files to find the difference between them

In [6]:
# Rows x columns of GA_LISTINGS_SALES, to compare against the V2 file
GA_LISTINGS_SALES_df.shape

(31064, 20)

In [7]:
# Rows x columns of GA_LISTINGS_SALES_V2
GA_LISTINGS_SALES_V2_df.shape

(31064, 21)

In [8]:
# Column names of GA_LISTINGS_SALES
GA_LISTINGS_SALES_df.columns

Index(['latitude', 'longitude', 'city', 'county_name', 'beds', 'baths_full',
       'baths_half', 'square_footage', 'lot_size', 'year_built', 'details',
       'special_features', 'unit_count', 'price', 'transaction_type',
       'listing_status', 'listing_special_features', 'census_state_name',
       'census_county_name', 'zip'],
      dtype='object')

In [9]:
# Column names of GA_LISTINGS_SALES_V2
GA_LISTINGS_SALES_V2_df.columns

Index(['latitude', 'longitude', 'full_street_address', 'city', 'county_name',
       'beds', 'baths_full', 'baths_half', 'square_footage', 'lot_size',
       'year_built', 'details', 'special_features', 'unit_count', 'price',
       'transaction_type', 'listing_status', 'listing_special_features',
       'census_state_name', 'census_county_name', 'zip'],
      dtype='object')

It looks like the only difference is that V2 has a `full_street_address` column. Going forward I will work with V2.

Next, I'm going to look into the two JSON files to figure out how to properly convert them into dataframes.

In [15]:
# JSON objects to fetch from the bucket
json_files = [
    'all_zips_restaurant.json',
    'all_zips_grocery_store.json'
]

# Parsed frames, keyed by file name minus the 5-character '.json' suffix
dataframes_json = {}

# Read each object straight from S3 (nothing is written to disk), decode it
# as UTF-8 text and let pandas parse that text
for json_name in json_files:
    s3_response = s3.get_object(Bucket=bucket_name, Key=json_name)
    json_text = s3_response['Body'].read().decode('utf-8')
    dataframes_json[json_name[:-5]] = pd.read_json(StringIO(json_text))

# Short aliases for the two frames
restaurant_df, grocery_store_df = (
    dataframes_json['all_zips_restaurant'],
    dataframes_json['all_zips_grocery_store'],
)

In [16]:
# Display the restaurant frame as parsed by pd.read_json
restaurant_df

Unnamed: 0,index,census_zcta_geoid,params,responce
0,0,47236,"{'location': '39.1517426,-85.7252769', 'radius...","{'html_attributions': [], 'results': [], 'stat..."
1,1,47870,"{'location': '39.3701518,-87.4735141', 'radius...","{'html_attributions': [], 'results': [], 'stat..."
2,2,47851,"{'location': '39.5735818,-87.2459626', 'radius...","{'html_attributions': [], 'results': [], 'stat..."
3,3,47337,"{'location': '39.8027537,-85.437285', 'radius'...","{'html_attributions': [], 'results': [{'busine..."
4,4,47435,"{'location': '39.2657557,-86.2951577', 'radius...","{'html_attributions': [], 'results': [{'busine..."
...,...,...,...,...
33786,33786,37932,"{'location': '35.9172993,-84.1987873', 'radius...",{'error_message': 'You have exceeded your rate...
33787,33787,37341,"{'location': '35.2199309,-85.0730025', 'radius...",{'error_message': 'You have exceeded your rate...
33788,33788,37849,"{'location': '36.0540502,-84.0484876', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33789,33789,37754,"{'location': '36.1390993,-84.0298007', 'radius...","{'html_attributions': [], 'next_page_token': '..."


In [39]:
# Display the grocery-store frame as parsed by pd.read_json
grocery_store_df

Unnamed: 0,index,census_zcta_geoid,params,responce
0,0,47236,"{'location': '39.1517426,-85.7252769', 'radius...","{'html_attributions': [], 'next_page_token': '..."
1,1,47870,"{'location': '39.3701518,-87.4735141', 'radius...","{'html_attributions': [], 'next_page_token': '..."
2,2,47851,"{'location': '39.5735818,-87.2459626', 'radius...","{'html_attributions': [], 'next_page_token': '..."
3,3,47337,"{'location': '39.8027537,-85.437285', 'radius'...","{'html_attributions': [], 'next_page_token': '..."
4,4,47435,"{'location': '39.2657557,-86.2951577', 'radius...","{'html_attributions': [], 'next_page_token': '..."
...,...,...,...,...
33786,33786,37932,"{'location': '35.9172993,-84.1987873', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33787,33787,37341,"{'location': '35.2199309,-85.0730025', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33788,33788,37849,"{'location': '36.0540502,-84.0484876', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33789,33789,37754,"{'location': '36.1390993,-84.0298007', 'radius...","{'html_attributions': [], 'next_page_token': '..."


The JSON files are nested and aren't easily converted to a dataframe.

In [38]:
# # THE CODE BELOW EXPOSES API KEYS SO IT IS COMMENTED OUT TO AVOID UPLOADING TO GITHUB
# # NOTE(review): when pd.read_json parsed these same files the column came out
# # as 'responce' (sic), so the `'response' in record` check below may never
# # match.  record["index"] also looks like a row counter rather than a ZIP code
# # (the ZIP appears to be 'census_zcta_geoid') -- confirm before re-enabling.
# json_files = [
#     'all_zips_restaurant.json',
#     'all_zips_grocery_store.json'
# ]

# # Downloading the files from S3 and printing a few records
# for file_name in json_files:
#     obj = s3.get_object(Bucket=bucket_name, Key=file_name)
#     body = obj['Body'].read().decode('utf-8')
#     json_data = json.loads(body)
    
#     print(f'{file_name}:')
#     for record in json_data:
#         print(f'ZIP Code: {record["index"]}')
#         print(f'Params: {record["params"]}')
#         if 'response' in record:
#             print(f'Response: {record["response"]}')
#         if record['index'] == 5:
#             break
#     print('\n')


From the above, it looks like I can use `pd.json_normalize` to unpack the JSON files correctly.

In [19]:
# Restaurant results: fetch the raw object from S3 and parse it with the json module
json_file = 'all_zips_restaurant.json'
s3_response = s3.get_object(Bucket=bucket_name, Key=json_file)
json_data = json.loads(s3_response['Body'].read().decode('utf-8'))

# Flatten the nested records into a frame with dotted column names (e.g. 'params.type')
restaurant_df = pd.json_normalize(json_data)


In [21]:
# Grocery-store results: fetch the raw object from S3 and parse it with the json module
json_file = 'all_zips_grocery_store.json'
s3_response = s3.get_object(Bucket=bucket_name, Key=json_file)
json_data = json.loads(s3_response['Body'].read().decode('utf-8'))

# Flatten the nested records into a frame with dotted column names (e.g. 'params.type')
grocery_df = pd.json_normalize(json_data)


In [22]:
# Tally the values in the flattened 'params.type' column of the grocery-store frame
grocery_df['params.type'].value_counts()

grocery_store    33791
Name: params.type, dtype: int64

In [23]:
# Tally the values in the flattened 'params.type' column of the restaurant frame
restaurant_df['params.type'].value_counts()

restaurant    33791
Name: params.type, dtype: int64

In [24]:
# Rows x columns of the normalized grocery-store frame
grocery_df.shape

(33791, 11)

In [25]:
# Rows x columns of the normalized restaurant frame
restaurant_df.shape

(33791, 11)

It looks like both of these JSON files are the same, except that one says `restaurant` and the other says `grocery_store`.

The original JSON files are different sizes, so I most likely loaded them incorrectly.

### Removing data that represents property outside of Atlanta

In [26]:
# This column contains only Atlanta zip codes
atlanta_cbsa_zip_df['census_zcta5_geoid']

0      30309
1      30517
2      30548
3      30518
4      30334
       ...  
236    30075
237    30272
238    30315
239    30316
240    30519
Name: census_zcta5_geoid, Length: 241, dtype: int64

In [27]:
# Column dtypes of the Atlanta zip table
atlanta_cbsa_zip_df.dtypes

census_cbsa_geoid       int64
census_cbsa_name       object
census_cbsa_lsad       object
census_cbsa_lat       float64
census_cbsa_lon       float64
census_zcta5_geoid      int64
census_zcta5_lat      float64
census_zcta5_lon      float64
dtype: object

In [28]:
# The zip column currently has dtype object (strings), whereas the Atlanta
# zip codes are int64, so it has to be converted before the two can be matched
GA_LISTINGS_SALES_V2_df.dtypes

latitude                    float64
longitude                   float64
full_street_address          object
city                         object
county_name                  object
beds                        float64
baths_full                  float64
baths_half                  float64
square_footage              float64
lot_size                    float64
year_built                  float64
details                      object
special_features              int64
unit_count                  float64
price                         int64
transaction_type              int64
listing_status                int64
listing_special_features      int64
census_state_name            object
census_county_name           object
zip                          object
dtype: object

In [44]:
# Parse the zip codes as numbers; anything non-numeric becomes NaN (errors='coerce').
zip_as_number = pd.to_numeric(GA_LISTINGS_SALES_V2_df['zip'], errors='coerce')

# Build a NEW frame instead of editing the loaded one in place, so the raw data
# stays intact and the cell gives the same result every time it is run.
GA_LISTINGS_SALES_V2_df = (
    GA_LISTINGS_SALES_V2_df
    .assign(zip=zip_as_number)
    # Rows whose zip was missing or could not be parsed are dropped
    .dropna(subset=['zip'])
    # Cast with astype rather than `.loc[:, 'zip'] = ...`: assigning through
    # .loc on the result of dropna can raise SettingWithCopyWarning, and newer
    # pandas versions write into the existing float column, which silently
    # discards the integer cast.  int64 matches census_zcta5_geoid's dtype.
    .astype({'zip': 'int64'})
)

# Getting a list of all the census_zcta5_geoid values which are the zip codes in Atlanta
census_zcta5_geoids = atlanta_cbsa_zip_df['census_zcta5_geoid'].tolist()

# Keep only the listings whose zip is in the Atlanta list.  The .copy() makes the
# result an independent frame, so later column assignments do not warn.
GA_LISTINGS_SALES_V2_df = GA_LISTINGS_SALES_V2_df[
    GA_LISTINGS_SALES_V2_df['zip'].isin(census_zcta5_geoids)
].copy()


In [30]:
# Display the listings that remain after the Atlanta zip filter
GA_LISTINGS_SALES_V2_df

Unnamed: 0,latitude,longitude,full_street_address,city,county_name,beds,baths_full,baths_half,square_footage,lot_size,...,details,special_features,unit_count,price,transaction_type,listing_status,listing_special_features,census_state_name,census_county_name,zip
721,33.971863,-83.618736,633 Marigot Way,Statham,Barrow-County,3.0,2.0,,1634.0,1.0,...,"Detached, 3 Beds, 2 Baths, 1,634 Sq Ft",0,,305000,1,1,0,Georgia,"Barrow, GA",30666
722,34.016819,-83.572884,514 Jones Road,Statham,Barrow-County,,,,,38.0,...,"Lots/Land, 38.26 Acres",0,,765200,1,1,0,Georgia,"Barrow, GA",30666
723,34.018024,-83.663994,431 Dunahoo Road,Winder,Barrow-County,3.0,2.0,,1625.0,0.0,...,"Detached, 3 Beds, 2 Baths, 1,625 Sq Ft",0,,349900,1,1,0,Georgia,"Barrow, GA",30680
724,34.003227,-83.749001,471 Monticello Court,Winder,Barrow-County,4.0,3.0,,1816.0,0.0,...,"Detached, 4 Beds, 3 Baths, 1,816 Sq Ft",0,,289900,1,1,0,Georgia,"Barrow, GA",30680
725,33.945867,-83.777162,2025 Jessica Drive,Winder,Barrow-County,3.0,2.0,,1332.0,0.0,...,"Detached, 3 Beds, 2 Baths, 1,332 Sq Ft",0,,310500,1,1,0,Georgia,"Barrow, GA",30680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27610,34.867450,-84.328945,0 Lt 29 the Summit @ Ellijay #LOT 29 313742,Ellijay,Fannin-County,4.0,,,2848.0,0.0,...,"Detached, 4 Beds, 2,848 Sq Ft, 0.25 Acres",2,,699900,1,1,2,Georgia,"Fannin, GA",30540
29828,34.503000,-83.092269,0 Oak Place #20021896,Lavonia,Franklin-County,,,,,0.0,...,"Lots/Land, 0.86 Acres",0,,14500,1,1,0,Georgia,"Franklin, GA",30533
29829,34.503000,-83.092269,0 Oak Place #7008510,Lavonia,Franklin-County,,,,,0.0,...,"Lots/Land, 0.98 Acres",0,,14500,1,1,0,Georgia,"Franklin, GA",30533
29927,34.525218,-83.473638,755 Signature Drive,Mount Airy,Habersham-County,3.0,3.0,,1350.0,0.0,...,"Detached, 3 Beds, 3 Baths, 1,350 Sq Ft",2,,309900,1,1,2,Georgia,"Habersham, GA",30620


In [31]:
# Number of missing values in each column
GA_LISTINGS_SALES_V2_df.isna().sum()


latitude                        0
longitude                       0
full_street_address             0
city                            0
county_name                     0
beds                         7764
baths_full                   7793
baths_half                  12875
square_footage               8690
lot_size                      520
year_built                   6278
details                         0
special_features                0
unit_count                  16971
price                           0
transaction_type                0
listing_status                  0
listing_special_features        0
census_state_name               0
census_county_name              0
zip                             0
dtype: int64

In [32]:
# How many rows are vacant land?
# Counts the listings whose 'details' text contains 'Lots/Land'.
GA_LISTINGS_SALES_V2_df['details'].str.contains('Lots/Land').sum()

5890

In [33]:
# Vacant-land listings are of no value for our purpose, so exclude them:
# flag every row whose 'details' text contains 'Lots/Land', then keep the rest.
is_vacant_land = GA_LISTINGS_SALES_V2_df['details'].str.contains('Lots/Land')
GA_LISTINGS_SALES_V2_df = GA_LISTINGS_SALES_V2_df[~is_vacant_land]


In [34]:
# Number of missing values in each column once the 'Lots/Land' rows are removed
GA_LISTINGS_SALES_V2_df.isna().sum()


latitude                        0
longitude                       0
full_street_address             0
city                            0
county_name                     0
beds                         2054
baths_full                   2082
baths_half                   7020
square_footage               2819
lot_size                      444
year_built                    760
details                         0
special_features                0
unit_count                  11081
price                           0
transaction_type                0
listing_status                  0
listing_special_features        0
census_state_name               0
census_county_name              0
zip                             0
dtype: int64

In [35]:
# Display the listings that remain after the 'Lots/Land' rows are removed
GA_LISTINGS_SALES_V2_df

Unnamed: 0,latitude,longitude,full_street_address,city,county_name,beds,baths_full,baths_half,square_footage,lot_size,...,details,special_features,unit_count,price,transaction_type,listing_status,listing_special_features,census_state_name,census_county_name,zip
721,33.971863,-83.618736,633 Marigot Way,Statham,Barrow-County,3.0,2.0,,1634.0,1.0,...,"Detached, 3 Beds, 2 Baths, 1,634 Sq Ft",0,,305000,1,1,0,Georgia,"Barrow, GA",30666
723,34.018024,-83.663994,431 Dunahoo Road,Winder,Barrow-County,3.0,2.0,,1625.0,0.0,...,"Detached, 3 Beds, 2 Baths, 1,625 Sq Ft",0,,349900,1,1,0,Georgia,"Barrow, GA",30680
724,34.003227,-83.749001,471 Monticello Court,Winder,Barrow-County,4.0,3.0,,1816.0,0.0,...,"Detached, 4 Beds, 3 Baths, 1,816 Sq Ft",0,,289900,1,1,0,Georgia,"Barrow, GA",30680
725,33.945867,-83.777162,2025 Jessica Drive,Winder,Barrow-County,3.0,2.0,,1332.0,0.0,...,"Detached, 3 Beds, 2 Baths, 1,332 Sq Ft",0,,310500,1,1,0,Georgia,"Barrow, GA",30680
726,33.995583,-83.704200,286 Martin Luther King jr Drive,Winder,Barrow-County,3.0,2.0,,1060.0,0.0,...,"Detached, 3 Beds, 2 Baths, 1,060 Sq Ft",0,,240000,1,1,0,Georgia,"Barrow, GA",30680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26930,33.854188,-83.708172,0 N Broad Street #1&2 8323742,Monroe,Walton-County,,,,,2.0,...,"Other, 2.56 Acres",0,,135000,1,1,0,Georgia,"Walton, GA",30656
26931,33.793375,-83.674860,0 Us Hwy 78 NW #8271987,Monroe,Walton-County,,,,,119.0,...,"Commercial, 119.72 Acres",0,,1975380,1,1,0,Georgia,"Walton, GA",30655
27610,34.867450,-84.328945,0 Lt 29 the Summit @ Ellijay #LOT 29 313742,Ellijay,Fannin-County,4.0,,,2848.0,0.0,...,"Detached, 4 Beds, 2,848 Sq Ft, 0.25 Acres",2,,699900,1,1,2,Georgia,"Fannin, GA",30540
29927,34.525218,-83.473638,755 Signature Drive,Mount Airy,Habersham-County,3.0,3.0,,1350.0,0.0,...,"Detached, 3 Beds, 3 Baths, 1,350 Sq Ft",2,,309900,1,1,2,Georgia,"Habersham, GA",30620


In [36]:
# Display the list of Atlanta zip codes used for the filter
census_zcta5_geoids

[30309,
 30517,
 30548,
 30518,
 30334,
 30331,
 30176,
 30014,
 30319,
 30153,
 30349,
 30337,
 30342,
 30258,
 30257,
 30338,
 30285,
 30092,
 30178,
 30291,
 30268,
 30038,
 30083,
 30087,
 30084,
 30144,
 30058,
 30008,
 30157,
 30274,
 30168,
 30126,
 30666,
 30329,
 30360,
 30004,
 30680,
 30097,
 30024,
 30101,
 30034,
 30002,
 30322,
 30288,
 30021,
 30035,
 30260,
 30273,
 30106,
 30534,
 30111,
 30122,
 30046,
 30078,
 30234,
 30177,
 30655,
 30016,
 30040,
 30220,
 30277,
 30170,
 30289,
 30171,
 30238,
 30141,
 30017,
 30183,
 30139,
 31822,
 30070,
 30185,
 30259,
 30284,
 30041,
 30108,
 30064,
 30012,
 30080,
 30297,
 30152,
 30096,
 30281,
 30250,
 30536,
 30540,
 30071,
 30620,
 30120,
 30184,
 30028,
 30230,
 31816,
 30217,
 30240,
 30222,
 30103,
 30118,
 30263,
 30506,
 30303,
 30354,
 30363,
 30533,
 30308,
 30344,
 30113,
 30663,
 30025,
 30336,
 30326,
 30204,
 30104,
 30206,
 30179,
 30213,
 30076,
 30022,
 31016,
 30296,
 30622,
 30005,
 30032,
 30621,
 30641,
