In [22]:
import boto3
import pandas as pd
import numpy as np
from io import StringIO
import json

Loading the CSV's from AWS

In [2]:
# Creating an S3 client object
s3 = boto3.client('s3')

# Specifying the name of the bucket
bucket_name = 'capstonehaystacks'

# List of CSV files to download
csv_files = [
    'atlanta_cbsa_zip.csv',
    'core_geo_dataset.csv',
    'crime_rating_zipcode.csv',
    'elementary_schools.csv',
    'GA_LISTINGS_SALES_V2.csv',
    'GA_LISTINGS_SALES.csv',
    'high_schools.csv',
    'middle_schools.csv'
]

# Dictionary to store the dataframes
dataframes = {}

# Downloading the files from S3 and reading them into pandas dataframes
for file_name in csv_files:
    s3.download_file(bucket_name, file_name, file_name)
    dataframes[file_name[:-4]] = pd.read_csv(file_name, index_col=False)

# Access the dataframes using their respective keys
atlanta_cbsa_zip_df = dataframes['atlanta_cbsa_zip']
core_geo_dataset_df = dataframes['core_geo_dataset']
crime_rating_zipcode_df = dataframes['crime_rating_zipcode']
elementary_schools_df = dataframes['elementary_schools']
GA_LISTINGS_SALES_V2_df = dataframes['GA_LISTINGS_SALES_V2']
GA_LISTINGS_SALES_df = dataframes['GA_LISTINGS_SALES']
high_schools_df = dataframes['high_schools']
middle_schools_df = dataframes['middle_schools']



Index_col=False didnt work so Im going to drop the first column for each data frame

In [4]:
# Looping through the keys in the dataframes dictionary and drop the 'Unnamed: 0' column
for key in dataframes.keys():
    dataframes[key].drop(columns=['Unnamed: 0'], inplace=True)


Looking into the GA_listings_sales files to find the difference between them

In [10]:
GA_LISTINGS_SALES_V2_df['zip'].value_counts()

30512    775
30525    534
30523    469
30546    451
30143    434
        ... 
39813      1
39836      1
31783      1
31563      1
30250      1
Name: zip, Length: 618, dtype: int64

In [11]:
GA_LISTINGS_SALES_df['zip'].value_counts()

30512    775
30525    534
30523    469
30546    451
30143    434
        ... 
39813      1
39836      1
31783      1
31563      1
30250      1
Name: zip, Length: 618, dtype: int64

In [12]:
GA_LISTINGS_SALES_df.shape

(31064, 20)

In [13]:
GA_LISTINGS_SALES_V2_df.shape

(31064, 21)

In [15]:
GA_LISTINGS_SALES_df.columns

Index(['latitude', 'longitude', 'city', 'county_name', 'beds', 'baths_full',
       'baths_half', 'square_footage', 'lot_size', 'year_built', 'details',
       'special_features', 'unit_count', 'price', 'transaction_type',
       'listing_status', 'listing_special_features', 'census_state_name',
       'census_county_name', 'zip'],
      dtype='object')

In [14]:
GA_LISTINGS_SALES_V2_df.columns

Index(['latitude', 'longitude', 'full_street_address', 'city', 'county_name',
       'beds', 'baths_full', 'baths_half', 'square_footage', 'lot_size',
       'year_built', 'details', 'special_features', 'unit_count', 'price',
       'transaction_type', 'listing_status', 'listing_special_features',
       'census_state_name', 'census_county_name', 'zip'],
      dtype='object')

It looks like the difference is V2 has a full_street_address column.  Going forward I will work with V2

Next I'm going to look into the two json files to figure out how to properly convert them into a dataframe

In [18]:
# List of JSON files to load
json_files = [
    'all_zips_restaurant.json',
    'all_zips_grocery_store.json'
]

# Dictionary to store the dataframes
dataframes_json = {}

# Downloading the files from S3 and reading them into pandas dataframes
for file_name in json_files:
    obj = s3.get_object(Bucket=bucket_name, Key=file_name)
    body = obj['Body'].read().decode('utf-8')
    dataframes_json[file_name[:-5]] = pd.read_json(StringIO(body))

# Access the dataframes using their respective keys
restaurant_df = dataframes_json['all_zips_restaurant']
grocery_store_df = dataframes_json['all_zips_grocery_store']

In [19]:
restaurant_df

Unnamed: 0,index,census_zcta_geoid,params,responce
0,0,47236,"{'location': '39.1517426,-85.7252769', 'radius...","{'html_attributions': [], 'results': [], 'stat..."
1,1,47870,"{'location': '39.3701518,-87.4735141', 'radius...","{'html_attributions': [], 'results': [], 'stat..."
2,2,47851,"{'location': '39.5735818,-87.2459626', 'radius...","{'html_attributions': [], 'results': [], 'stat..."
3,3,47337,"{'location': '39.8027537,-85.437285', 'radius'...","{'html_attributions': [], 'results': [{'busine..."
4,4,47435,"{'location': '39.2657557,-86.2951577', 'radius...","{'html_attributions': [], 'results': [{'busine..."
...,...,...,...,...
33786,33786,37932,"{'location': '35.9172993,-84.1987873', 'radius...",{'error_message': 'You have exceeded your rate...
33787,33787,37341,"{'location': '35.2199309,-85.0730025', 'radius...",{'error_message': 'You have exceeded your rate...
33788,33788,37849,"{'location': '36.0540502,-84.0484876', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33789,33789,37754,"{'location': '36.1390993,-84.0298007', 'radius...","{'html_attributions': [], 'next_page_token': '..."


In [20]:
grocery_store_df

Unnamed: 0,index,census_zcta_geoid,params,responce
0,0,47236,"{'location': '39.1517426,-85.7252769', 'radius...","{'html_attributions': [], 'next_page_token': '..."
1,1,47870,"{'location': '39.3701518,-87.4735141', 'radius...","{'html_attributions': [], 'next_page_token': '..."
2,2,47851,"{'location': '39.5735818,-87.2459626', 'radius...","{'html_attributions': [], 'next_page_token': '..."
3,3,47337,"{'location': '39.8027537,-85.437285', 'radius'...","{'html_attributions': [], 'next_page_token': '..."
4,4,47435,"{'location': '39.2657557,-86.2951577', 'radius...","{'html_attributions': [], 'next_page_token': '..."
...,...,...,...,...
33786,33786,37932,"{'location': '35.9172993,-84.1987873', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33787,33787,37341,"{'location': '35.2199309,-85.0730025', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33788,33788,37849,"{'location': '36.0540502,-84.0484876', 'radius...","{'html_attributions': [], 'next_page_token': '..."
33789,33789,37754,"{'location': '36.1390993,-84.0298007', 'radius...","{'html_attributions': [], 'next_page_token': '..."


Json files are nested and arent easily converted to a dataframe.  

In [24]:
# List of JSON files to load
json_files = [
    'all_zips_restaurant.json',
    'all_zips_grocery_store.json'
]

# Downloading the files from S3 and printing a few records
for file_name in json_files:
    obj = s3.get_object(Bucket=bucket_name, Key=file_name)
    body = obj['Body'].read().decode('utf-8')
    json_data = json.loads(body)
    
    print(f'{file_name}:')
    for record in json_data:
        print(f'ZIP Code: {record["index"]}')
        print(f'Params: {record["params"]}')
        if 'response' in record:
            print(f'Response: {record["response"]}')
        if record['index'] == 5:
            break
    print('\n')


all_zips_restaurant.json:
ZIP Code: 0
Params: {'location': '39.1517426,-85.7252769', 'radius': 6000, 'type': 'restaurant', 'key': 'AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk'}
ZIP Code: 1
Params: {'location': '39.3701518,-87.4735141', 'radius': 6000, 'type': 'restaurant', 'key': 'AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk'}
ZIP Code: 2
Params: {'location': '39.5735818,-87.2459626', 'radius': 6000, 'type': 'restaurant', 'key': 'AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk'}
ZIP Code: 3
Params: {'location': '39.8027537,-85.437285', 'radius': 6000, 'type': 'restaurant', 'key': 'AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk'}
ZIP Code: 4
Params: {'location': '39.2657557,-86.2951577', 'radius': 6000, 'type': 'restaurant', 'key': 'AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk'}
ZIP Code: 5
Params: {'location': '38.1486002,-86.5940164', 'radius': 6000, 'type': 'restaurant', 'key': 'AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk'}


all_zips_grocery_store.json:
ZIP Code: 0
Params: {'location': '39.1517426,-85.7252769', 'ra

From the above it looks like I can use normalize to unpack the json file correctly

In [26]:
# Name of the JSON file to load
json_file = 'all_zips_restaurant.json'

# Downloading the file from S3 and converting to a dataframe
obj = s3.get_object(Bucket=bucket_name, Key=json_file)
body = obj['Body'].read().decode('utf-8')
json_data = json.loads(body)

restaurant_df = pd.json_normalize(json_data)


In [27]:
restaurant_df

Unnamed: 0,index,census_zcta_geoid,params.location,params.radius,params.type,params.key,responce.html_attributions,responce.results,responce.status,responce.next_page_token,responce.error_message
0,0,47236,"39.1517426,-85.7252769",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],[],ZERO_RESULTS,,
1,1,47870,"39.3701518,-87.4735141",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],[],ZERO_RESULTS,,
2,2,47851,"39.5735818,-87.2459626",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],[],ZERO_RESULTS,,
3,3,47337,"39.8027537,-85.437285",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],"[{'business_status': 'OPERATIONAL', 'geometry'...",OK,,
4,4,47435,"39.2657557,-86.2951577",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],"[{'business_status': 'OPERATIONAL', 'geometry'...",OK,,
...,...,...,...,...,...,...,...,...,...,...,...
33786,33786,37932,"35.9172993,-84.1987873",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],[],OVER_QUERY_LIMIT,,You have exceeded your rate-limit for this API.
33787,33787,37341,"35.2199309,-85.0730025",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],[],OVER_QUERY_LIMIT,,You have exceeded your rate-limit for this API.
33788,33788,37849,"36.0540502,-84.0484876",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],"[{'business_status': 'OPERATIONAL', 'geometry'...",OK,Aap_uEAcFc1hLOV2IfMHu_-_s2mUDoivHxNOlSgI4Sszwf...,
33789,33789,37754,"36.1390993,-84.0298007",6000,restaurant,AIzaSyB3moLzJU3JasI6-tt9ndImfAfFsIFwhHk,[],"[{'business_status': 'OPERATIONAL', 'geometry'...",OK,Aap_uEBnBxPkXgC-4aIHNll8VZXRiHu1YtrP391AKi5Qn0...,


In [28]:
# Name of the JSON file to load
json_file = 'all_zips_grocery_store.json'

# Downloading the file from S3 and converting to a dataframe
obj = s3.get_object(Bucket=bucket_name, Key=json_file)
body = obj['Body'].read().decode('utf-8')
json_data = json.loads(body)

grocery_df = pd.json_normalize(json_data)


In [30]:
grocery_df['params.type'].value_counts()

grocery_store    33791
Name: params.type, dtype: int64

In [31]:
restaurant_df['params.type'].value_counts()

restaurant    33791
Name: params.type, dtype: int64

In [32]:
grocery_df.shape

(33791, 11)

In [33]:
restaurant_df.shape

(33791, 11)

It looks like both of these json files are the same except one is says restaurants and one says gorcery_store

Original JSON files are each different sizes so I most likely loaded them wrong