# Data EDA

## Import Packages

In [16]:
import os
import boto3  # AWS
import pandas as pd
from pandas import json_normalize
import numpy as np
import json
import io

# Set the maximum number of rows to 200
pd.set_option("display.max_rows", 200)
import pandas as pd

# Set the maximum number of columns to 200
pd.set_option("display.max_columns", 200)



## Import Data from AWS Bucket

In [17]:
# Creating an S3 client object
s3 = boto3.client("s3")

# Specifying the name of the bucket
bucket_name = "capstonehaystacks"

# List of CSV files to download
files = [
    "atlanta_cbsa_zip.csv",
    "core_geo_dataset.csv",
    "crime_rating_zipcode.csv",
    "elementary_schools.csv",
    "GA_LISTINGS_SALES_V2.csv",
    "GA_LISTINGS_SALES.csv",
    "high_schools.csv",
    "middle_schools.csv",
    "all_zips_grocery_store.json",
    "all_zips_restaurant.json",
]

# Dictionary to store the dataframes
dataframes = {}

# Downloading the files from S3 and parsing them into data
for file_name in files:
    _, file_ext = os.path.splitext(file_name)
    if file_ext == ".csv":
        s3.download_file(bucket_name, file_name, file_name)
        dataframes[file_name[:-4]] = pd.read_csv(file_name, index_col=False)
    elif file_ext == ".json":
        response = s3.get_object(Bucket=bucket_name, Key=file_name)
        json_content = response["Body"].read().decode("utf-8")
        dataframes[file_name[:-5]] = json.loads(json_content)

In [18]:
# Access the dataframes using their respective keys
grocery = dataframes["all_zips_grocery_store"]
restaurant = dataframes["all_zips_restaurant"]
atlanta_geo = dataframes["atlanta_cbsa_zip"]
geo = dataframes["core_geo_dataset"]
crime = dataframes["crime_rating_zipcode"]
elem_school = dataframes["elementary_schools"]
high_school = dataframes["high_schools"]
mid_school = dataframes["middle_schools"]
GA_listing = dataframes["GA_LISTINGS_SALES_V2"]

## Data Cleanup

In [19]:
# Converting .csv's to DataFrames
data = [atlanta_geo, geo, crime, elem_school, high_school, mid_school, GA_listing]

for i in data:
    pd.DataFrame(i)

In [20]:
# Converting .csv's to DataFrames
data = [atlanta_geo, geo, crime, GA_listing]

for i in data:
    pd.DataFrame(i)

## Cleanup of hmda csv

In [21]:
file_name = "hmda_2021_ga_all-records_labels.csv"

# Downloading the file from S3 and parsing it into a dataframe
s3.download_file(bucket_name, file_name, file_name)
# Low memory is set to False because columns have mixed data types
hmda_2021 = pd.read_csv(file_name, index_col=False, low_memory=False)

Using the Atlanta area zip codes from the 'atlanta_geo' dataset, I am identifying the corresponding census tract numbers for the Atlanta region within the 'geo' dataset. With these Atlanta area census tract numbers, I am creating an 'hmda' dataframe that includes data exclusively for the Atlanta area.

In [22]:
# Creating a set of all the zip codes in the atlanta region
zipcode_set = set(atlanta_geo['census_zcta5_geoid'])

# Creating a dataframe with values only in the atlanta region
filtered_geo = geo[geo["census_zcta5_geoid"].isin(zipcode_set)]

# Using this atlanta dataframe to create a set of census tract ID's
census_tract_set = set(filtered_geo["census_tract_geoid"])

# Filter by census tract set to create hmda with just atlanta data
atlanta_hmda_2021 = hmda_2021[hmda_2021["census_tract"].isin(census_tract_set)]


In [23]:
# Confirm it worked
print(hmda_2021.shape)
print(atlanta_hmda_2021.shape)


(893559, 99)
(235504, 99)


In [24]:
# Dropping columns that arent useful for our purpose
atlanta_hmda_2021 = atlanta_hmda_2021.drop(columns=["debt_to_income_ratio", "income",
                                                    "multifamily_affordable_units", "property_value", 
                                                    "activity_year", "lei", "derived_msa-md", "state_code", 
                                                    "county_code", "derived_ethnicity", "derived_race", 
                                                    "preapproval", "total_loan_costs", "total_points_and_fees",
                                                    "origination_charges", "discount_points", "lender_credits",
                                                    "prepayment_penalty_term", "intro_rate_period", 
                                                    "balloon_payment", "applicant_ethnicity-1", 
                                                    "applicant_ethnicity-2", "applicant_ethnicity-3", 
                                                    "applicant_ethnicity-4", "applicant_ethnicity-5", 
                                                    "co-applicant_ethnicity-1", "co-applicant_ethnicity-2", 
                                                    "co-applicant_ethnicity-3", "co-applicant_ethnicity-4", 
                                                    "co-applicant_ethnicity-5", "applicant_ethnicity_observed", 
                                                    "co-applicant_ethnicity_observed", "applicant_race-1", 
                                                    "applicant_race-2", "applicant_race-3", "applicant_race-4", 
                                                    "applicant_race-5", "co-applicant_race-1", 
                                                    "co-applicant_race-2", "co-applicant_race-3", 
                                                    "co-applicant_race-4", "co-applicant_race-5", 
                                                    "applicant_race_observed", "co-applicant_race_observed", 
                                                    "applicant_age_above_62", "co-applicant_age_above_62", 
                                                    "submission_of_application", "initially_payable_to_institution", 
                                                    "aus-1", "aus-2", "aus-3", "aus-4", "aus-5", 
                                                    "tract_minority_population_percent", "denial_reason-2", 
                                                    "denial_reason-3", "denial_reason-4", "applicant_sex", 
                                                    "co-applicant_sex", "applicant_sex_observed", 
                                                    "co-applicant_sex_observed", "applicant_age", 
                                                    "co-applicant_age", "manufactured_home_secured_property_type",
                                                    "manufactured_home_land_property_interest", "loan_term", 
                                                    "negative_amortization", "interest_only_payment", 
                                                    "other_nonamortizing_features", "loan_to_value_ratio", 
                                                    "interest_rate", "rate_spread", "derived_sex", 
                                                    "conforming_loan_limit"])


In [25]:
# Using the zip codes found in filtered geo to add them to the observations in hdma dataset
merged_data = atlanta_hmda_2021.merge(filtered_geo[['census_tract_geoid', 'census_zcta5_geoid']],
                                      left_on='census_tract',
                                      right_on='census_tract_geoid',
                                      how='left')

# Renaming the 'census_zcta5_geoid' column to 'zip_code'
merged_data.rename(columns={'census_zcta5_geoid': 'zip_code'}, inplace=True)

# # Dropping the 'census_tract_geoid' column 
# merged_data.drop('census_tract_geoid', axis=1, inplace=True)


Removing observations that arent useful for our purpose and adding a column for whether a loan was approved

In [49]:
# Removing observations that had the application withdrawn or the application was incomplete
filtered_data = merged_data[(merged_data['action_taken'] != 4) & (merged_data['action_taken'] != 5)]

# Removing observations with a 4 or 5 in the loan_purpose column since this isnt defined in data dictionary
filtered_data = filtered_data[(filtered_data['loan_purpose'] != 4) & (filtered_data['loan_purpose'] != 5)]

# Filtering out observations that contained commercial loans
filtered_data = filtered_data[(filtered_data['business_or_commercial_purpose'] != 1) & (filtered_data['business_or_commercial_purpose'] != 1111)]

# Creating a new column called loan status to represent if a loan was approved or not
filtered_data['loan_status'] = filtered_data['action_taken'].apply(lambda x: 'approved' if x in [1, 2, 6, 8] else 'denied')



In [50]:
# Filtering rows with loan_purpose equal to 31, 1 which is home purpose and 2, or 32 which is home improvement
loan_purpose_filtered = filtered_data[filtered_data['loan_purpose'].isin([31, 1, 2, 32])]

Creating new features and aggregating by zip code

In [41]:

# Grouping the data to calculate count and average loan_amount
zip_code_data = loan_purpose_filtered.groupby(['zip_code', 'loan_purpose', 'loan_status', 'derived_loan_product_type']).agg(
    loan_count=pd.NamedAgg(column='loan_amount', aggfunc='count'),
    average_loan_amount=pd.NamedAgg(column='loan_amount', aggfunc='mean')
).reset_index()


In [42]:
# Creating a loan purpose category to define whether it is a home purchase or improvement
zip_code_data['loan_purpose_category'] = zip_code_data['loan_purpose'].apply(lambda x: 'home_purchase' if x in [1, 31] else 'home_improvement')


In [43]:
# Removing the derived loan product type column and aggregrating 
zip_code_summary = zip_code_data.groupby(['zip_code', 'loan_purpose_category', 'loan_status']).agg(
    loan_count=pd.NamedAgg(column='loan_count', aggfunc='sum'),
    average_loan_amount=pd.NamedAgg(column='average_loan_amount', aggfunc='mean')
).reset_index()


In [44]:
# Creating a pivot table to have one row per zip code
pivot_table = zip_code_summary.pivot_table(
    index='zip_code',
    columns=['loan_purpose_category', 'loan_status'],
    values=['loan_count', 'average_loan_amount'],
    fill_value=0
).reset_index()

# Flattening the pivot table
pivot_table.columns = ['_'.join(col).strip() for col in pivot_table.columns.values]


In [45]:
# Creating a new column with total loan count for each zip code
pivot_table["total_loan_count"] = (
    pivot_table["loan_count_home_improvement_approved"]
    + pivot_table["loan_count_home_improvement_denied"]
    + pivot_table["loan_count_home_purchase_approved"]
    + pivot_table["loan_count_home_purchase_denied"]
)

# Creating a new column with the total number of approved loans for each zip code
pivot_table["total_approved_loans"] = (
    pivot_table["loan_count_home_improvement_approved"]
    + pivot_table["loan_count_home_purchase_approved"]
)

# Creating a new column with the total number of denied loans for each zip code
pivot_table["total_denied_loans"] = (
    pivot_table["loan_count_home_improvement_denied"]
    + pivot_table["loan_count_home_purchase_denied"]
)
# Creating a new column with the approval percentage
pivot_table["approval_percentage"] = (
    (pivot_table["total_approved_loans"] / pivot_table["total_loan_count"]) * 100
).round(2)


In [140]:
# Rounding a few of the columns for easier readability
columns_to_round = [
    'average_loan_amount_home_improvement_approved',
    'average_loan_amount_home_improvement_denied',
    'average_loan_amount_home_purchase_approved',
    'average_loan_amount_home_purchase_denied'
]

for column in columns_to_round:
    pivot_table[column] = pivot_table[column].round().astype(int)


<IPython.core.display.Javascript object>

In [141]:
# Creating the tract_median_income column which shows the actual value out of the percentage column
filtered_data['tract_median_income'] = filtered_data['ffiec_msa_md_median_family_income'] * filtered_data['tract_to_msa_income_percentage'] * 0.01

# Grouping by zip_code to aggregrate tract data
grouped_data = filtered_data.groupby('zip_code').agg(
    zip_median_income=('tract_median_income', 'mean'),
    population=('tract_population', 'sum'),
    zip_owner_occupied_units=('tract_owner_occupied_units', 'sum'),
    total_one_to_four_family_homes=('tract_one_to_four_family_homes', 'sum'),
    median_age_of_housing_units=('tract_median_age_of_housing_units', 'mean')
)

# Reseting the index to make zip_code a regular column
census_data = grouped_data.reset_index()


<IPython.core.display.Javascript object>

In [142]:
# Rounding the a few of the columns for easier readability
columns_to_round = ["zip_median_income", "median_age_of_housing_units"]

for column in columns_to_round:
    census_data[column] = census_data[column].round().astype(int)

<IPython.core.display.Javascript object>

In [143]:
# Merging census_data and pivot_table on the zip_code column
hmda_2021 = pivot_table.merge(
    census_data, left_on="zip_code__", right_on="zip_code", how="left"
)

# Dropping the zip_code__ and zip_code columns since they're redundant
hmda_2021 = hmda_2021.drop(columns=["zip_code"])

# Renaming the 'zip_code__' column to 'zip_code'
hmda_2021 = hmda_2021.rename(columns={"zip_code__": "zip_code"})

<IPython.core.display.Javascript object>

Doing the same but this time aggregating on census tract instead of zip code

In [29]:
# Grouping the data to calculate count and average loan_amount
tract_data = loan_purpose_filtered.groupby(['census_tract_geoid', 'loan_purpose', 'loan_status', 'derived_loan_product_type']).agg(
    loan_count=pd.NamedAgg(column='loan_amount', aggfunc='count'),
    average_loan_amount=pd.NamedAgg(column='loan_amount', aggfunc='mean')
).reset_index()


In [33]:
# Creating a loan purpose category to define whether it is a home purchase or improvement
tract_data['loan_purpose_category'] = tract_data['loan_purpose'].apply(lambda x: 'home_purchase' if x in [1, 31] else 'home_improvement')


In [39]:
# Removing the derived loan product type column and aggregrating 

tract_summary = tract_data.groupby(['census_tract_geoid', 'loan_purpose_category', 'loan_status']).agg(
    loan_count=pd.NamedAgg(column='loan_count', aggfunc='sum'),
    average_loan_amount=pd.NamedAgg(column='average_loan_amount', aggfunc='mean')
).reset_index()


In [46]:
# Creating a pivot table to have one row per census tract
pivot_table = tract_summary.pivot_table(
    index='census_tract_geoid',
    columns=['loan_purpose_category', 'loan_status'],
    values=['loan_count', 'average_loan_amount'],
    fill_value=0
).reset_index()

# Flattening the pivot table
pivot_table.columns = ['_'.join(col).strip() for col in pivot_table.columns.values]


In [47]:
# Creating a new column with total loan count for each zip code
pivot_table["total_loan_count"] = (
    pivot_table["loan_count_home_improvement_approved"]
    + pivot_table["loan_count_home_improvement_denied"]
    + pivot_table["loan_count_home_purchase_approved"]
    + pivot_table["loan_count_home_purchase_denied"]
)

# Creating a new column with the total number of approved loans for each zip code
pivot_table["total_approved_loans"] = (
    pivot_table["loan_count_home_improvement_approved"]
    + pivot_table["loan_count_home_purchase_approved"]
)

# Creating a new column with the total number of denied loans for each zip code
pivot_table["total_denied_loans"] = (
    pivot_table["loan_count_home_improvement_denied"]
    + pivot_table["loan_count_home_purchase_denied"]
)
# Creating a new column with the approval percentage
pivot_table["approval_percentage"] = (
    (pivot_table["total_approved_loans"] / pivot_table["total_loan_count"]) * 100
).round(2)


In [48]:
# Rounding a few of the columns for easier readability
columns_to_round = [
    'average_loan_amount_home_improvement_approved',
    'average_loan_amount_home_improvement_denied',
    'average_loan_amount_home_purchase_approved',
    'average_loan_amount_home_purchase_denied'
]

for column in columns_to_round:
    pivot_table[column] = pivot_table[column].round().astype(int)

In [53]:
# creating a new column for tract_median_income
filtered_data['tract_median_income'] = filtered_data['ffiec_msa_md_median_family_income'] * filtered_data['tract_to_msa_income_percentage'] * 0.01

# selecting the columns you want to keep in the final DataFrame
census_data = filtered_data[['census_tract_geoid', 'tract_median_income', 'tract_population', 'tract_owner_occupied_units', 'tract_one_to_four_family_homes', 'tract_median_age_of_housing_units']]


# reseting the index to make census_tract_geoid a regular column
census_data = census_data.reset_index(drop=True)



In [56]:
# Rounding the a few of the columns for easier readability
columns_to_round = ["tract_median_income", "tract_median_age_of_housing_units"]

for column in columns_to_round:
    census_data[column] = census_data[column].round().astype(int)

In [59]:
# Merging census_data and pivot_table on the zip_code column
hmda_census_2021 = pivot_table.merge(
    census_data, left_on="census_tract_geoid__", right_on="census_tract_geoid", how="left"
)

In [62]:
# dropping duplicate rows
hmda_census_2021.drop_duplicates(inplace=True)

# resetting the index
hmda_census_2021.reset_index(drop=True, inplace=True)



In [67]:
hmda_census_2021

Unnamed: 0,census_tract_geoid__,average_loan_amount_home_improvement_approved,average_loan_amount_home_improvement_denied,average_loan_amount_home_purchase_approved,average_loan_amount_home_purchase_denied,loan_count_home_improvement_approved,loan_count_home_improvement_denied,loan_count_home_purchase_approved,loan_count_home_purchase_denied,total_loan_count,total_approved_loans,total_denied_loans,approval_percentage,census_tract_geoid,tract_median_income,tract_population,tract_owner_occupied_units,tract_one_to_four_family_homes,tract_median_age_of_housing_units
0,13013180103,168506,191898,275490,256960,496,124,1464,192,2276,1960,316,86.12,13013180103,113124,4674,1295,1651,13
1,13013180104,121120,147986,199265,171667,34,17,141,6,198,175,23,88.38,13013180104,77987,1976,562,785,27
2,13013180105,175597,206667,224304,395130,198,36,477,57,768,675,93,87.89,13013180105,80558,2802,753,900,20
3,13013180106,140800,136500,188729,223095,48,10,212,17,287,260,27,90.59,13013180106,83986,2831,680,1018,25
4,13013180107,138982,93667,214952,155694,96,22,278,50,446,374,72,83.86,13013180107,53134,3853,980,1620,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,13297110506,208822,160816,256893,293048,260,82,644,62,1048,904,144,86.26,13297110506,94270,5143,1577,1951,21
567,13297110508,187379,166644,233457,265542,184,50,520,54,808,704,104,87.13,13297110508,95984,5110,1411,1802,19
568,13297110601,202932,133774,234775,276276,408,84,942,105,1539,1350,189,87.72,13297110601,91699,4644,1509,1683,18
569,13297110602,186615,159209,283551,266724,846,126,2034,228,3234,2880,354,89.05,13297110602,100269,5325,1370,1847,19


In [65]:
hmda_census_2021.isna().sum()

census_tract_geoid__                             0
average_loan_amount_home_improvement_approved    0
average_loan_amount_home_improvement_denied      0
average_loan_amount_home_purchase_approved       0
average_loan_amount_home_purchase_denied         0
loan_count_home_improvement_approved             0
loan_count_home_improvement_denied               0
loan_count_home_purchase_approved                0
loan_count_home_purchase_denied                  0
total_loan_count                                 0
total_approved_loans                             0
total_denied_loans                               0
approval_percentage                              0
census_tract_geoid                               0
tract_median_income                              0
tract_population                                 0
tract_owner_occupied_units                       0
tract_one_to_four_family_homes                   0
tract_median_age_of_housing_units                0
dtype: int64

In [145]:
hmda_2021.isna().sum()

zip_code                                         0
average_loan_amount_home_improvement_approved    0
average_loan_amount_home_improvement_denied      0
average_loan_amount_home_purchase_approved       0
average_loan_amount_home_purchase_denied         0
loan_count_home_improvement_approved             0
loan_count_home_improvement_denied               0
loan_count_home_purchase_approved                0
loan_count_home_purchase_denied                  0
total_loan_count                                 0
total_approved_loans                             0
total_denied_loans                               0
approval_percentage                              0
zip_median_income                                0
population                                       0
zip_owner_occupied_units                         0
total_one_to_four_family_homes                   0
median_age_of_housing_units                      0
dtype: int64

<IPython.core.display.Javascript object>

In [148]:
hmda_2021.columns


Index(['zip_code', 'average_loan_amount_home_improvement_approved',
       'average_loan_amount_home_improvement_denied',
       'average_loan_amount_home_purchase_approved',
       'average_loan_amount_home_purchase_denied',
       'loan_count_home_improvement_approved',
       'loan_count_home_improvement_denied',
       'loan_count_home_purchase_approved', 'loan_count_home_purchase_denied',
       'total_loan_count', 'total_approved_loans', 'total_denied_loans',
       'approval_percentage', 'zip_median_income', 'population',
       'zip_owner_occupied_units', 'total_one_to_four_family_homes',
       'median_age_of_housing_units'],
      dtype='object')

<IPython.core.display.Javascript object>

Uploading to AWS

In [147]:
# Converting the DataFrame to a csv buffer in memory
csv_buffer = io.StringIO()
hmda_2021.to_csv(csv_buffer, index=False)

# Create an S3 resource
s3R = boto3.resource("s3")

# Naming the file for AWS storage
file_name = "atlanta_hdma_2021.csv"

# Assigning the output to a variable to limit what is printed
response = s3R.Object(bucket_name, file_name).put(Body=csv_buffer.getvalue())

# Checking the status to confirm it was uploaded
print(f"File uploaded to S3 with ETag: {response['ETag']}")

File uploaded to S3 with ETag: "2dadf4c7f6ec74ce6e5de45ae28c884f"


<IPython.core.display.Javascript object>

In [66]:
# Converting the DataFrame to a csv buffer in memory
csv_buffer = io.StringIO()
hmda_census_2021.to_csv(csv_buffer, index=False)

# Create an S3 resource
s3R = boto3.resource("s3")

# Naming the file for AWS storage
file_name = "atlanta_hdma_census_2021.csv"

# Assigning the output to a variable to limit what is printed
response = s3R.Object(bucket_name, file_name).put(Body=csv_buffer.getvalue())

# Checking the status to confirm it was uploaded
print(f"File uploaded to S3 with ETag: {response['ETag']}")

File uploaded to S3 with ETag: "9f59ddee230757eec57a54b7390412d8"


#### Combine schools into one dataset

In [19]:
# All the same Columns
print(elem_school.columns)
print(mid_school.columns)
print(high_school.columns)

Index(['Unnamed: 0', 'id', 'districtID', 'districtName', 'districtCity', 'lat',
       'long', 'name', 'gradeLevels', 'address', 'rating', 'ratingScale',
       'schoolType', 'zipcode', 'studentsPerTeacher', 'parentRating',
       'schoolLevel'],
      dtype='object')
Index(['Unnamed: 0', 'id', 'districtID', 'districtName', 'districtCity', 'lat',
       'long', 'name', 'gradeLevels', 'address', 'rating', 'ratingScale',
       'schoolType', 'zipcode', 'studentsPerTeacher', 'parentRating',
       'schoolLevel'],
      dtype='object')
Index(['Unnamed: 0', 'id', 'districtID', 'districtName', 'districtCity', 'lat',
       'long', 'name', 'gradeLevels', 'address', 'rating', 'ratingScale',
       'schoolType', 'zipcode', 'studentsPerTeacher', 'parentRating',
       'schoolLevel'],
      dtype='object')


In [20]:
# Appending a classification column so I can combine them all into one
elem_school['schoolLevel'] = "Elementary School"
mid_school['schoolLevel'] = "Middle School"
high_school['schoolLevel'] = "High School"

In [21]:
schools = pd.concat([elem_school, mid_school, high_school], ignore_index=True)

#### Haystacks POI Data

In [22]:
# Converting JSON to DataFrames & Flattening the data
flattened_grocery = json_normalize(grocery, record_path=["responce", "results"], meta=[["index"], ["census_zcta_geoid"], ["params", "key"], ["params", "location"], ["params", "radius"], ["params", "type"], ["responce", "status"]], errors="ignore")
flattened_restaurant = json_normalize(restaurant, record_path=["responce", "results"], meta=[["index"], ["census_zcta_geoid"], ["params", "key"], ["params", "location"], ["params", "radius"], ["params", "type"], ["responce", "status"]], errors="ignore")

grocery = pd.DataFrame(flattened_grocery)
restaurant = pd.DataFrame(flattened_restaurant)

In [23]:
print('Grocery: ', grocery.shape)
print('\n', grocery.dtypes)
print('\n','-'*50,'\n')
print('Restuarant: ', restaurant.shape)
print('\n',restaurant.dtypes)

Grocery:  (402803, 31)

 icon                                object
icon_background_color               object
icon_mask_base_uri                  object
name                                object
place_id                            object
reference                           object
scope                               object
types                               object
vicinity                            object
geometry.location.lat              float64
geometry.location.lng              float64
geometry.viewport.northeast.lat    float64
geometry.viewport.northeast.lng    float64
geometry.viewport.southwest.lat    float64
geometry.viewport.southwest.lng    float64
business_status                     object
plus_code.compound_code             object
plus_code.global_code               object
permanently_closed                  object
photos                              object
rating                             float64
user_ratings_total                 float64
opening_hours.open_now       

In [24]:
# Converting the "census_zcta_geoid" column to integers
grocery["census_zcta_geoid"] = grocery["census_zcta_geoid"].astype(int)
restaurant["census_zcta_geoid"] = restaurant["census_zcta_geoid"].astype(int)

In [25]:
# Using the zip codes in this file to filter the grocery_df
zip_list = atlanta_geo["census_zcta5_geoid"].tolist() # works for both

# Creating a boolean mask
mask_grocery = grocery["census_zcta_geoid"].isin(zip_list)
mask_rest = restaurant["census_zcta_geoid"].isin(zip_list)

# Filtering the dataframe based on the boolean mask
grocery_atlanta = grocery[mask_grocery]
restaurant_atlanta = restaurant[mask_rest]

In [26]:
# Dropping columns that arent useful in grocery
grocery_atlanta = grocery_atlanta.drop(['icon', 'icon_background_color', 'params.location', 
                                              'params.key', 'params.radius', 'params.type', 'icon_mask_base_uri', 
                                              'reference', 'scope', 'index', 'responce.status'], 
                                             axis=1).reset_index(drop=True)

In [27]:
# Dropping columns that arent useful and resetting index in restatuarant
restaurant_atlanta = restaurant_atlanta.drop(['icon', 'icon_background_color', 'params.location',
                                              'params.key', 'params.radius', 'params.type', 'icon_mask_base_uri',
                                              'reference', 'scope', 'index', 'responce.status'], 
                                             axis=1).reset_index(drop=True)

In [28]:
# Combining the two dataframes by concating vertically into a Point of Interest DF
POI = pd.concat([restaurant_atlanta, grocery_atlanta], ignore_index=True)

In [29]:
# Checking or duplicates
POI.duplicated(subset=['place_id']).sum()

1075

In [30]:
#Dropping duplicates
POI.drop_duplicates(subset='place_id', inplace=True)

#### Clean up GA_listings

In [None]:
# Zip column is currently a string value
GA_listing.dtypes

In [None]:
# Converting the zip column to a numeric type and dropping any non-numeric or missing values
GA_listing['zip'] = pd.to_numeric(GA_listing['zip'], errors='coerce')
GA_listing = GA_listing.dropna(subset=['zip'])

# Converting the zip column to an integer
GA_listing.loc['zip'] = GA_listing['zip'].astype(int)

# Filtering GA_LISTINGS_SALES_V2_df to keep only the rows with Atlanta zip codes
GA_listing = GA_listing[GA_listing['zip'].isin(zip_list)]

In [None]:
GA_listing.isna().sum()

In [None]:
# How many rows are vacant land?
GA_listing["details"].str.contains("Lots/Land").sum()

In [None]:
# Dropping these since these sales wont help us
GA_listing = GA_listing[~GA_listing['details'].str.contains('Lots/Land')]

In [None]:
# About a hundred zip codes have less than 30 listings. We might remove these at a later point

value_counts = GA_listing["zip"].value_counts()
unique_values = value_counts[value_counts > 30].index
num_unique_values = len(unique_values)
counts_greater_than_30 = value_counts[value_counts > 30]

print(f"The number of original zip codes: {len(zip_list)}")
print(f"The number of zip codes with more than 30 observations: {num_unique_values}")
print(counts_greater_than_30)

In [None]:
GA_listing.zip = GA_listing.zip.astype(int)

## Uploading to AWS

#### DataFrame Names
- **POI**: Combination of `all_zips_grocery_store.csv` & `all_zips_restaurant.csv` provided by haystacks. 
    - **Key info**: `name`, `price_level`, `rating`, `types`, `user_ratings_total`, and location data
- **atlanta_geo**:
- **crime**:
- **schools**: Combination of `elementary_schools.csv` & `middle_schools.csv` & `high_schools.csv`
- **GA_listing**:

In [31]:
POI.shape

(5213, 20)

In [2]:
# Create an S3 resource instead of using the S3 client that was used earlier
s3R = boto3.resource('s3')

In [39]:
# Convert the DataFrame to a csv buffer in memory
csv_buffer1 = io.StringIO()
POI.to_csv(csv_buffer1, index=False)

csv_buffer2 = io.StringIO()
schools.to_csv(csv_buffer2, index=False)

csv_buffer3 = io.StringIO()
atlanta_geo.to_csv(csv_buffer3, index=False)

csv_buffer4 = io.StringIO()
crime.to_csv(csv_buffer4, index=False)

csv_buffer5 = io.StringIO()
GA_listing.to_csv(csv_buffer5, index=False)

In [40]:
# Naming the file for AWS storage
file_names = ['points-of-interest-haystacks.csv', 'schools.csv', 'atlanta-geo.csv', 'crime.csv', 'GA_listing.csv']
csv_buffers = [csv_buffer1, csv_buffer2, csv_buffer3, csv_buffer4, csv_buffer5]

# Assigning the output to a variable what is printed
for file, buffer in zip(file_names, csv_buffers):
    response = s3R.Object(bucket_name, file).put(Body=buffer.getvalue())
    print(f"File uploaded to S3 with ETag: {response['ETag']}") # Checking the status to confirm it was uploaded

File uploaded to S3 with ETag: "ea3cab13b605aadf8a7e836072048eb2"
File uploaded to S3 with ETag: "5b91420bcfc74a9e6a90e57c5ffd1039"
File uploaded to S3 with ETag: "1a1761370148f8140a003b3a6af44de6"
File uploaded to S3 with ETag: "e64d0585c6776fed9eb12462169e0ffc"
File uploaded to S3 with ETag: "6e95642a5123f2e881bbe580867a62e3"


In [36]:
# # Deleting any old files (Adapt file names to delete what you need)

# file_names = ['elementary_schools.csv', 'high_schools.csv', 'middle_schools.csv']
# bucket_name = 'capstonehaystacks'

# # Create an S3 client
# s3_client = boto3.client("s3")

# # Delete the file from the S3 bucket
# for file in file_names:
#     response = s3_client.delete_object(Bucket=bucket_name, Key=file)

In [4]:
s3R = boto3.resource('s3')
bucket_name = "capstonehaystacks"

# Listing the contents of the S3 bucket to ensure everything looks good
for obj in s3R.Bucket(bucket_name).objects.all():
    print(obj.key)


GA_LISTINGS_SALES.csv
GA_LISTINGS_SALES_V2.csv
GA_listing.csv
all_zips_grocery_store.json
all_zips_restaurant.json
atlanta-geo.csv
atlanta_cbsa_zip.csv
atlanta_hdma_2021.csv
atlanta_listings.csv
core_geo_dataset.csv
crime.csv
crime_rating_zipcode.csv
elementary_schools.csv
high_schools.csv
hmda_2021_ga_all-records_labels.csv
middle_schools.csv
points-of-interest-google.csv
points-of-interest-google2.csv
points-of-interest-haystacks.csv
schools.csv


## Summary statistics and Checking for missing values

### POI

In [77]:
print("Shape of POI data: ", POI.shape)
POI.head()

Shape of POI data:  (5213, 31)


Unnamed: 0,business_status,name,photos,place_id,price_level,rating,types,user_ratings_total,vicinity,geometry.location.lat,geometry.location.lng,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,opening_hours.open_now,plus_code.compound_code,plus_code.global_code,permanently_closed,census_zcta_geoid,icon,icon_background_color,icon_mask_base_uri,reference,scope,index,params.key,params.location,params.radius,params.type,responce.status
0,OPERATIONAL,SweetWater Brewing Company,"[{'height': 810, 'html_attributions': ['<a hre...",ChIJK_An47MF9YgRbWKKF680tH4,1.0,4.5,"[bar, restaurant, point_of_interest, food, est...",838.0,"195 Ottley Drive Northeast, Atlanta",33.808719,-84.380186,33.810568,-84.3793,33.80787,-84.381998,True,"RJ59+FW Atlanta, GA, USA",865QRJ59+FW,,30309,,,,,,,,,,,
1,OPERATIONAL,South City Kitchen Midtown,"[{'height': 433, 'html_attributions': ['<a hre...",ChIJvQqtX0ME9YgR-X6ukvNSt6I,2.0,4.6,"[restaurant, bar, point_of_interest, food, est...",2890.0,"1144 Crescent Avenue Northeast, Atlanta",33.785955,-84.384434,33.787298,-84.382997,33.7846,-84.385695,True,"QJP8+96 Atlanta, GA, USA",865QQJP8+96,,30309,,,,,,,,,,,
2,OPERATIONAL,Loca Luna,"[{'height': 3456, 'html_attributions': ['<a hr...",ChIJ-4pAMjoE9YgRyT9t2idf8go,2.0,4.2,"[night_club, restaurant, point_of_interest, fo...",2066.0,"550 Amsterdam Avenue Northeast C, Atlanta",33.788545,-84.369098,33.789868,-84.367841,33.78717,-84.370539,True,"QJQJ+C9 Atlanta, GA, USA",865QQJQJ+C9,,30309,,,,,,,,,,,
3,OPERATIONAL,Twelve Eighty,"[{'height': 2448, 'html_attributions': ['<a hr...",ChIJf8bUMEUE9YgR_DokvHkyRzU,3.0,4.2,"[restaurant, point_of_interest, food, establis...",187.0,"1280 Peachtree Street Northeast, Atlanta",33.789444,-84.385659,33.790444,-84.383911,33.787746,-84.386608,True,"QJQ7+QP Atlanta, GA, USA",865QQJQ7+QP,,30309,,,,,,,,,,,
4,OPERATIONAL,Fat Matt's Rib Shack,"[{'height': 2268, 'html_attributions': ['<a hr...",ChIJl7bHAc0F9YgR9iWasWEvqMk,2.0,4.5,"[restaurant, point_of_interest, food, establis...",4955.0,"1811 Piedmont Avenue Northeast, Atlanta",33.804608,-84.367114,33.805938,-84.365921,33.80324,-84.368619,True,"RJ3M+R5 Atlanta, GA, USA",865QRJ3M+R5,,30309,,,,,,,,,,,


In [79]:
print("Summary statistics of POI data: ")
POI.describe()

Summary statistics of POI data: 


Unnamed: 0,price_level,rating,user_ratings_total,geometry.location.lat,geometry.location.lng,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,census_zcta_geoid
count,2190.0,3021.0,3021.0,5213.0,5213.0,5213.0,5213.0,5213.0,5213.0,5213.0
mean,1.469406,4.04141,706.840781,33.756477,-84.377008,33.75877,-84.374387,33.754071,-84.3796,30263.870132
std,0.596721,0.607271,1406.201944,0.36722,0.389548,0.367397,0.389584,0.367198,0.389655,287.484158
min,0.0,1.0,1.0,32.799175,-85.349,32.805722,-85.34787,32.794141,-85.366802,30002.0
25%,1.0,3.8,69.0,33.552108,-84.603261,33.554162,-84.600904,33.549279,-84.606235,30092.0
50%,1.0,4.1,306.0,33.789186,-84.369977,33.790924,-84.367841,33.787016,-84.372133,30206.0
75%,2.0,4.4,831.0,33.976156,-84.144046,33.980267,-84.141908,33.970583,-84.146057,30318.0
max,4.0,5.0,36780.0,34.753441,-83.1686,34.754344,-83.138045,34.751646,-83.169578,31830.0


In [80]:
print("Missing values in POI data: ")
POI.isnull().sum()

Missing values in POI data: 


business_status                     197
name                                  0
photos                             3204
place_id                              0
price_level                        3023
rating                             2192
types                                 0
user_ratings_total                 2192
vicinity                              3
geometry.location.lat                 0
geometry.location.lng                 0
geometry.viewport.northeast.lat       0
geometry.viewport.northeast.lng       0
geometry.viewport.southwest.lat       0
geometry.viewport.southwest.lng       0
opening_hours.open_now             1414
plus_code.compound_code             219
plus_code.global_code               219
permanently_closed                 4980
census_zcta_geoid                     0
icon                               2126
icon_background_color              2126
icon_mask_base_uri                 2126
reference                          2126
scope                              2126


### geo

In [81]:
print("Shape of geo data: ", geo.shape)
geo.head()

Shape of geo data:  (136473, 7)


Unnamed: 0.1,Unnamed: 0,census_zcta5_geoid,census_zcta5_lat,census_zcta5_lon,census_tract_geoid,census_tract_lat,census_tract_lon
0,0,47236,39.151743,-85.725277,18005011300,39.20876,-85.760188
1,1,47283,39.170024,-85.587896,18005011300,39.20876,-85.760188
2,2,47203,39.230418,-85.832263,18005011300,39.20876,-85.760188
3,3,47232,39.110924,-85.790836,18005011300,39.20876,-85.760188
4,4,47201,39.148221,-85.999407,18005011300,39.20876,-85.760188


In [82]:
print("Summary statistics of geo data: ")
geo.describe()

Summary statistics of geo data: 


Unnamed: 0.1,Unnamed: 0,census_zcta5_geoid,census_zcta5_lat,census_zcta5_lon,census_tract_geoid,census_tract_lat,census_tract_lon
count,136473.0,136473.0,136473.0,136473.0,136473.0,136473.0,136473.0
mean,68236.0,48801.475024,38.257041,-90.396386,29215220000.0,38.256722,-90.38296
std,39396.505981,28196.92198,5.254177,15.114411,15921410000.0,5.252617,15.190555
min,0.0,601.0,17.72471,-176.668632,1001020000.0,17.694722,-166.770979
25%,34118.0,25169.0,34.778047,-96.80303,17035970000.0,34.774634,-96.798224
50%,68236.0,47946.0,39.366217,-86.767472,29123960000.0,39.366052,-86.763794
75%,102354.0,72677.0,41.69396,-79.847468,42011010000.0,41.691306,-79.847487
max,136472.0,99929.0,71.253861,-64.708215,78030960000.0,71.287519,178.338813


In [83]:
print("Missing values in geo data: ")
geo.isnull().sum()

Missing values in geo data: 


Unnamed: 0            0
census_zcta5_geoid    0
census_zcta5_lat      0
census_zcta5_lon      0
census_tract_geoid    0
census_tract_lat      0
census_tract_lon      0
dtype: int64

### atlanta_geo

In [84]:
print("Shape of atlanta_geo data: ", atlanta_geo.shape)
atlanta_geo.head()

Shape of atlanta_geo data:  (241, 9)


Unnamed: 0.1,Unnamed: 0,census_cbsa_geoid,census_cbsa_name,census_cbsa_lsad,census_cbsa_lat,census_cbsa_lon,census_zcta5_geoid,census_zcta5_lat,census_zcta5_lon
0,3208,12060,"Atlanta-Sandy Springs-Alpharetta, GA",M1,33.693728,-84.399911,30309,33.799851,-84.385837
1,3219,12060,"Atlanta-Sandy Springs-Alpharetta, GA",M1,33.693728,-84.399911,30517,34.130296,-83.797289
2,3222,12060,"Atlanta-Sandy Springs-Alpharetta, GA",M1,33.693728,-84.399911,30548,34.089396,-83.763599
3,3225,12060,"Atlanta-Sandy Springs-Alpharetta, GA",M1,33.693728,-84.399911,30518,34.133822,-84.022599
4,3227,12060,"Atlanta-Sandy Springs-Alpharetta, GA",M1,33.693728,-84.399911,30334,33.748652,-84.387894


In [85]:
print("Summary statistics of atlanta_geo data: ")
atlanta_geo.describe()

Summary statistics of atlanta_geo data: 


Unnamed: 0.1,Unnamed: 0,census_cbsa_geoid,census_cbsa_lat,census_cbsa_lon,census_zcta5_geoid,census_zcta5_lat,census_zcta5_lon
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,4411.946058,12060.0,33.69373,-84.39991,30268.431535,33.742742,-84.365478
std,749.683606,0.0,1.352841e-13,1.851256e-13,277.487365,0.367733,0.398738
min,3208.0,12060.0,33.69373,-84.39991,30002.0,32.829319,-85.30439
25%,3708.0,12060.0,33.69373,-84.39991,30096.0,33.539118,-84.591033
50%,4486.0,12060.0,33.69373,-84.39991,30218.0,33.773335,-84.373997
75%,5072.0,12060.0,33.69373,-84.39991,30317.0,33.953141,-84.145978
max,5685.0,12060.0,33.69373,-84.39991,31830.0,34.728127,-83.183971


In [86]:
print("Missing values in atlanta_geo data: ")
atlanta_geo.isnull().sum()

Missing values in atlanta_geo data: 


Unnamed: 0            0
census_cbsa_geoid     0
census_cbsa_name      0
census_cbsa_lsad      0
census_cbsa_lat       0
census_cbsa_lon       0
census_zcta5_geoid    0
census_zcta5_lat      0
census_zcta5_lon      0
dtype: int64

### crime

In [87]:
print("Shape of crime data: ", crime.shape)
crime.head()

Shape of crime data:  (726, 27)


Unnamed: 0.1,Unnamed: 0,census_state_abbr,census_zcta5_geoid,census_cbsa_geoid_count,census_cbsa_geoid_list,overall_crime_grade,violent_crime_grade,violent_crime_total_rate,violent_crime_assault_rate,violent_crime_robbery_rate,violent_crime_rape_rate,violent_crime_murder_rate,property_crime_grade,property_crime_total_rate,property_crime_theft_rate,property_crime_vehicle_theft_rate,property_crime_burglary_rate,property_crime_arson_rate,other_crime_grade,other_crime_total_rate,other_crime_kidnapping_rate,other_crime_drug_crimes_rate,other_crime_vandalism_rate,other_crime_identity_theft_rate,other_crime_animal_cruelty_rate,crime_safety_paragraph,interpreting_crime_map_paragraph
0,13719,GA,30002,1,12060,D-,C,3.106,1.706,1.067,0.2545,0.0783,F,52.09,43.96,3.687,4.337,0.1075,B+,5.987,0.1236,0.9456,3.981,0.8824,0.0542,The D- grade means the rate of crime is much h...,"When looking at the crime map for 30002, remem..."
1,13720,GA,30004,1,12060,C-,A-,1.553,0.914,0.3858,0.224,0.0296,D-,32.63,29.14,1.124,2.263,0.1017,A+,3.171,0.0404,0.7863,1.524,0.8159,0.0042,The C- grade means the rate of crime is slight...,"When looking at the crime map for 30004, remem..."
2,13721,GA,30005,1,12060,B+,A-,1.48,0.7977,0.4288,0.2256,0.0275,C,15.39,11.97,1.468,1.844,0.1118,A+,2.709,0.0512,0.3942,1.394,0.8653,0.0041,The B+ grade means the rate of crime is lower ...,"When looking at the crime map for 30005, remem..."
3,13722,GA,30008,1,12060,D+,C,2.91,1.204,1.394,0.2531,0.0594,D,28.26,20.84,2.066,5.261,0.0946,B-,7.648,0.0883,0.909,5.777,0.8214,0.0522,The D+ grade means the rate of crime is higher...,"When looking at the crime map for 30008, remem..."
4,13723,GA,30009,1,12060,D-,A-,1.611,1.046,0.295,0.2294,0.04,F,58.32,54.14,1.474,2.584,0.1224,A,4.314,0.0453,0.8082,2.565,0.891,0.0044,The D- grade means the rate of crime is much h...,"When looking at the crime map for 30009, remem..."


In [88]:
print("Summary statistics of crime data: ")
crime.describe()

Summary statistics of crime data: 


Unnamed: 0.1,Unnamed: 0,census_zcta5_geoid,census_cbsa_geoid_count,violent_crime_total_rate,violent_crime_assault_rate,violent_crime_robbery_rate,violent_crime_rape_rate,violent_crime_murder_rate,property_crime_total_rate,property_crime_theft_rate,property_crime_vehicle_theft_rate,property_crime_burglary_rate,property_crime_arson_rate,other_crime_total_rate,other_crime_kidnapping_rate,other_crime_drug_crimes_rate,other_crime_vandalism_rate,other_crime_identity_theft_rate,other_crime_animal_cruelty_rate
count,726.0,726.0,726.0,726.0,726.0,726.0,726.0,726.0,726.0,726.0,726.0,726.0,726.0,725.0,726.0,726.0,725.0,726.0,726.0
mean,14081.5,31192.415978,1.057851,3.628924,2.415621,0.909414,0.256657,0.047235,30.323595,21.552821,2.232393,6.452248,0.086279,10.496128,0.158867,2.227102,7.233889,0.837094,0.043337
std,209.722436,1886.384965,0.602947,2.636762,2.004043,0.972578,0.060357,0.078517,28.738825,24.986148,2.807408,6.357703,0.07818,10.331241,0.22226,2.257033,9.603389,0.156147,0.041284
min,13719.0,30002.0,0.0,0.9438,0.0771,0.0098,0.2209,0.001,5.376,0.6119,0.5714,1.704,0.0193,1.96,0.0049,0.006,0.1684,0.7332,0.0021
25%,13900.25,30315.25,1.0,2.21025,1.39425,0.309775,0.241725,0.0254,17.155,10.13,1.132,4.57,0.056725,5.755,0.050225,0.9007,3.363,0.750025,0.008525
50%,14081.5,30723.0,1.0,2.9115,1.9365,0.62225,0.25,0.03025,22.94,14.805,1.532,5.7965,0.07435,8.38,0.08095,1.516,5.303,0.785,0.0327
75%,14262.75,31515.25,1.0,4.28375,2.8385,1.19575,0.261375,0.03955,33.3575,24.15,2.2465,6.846,0.094975,12.53,0.1568,2.75575,8.589,0.868625,0.063
max,14444.0,39897.0,4.0,34.39,29.96,15.25,1.706,1.145,465.3,402.0,24.05,101.1,1.67,189.4,2.568,16.62,186.5,3.055,0.2293


In [89]:
print("Missing values in crime data: ")
crime.isnull().sum()

Missing values in crime data: 


Unnamed: 0                             0
census_state_abbr                      0
census_zcta5_geoid                     0
census_cbsa_geoid_count                0
census_cbsa_geoid_list                 0
overall_crime_grade                    0
violent_crime_grade                    0
violent_crime_total_rate               0
violent_crime_assault_rate             0
violent_crime_robbery_rate             0
violent_crime_rape_rate                0
violent_crime_murder_rate              0
property_crime_grade                   0
property_crime_total_rate              0
property_crime_theft_rate              0
property_crime_vehicle_theft_rate      0
property_crime_burglary_rate           0
property_crime_arson_rate              0
other_crime_grade                      1
other_crime_total_rate                 1
other_crime_kidnapping_rate            0
other_crime_drug_crimes_rate           0
other_crime_vandalism_rate             1
other_crime_identity_theft_rate        0
other_crime_anim

### elem_school

In [90]:
print("Shape of elem_school data: ", elem_school.shape)
elem_school.head()

Shape of elem_school data:  (2719, 16)


Unnamed: 0.1,Unnamed: 0,id,districtID,districtName,districtCity,lat,long,name,gradeLevels,address,rating,ratingScale,schoolType,zipcode,studentsPerTeacher,parentRating
0,0,1038,73,Fulton County School District,Atlanta,34.071564,-84.363319,Sweet Apple Elementary School,PK-5,"12025 Etris Road, 30075, Roswell",10.0,Above average,public,30075,15.0,4
1,0,1108,82,Gwinnett County School District,Lawrenceville,34.044102,-83.924065,Fort Daniel Elementary School,PK-5,"1725 Auburn Road, 30019, Dacula",10.0,Above average,public,30019,14.0,5
2,0,1111,82,Gwinnett County School District,Lawrenceville,33.996933,-83.93721,Dyer Elementary School,PK-5,"1707 Hurricane Shoals Road Northeast, 30019, D...",10.0,Above average,public,30019,15.0,3
3,0,1503,124,Oconee County Schools,Watkinsville,33.914272,-83.506477,Malcom Bridge Elementary School,K-5,"2600 Malcom Bridge Road, 30622, Bogart",10.0,Above average,public,30622,16.0,4
4,0,3690,44,Columbia County School District,Evans,33.573559,-82.107941,River Ridge Elementary School,PK-5,"4109 Mullikin Road, 30809, Evans",10.0,Above average,public,30809,15.0,4


In [91]:
print("Summary statistics of elem_school data: ")
elem_school.describe()

Summary statistics of elem_school data: 


Unnamed: 0.1,Unnamed: 0,id,districtID,lat,long,rating,zipcode,studentsPerTeacher,parentRating
count,2719.0,2719.0,2719.0,2719.0,2719.0,1282.0,2719.0,1385.0,2719.0
mean,0.0,3531.301949,40.006252,33.351058,-83.879605,5.24727,30661.876793,14.462816,2.826775
std,0.0,2782.309431,53.992392,0.977093,1.005372,1.926157,1079.14065,2.659417,1.989432
min,0.0,2.0,0.0,25.530533,-85.548866,1.0,30002.0,1.0,0.0
25%,0.0,1286.0,0.0,32.870861,-84.481625,4.0,30114.0,13.0,0.0
50%,0.0,2478.0,4.0,33.708672,-84.212189,5.0,30324.0,15.0,4.0
75%,0.0,6225.5,73.0,33.963186,-83.648273,7.0,31005.5,16.0,4.0
max,0.0,9053.0,270.0,34.977684,-80.488266,10.0,39897.0,47.0,5.0


In [92]:
print("Missing values in elem_school data: ")
elem_school.isnull().sum()

Missing values in elem_school data: 


Unnamed: 0               0
id                       0
districtID               0
districtName          1309
districtCity          1309
lat                      0
long                     0
name                     0
gradeLevels              0
address                  0
rating                1437
ratingScale           1437
schoolType               0
zipcode                  0
studentsPerTeacher    1334
parentRating             0
dtype: int64

### mid_school

In [93]:
print("Shape of mid_school data: ", mid_school.shape)
mid_school.head()

Shape of mid_school data:  (1441, 16)


Unnamed: 0.1,Unnamed: 0,id,districtID,districtName,districtCity,lat,long,name,gradeLevels,address,rating,ratingScale,schoolType,zipcode,studentsPerTeacher,parentRating
0,0,2753,58,Dougherty County School District,Albany,31.57345,-84.249695,Robert A. Cross Middle Magnet,6-8,"324 Lockett Station Road, 31721, Albany",10.0,Above average,public,31721,15.0,3
1,0,931,69,Fayette County School District,Fayetteville,33.342094,-84.519585,Rising Starr Middle School,6-8,"183 Panther Path, 30215, Fayetteville",10.0,Above average,public,30215,15.0,3
2,0,1012,73,Fulton County School District,Atlanta,34.067436,-84.231171,Webb Bridge Middle School,6-8,"4455 Webb Bridge Road, 30005, Alpharetta",9.0,Above average,public,30005,14.0,5
3,0,1033,73,Fulton County School District,Atlanta,34.052856,-84.209183,Taylor Road Middle School,6-8,"5150 Taylor Rd, 30022, Alpharetta",9.0,Above average,public,30022,15.0,3
4,0,1614,138,Richmond County School District,Augusta,33.473846,-81.975899,Davidson Magnet School,6-12,"615 12th Street, 30901, Augusta",9.0,Above average,public,30901,18.0,5


In [94]:
print("Summary statistics of mid_school edata: ")
mid_school.describe()

Summary statistics of mid_school edata: 


Unnamed: 0.1,Unnamed: 0,id,districtID,lat,long,rating,zipcode,studentsPerTeacher,parentRating
count,1441.0,1441.0,1441.0,1441.0,1441.0,578.0,1441.0,619.0,1441.0
mean,0.0,3801.254684,36.343511,33.289537,-83.825294,5.204152,30735.346287,14.752827,2.508675
std,0.0,2716.740642,55.350418,1.001791,1.046561,1.928492,1224.062897,3.678236,1.930051
min,0.0,1.0,0.0,25.530533,-85.518494,1.0,30002.0,1.0,0.0
25%,0.0,1917.0,0.0,32.645199,-84.465767,4.0,30127.0,14.0,0.0
50%,0.0,2863.0,0.0,33.643238,-84.196259,5.0,30342.0,15.0,3.0
75%,0.0,6375.0,66.0,33.949112,-83.50473,7.0,31054.0,16.0,4.0
max,0.0,9047.0,267.0,34.981781,-80.488266,10.0,39897.0,37.0,5.0


In [95]:
print("Missing values in mid_school data: ")
mid_school.isnull().sum()

Missing values in mid_school data: 


Unnamed: 0              0
id                      0
districtID              0
districtName          808
districtCity          808
lat                     0
long                    0
name                    0
gradeLevels             0
address                 0
rating                863
ratingScale           863
schoolType              0
zipcode                 0
studentsPerTeacher    822
parentRating            0
dtype: int64

### high_school

In [96]:
print("Shape of high_school data: ", high_school.shape)
crime.head()

Shape of high_school data:  (1104, 16)


Unnamed: 0.1,Unnamed: 0,census_state_abbr,census_zcta5_geoid,census_cbsa_geoid_count,census_cbsa_geoid_list,overall_crime_grade,violent_crime_grade,violent_crime_total_rate,violent_crime_assault_rate,violent_crime_robbery_rate,violent_crime_rape_rate,violent_crime_murder_rate,property_crime_grade,property_crime_total_rate,property_crime_theft_rate,property_crime_vehicle_theft_rate,property_crime_burglary_rate,property_crime_arson_rate,other_crime_grade,other_crime_total_rate,other_crime_kidnapping_rate,other_crime_drug_crimes_rate,other_crime_vandalism_rate,other_crime_identity_theft_rate,other_crime_animal_cruelty_rate,crime_safety_paragraph,interpreting_crime_map_paragraph
0,13719,GA,30002,1,12060,D-,C,3.106,1.706,1.067,0.2545,0.0783,F,52.09,43.96,3.687,4.337,0.1075,B+,5.987,0.1236,0.9456,3.981,0.8824,0.0542,The D- grade means the rate of crime is much h...,"When looking at the crime map for 30002, remem..."
1,13720,GA,30004,1,12060,C-,A-,1.553,0.914,0.3858,0.224,0.0296,D-,32.63,29.14,1.124,2.263,0.1017,A+,3.171,0.0404,0.7863,1.524,0.8159,0.0042,The C- grade means the rate of crime is slight...,"When looking at the crime map for 30004, remem..."
2,13721,GA,30005,1,12060,B+,A-,1.48,0.7977,0.4288,0.2256,0.0275,C,15.39,11.97,1.468,1.844,0.1118,A+,2.709,0.0512,0.3942,1.394,0.8653,0.0041,The B+ grade means the rate of crime is lower ...,"When looking at the crime map for 30005, remem..."
3,13722,GA,30008,1,12060,D+,C,2.91,1.204,1.394,0.2531,0.0594,D,28.26,20.84,2.066,5.261,0.0946,B-,7.648,0.0883,0.909,5.777,0.8214,0.0522,The D+ grade means the rate of crime is higher...,"When looking at the crime map for 30008, remem..."
4,13723,GA,30009,1,12060,D-,A-,1.611,1.046,0.295,0.2294,0.04,F,58.32,54.14,1.474,2.584,0.1224,A,4.314,0.0453,0.8082,2.565,0.891,0.0044,The D- grade means the rate of crime is much h...,"When looking at the crime map for 30009, remem..."


In [97]:
print("Summary statistics of high_school data: ")
high_school.describe()

Summary statistics of high_school data: 


Unnamed: 0.1,Unnamed: 0,id,districtID,lat,long,rating,zipcode,studentsPerTeacher,parentRating
count,1104.0,1104.0,1104.0,1104.0,1104.0,447.0,1104.0,488.0,1104.0
mean,0.0,3749.912138,36.835145,33.269079,-83.813699,5.0,30763.145833,15.89959,2.531703
std,0.0,2785.346754,53.670818,1.023405,1.035285,1.974047,1273.091013,5.763669,1.933831
min,0.0,3.0,0.0,25.530533,-85.508987,1.0,30002.0,1.0,0.0
25%,0.0,1800.5,0.0,32.571535,-84.449206,4.0,30135.0,15.0,0.0
50%,0.0,2573.5,0.0,33.612899,-84.164024,5.0,30349.5,16.0,3.0
75%,0.0,6492.25,70.0,33.955349,-83.47591,6.0,31063.0,18.0,4.0
max,0.0,9047.0,268.0,34.977684,-80.488266,10.0,39886.0,78.0,5.0


In [98]:
print("Missing values in high_school data: ")
high_school.isnull().sum()

Missing values in high_school data: 


Unnamed: 0              0
id                      0
districtID              0
districtName          601
districtCity          601
lat                     0
long                    0
name                    0
gradeLevels             0
address                 0
rating                657
ratingScale           657
schoolType              0
zipcode                 0
studentsPerTeacher    616
parentRating            0
dtype: int64

### GA_listing

In [99]:
print("Shape of GA_listing data: ", GA_listing.shape)
crime.head()

Shape of GA_listing data:  (31064, 22)


Unnamed: 0.1,Unnamed: 0,census_state_abbr,census_zcta5_geoid,census_cbsa_geoid_count,census_cbsa_geoid_list,overall_crime_grade,violent_crime_grade,violent_crime_total_rate,violent_crime_assault_rate,violent_crime_robbery_rate,violent_crime_rape_rate,violent_crime_murder_rate,property_crime_grade,property_crime_total_rate,property_crime_theft_rate,property_crime_vehicle_theft_rate,property_crime_burglary_rate,property_crime_arson_rate,other_crime_grade,other_crime_total_rate,other_crime_kidnapping_rate,other_crime_drug_crimes_rate,other_crime_vandalism_rate,other_crime_identity_theft_rate,other_crime_animal_cruelty_rate,crime_safety_paragraph,interpreting_crime_map_paragraph
0,13719,GA,30002,1,12060,D-,C,3.106,1.706,1.067,0.2545,0.0783,F,52.09,43.96,3.687,4.337,0.1075,B+,5.987,0.1236,0.9456,3.981,0.8824,0.0542,The D- grade means the rate of crime is much h...,"When looking at the crime map for 30002, remem..."
1,13720,GA,30004,1,12060,C-,A-,1.553,0.914,0.3858,0.224,0.0296,D-,32.63,29.14,1.124,2.263,0.1017,A+,3.171,0.0404,0.7863,1.524,0.8159,0.0042,The C- grade means the rate of crime is slight...,"When looking at the crime map for 30004, remem..."
2,13721,GA,30005,1,12060,B+,A-,1.48,0.7977,0.4288,0.2256,0.0275,C,15.39,11.97,1.468,1.844,0.1118,A+,2.709,0.0512,0.3942,1.394,0.8653,0.0041,The B+ grade means the rate of crime is lower ...,"When looking at the crime map for 30005, remem..."
3,13722,GA,30008,1,12060,D+,C,2.91,1.204,1.394,0.2531,0.0594,D,28.26,20.84,2.066,5.261,0.0946,B-,7.648,0.0883,0.909,5.777,0.8214,0.0522,The D+ grade means the rate of crime is higher...,"When looking at the crime map for 30008, remem..."
4,13723,GA,30009,1,12060,D-,A-,1.611,1.046,0.295,0.2294,0.04,F,58.32,54.14,1.474,2.584,0.1224,A,4.314,0.0453,0.8082,2.565,0.891,0.0044,The D- grade means the rate of crime is much h...,"When looking at the crime map for 30009, remem..."


In [100]:
print("Summary statistics of GA_listing data: ")
GA_listing.describe()

Summary statistics of GA_listing data: 


Unnamed: 0.1,Unnamed: 0,latitude,longitude,beds,baths_full,baths_half,square_footage,lot_size,year_built,special_features,unit_count,price,transaction_type,listing_status,listing_special_features
count,31064.0,31064.0,31064.0,14167.0,13978.0,5855.0,12678.0,28387.0,16620.0,31064.0,219.0,31064.0,31064.0,31064.0,31064.0
mean,15531.5,33.667087,-83.793974,3.600692,2.51395,1.103501,3216.709,19.311093,1985.574489,2.511879,3.52968,494225.0,1.0,0.958891,2.511879
std,8967.548717,0.957898,0.996623,1.366697,1.253564,0.528206,25468.56,657.400055,34.32965,34.077367,6.351536,1332321.0,0.0,0.198545,34.077367
min,0.0,30.361079,-85.522663,0.0,0.0,0.0,0.0,0.0,1700.0,0.0,1.0,1.0,1.0,0.0,0.0
25%,7765.75,33.302195,-84.428492,3.0,2.0,1.0,1450.0,0.0,1967.0,0.0,2.0,90000.0,1.0,1.0,0.0
50%,15531.5,33.825564,-84.015629,3.0,2.0,1.0,2052.0,1.0,1994.0,0.0,2.0,285000.0,1.0,1.0,0.0
75%,23297.25,34.323055,-83.461731,4.0,3.0,1.0,2952.0,3.0,2010.0,0.0,4.0,522946.8,1.0,1.0,0.0
max,31063.0,34.995653,-80.84198,71.0,56.0,10.0,2090880.0,73616.0,2023.0,514.0,89.0,162627000.0,1.0,1.0,514.0


In [101]:
print("Missing values in GA_listing data: ")
GA_listing.isnull().sum()

Missing values in GA_listing data: 


Unnamed: 0                      0
latitude                        0
longitude                       0
full_street_address             0
city                            0
county_name                     0
beds                        16897
baths_full                  17086
baths_half                  25209
square_footage              18386
lot_size                     2677
year_built                  14444
details                         0
special_features                0
unit_count                  30845
price                           0
transaction_type                0
listing_status                  0
listing_special_features        0
census_state_name               0
census_county_name              0
zip                             0
dtype: int64