## This notebook combines and cleans the POI CSVs pulled from Google Places

In [36]:
import pandas as pd
import numpy as np
import io
import boto3 # AWS


# The nb_black extension loaded below automatically reformats each code cell after it is run
%load_ext nb_black 

# Let pandas show up to 200 rows and 200 columns before truncating output.
# pandas is already imported as `pd` at the top of this cell, so it is not
# imported a second time here.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)


The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

Combining all of the CSVs I pulled from Google Places

In [15]:
# POI files pulled from Google Places (the numbers in each file name look like
# a zip-code range -- TODO confirm). The order is kept as-is because it
# determines the row order of the combined frame.
file_names = [
    "poi_30039_30076.csv",
    "poi_30263_30303.csv",
    "poi_30078_30113.csv",
    "poi_30114_30161.csv",
    "poi_30218_30260.csv",
    "poi_30168_30217.csv",
    "poi_30002_30038.csv",
]

# Read every file first and concatenate once. Calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration and needs an
# empty seed DataFrame to start from.
frames = [pd.read_csv(name) for name in file_names]
df = pd.concat(frames, ignore_index=True)

# save the combined DataFrame to a new CSV file (without the row index)
df.to_csv("combined_poi.csv", index=False)


<IPython.core.display.Javascript object>

In [16]:
# Count the missing values in each column of the combined frame
df.isna().sum()

place_id                   0
name                       1
rating                    12
user_ratings_total        12
latitude                   0
longitude                  0
poi_type                   0
zip_code                   0
area_radius           115627
dtype: int64

<IPython.core.display.Javascript object>

In [4]:
# Number of rows for each poi_type value in the combined frame
df["poi_type"].value_counts()

park                  8641
cafe                  8378
gym                   8180
atm                   7737
bar                   7387
university            7322
spa                   7251
restaurant            6992
store                 6864
laundry               6745
transit_station       6159
supermarket           5393
storage               4929
parking               4747
bus_station           4629
bicycle_store         4390
night_club            3838
bakery                3836
movie_theater         3707
beauty_salon          2925
clothing_store        2679
train_station         2176
library               2041
liquor_store          1779
convenience_store     1546
car_rental             667
pet_store              253
real_estate_agency     155
book_store             117
subway_station          91
light_rail_station      34
art_gallery              2
Name: poi_type, dtype: int64

<IPython.core.display.Javascript object>

In [21]:
# Take an independent copy so the cleaning steps work on df_no_na and leave
# the combined frame `df` as it was loaded
df_no_na = df.copy()

<IPython.core.display.Javascript object>

In [22]:
# Build the cleaned frame from `df` in one chain instead of mutating df_no_na
# in place, so that re-running this cell gives the same result. (The in-place
# version raised a KeyError on a second run because 'area_radius' had already
# been dropped.)
df_no_na = (
    df
    # drop the rows that are missing a name, a rating, or a rating count
    .dropna(subset=["name", "rating", "user_ratings_total"])
    # area_radius was only used for large places such as parks and universities
    .drop(columns=["area_radius"])
    # renumber the remaining rows from 0 after the drops
    .reset_index(drop=True)
)



<IPython.core.display.Javascript object>

In [23]:
# Confirm that no missing values remain in the cleaned frame
df_no_na.isna().sum()

place_id              0
name                  0
rating                0
user_ratings_total    0
latitude              0
longitude             0
poi_type              0
zip_code              0
dtype: int64

<IPython.core.display.Javascript object>

In [24]:
# Size of the cleaned frame as (rows, columns)
df_no_na.shape

(131577, 8)

<IPython.core.display.Javascript object>

In [25]:
# How many places have a rating count of exactly zero, and how many do not?
ratings_total = df_no_na["user_ratings_total"]
num_zeros = ratings_total.eq(0).sum()
num_non_zeros = ratings_total.ne(0).sum()
print(f"Number of 0 values: {num_zeros}")
print(f"Number of non-zero values: {num_non_zeros}")


Number of 0 values: 25095
Number of non-zero values: 106482


<IPython.core.display.Javascript object>

In [26]:
# Split the places at a threshold of two user ratings:
# at most two ratings versus more than two
ratings_total = df_no_na["user_ratings_total"]
num_less_than_or_equal_to_2 = ratings_total.le(2).sum()
num_greater_than_2 = ratings_total.gt(2).sum()
print(f"Number of values <= 2: {num_less_than_or_equal_to_2}")
print(f"Number of values > 2: {num_greater_than_2}")


Number of values <= 2: 35510
Number of values > 2: 96067


<IPython.core.display.Javascript object>

In [27]:
# How often each place name occurs among rows whose rating count is non-zero
has_ratings = df_no_na["user_ratings_total"].ne(0)
nonzero_ratings = df_no_na[has_ratings]
name_counts = nonzero_ratings["name"].value_counts()
print(name_counts)


Starbucks                                  570
Waffle House                               536
Dunkin'                                    481
Public Storage                             467
Kroger                                     459
                                          ... 
Sun & Bloom Co                               1
The Stock Market Country Store               1
Millionheir Kids Cartel                      1
Corner's Edge Butcher Shoppe of Conyers      1
Tesla                                        1
Name: name, Length: 26504, dtype: int64


<IPython.core.display.Javascript object>

In [28]:
# place_id should be a unique identifier, yet many rows share one.
# Count the rows whose place_id already appeared earlier in the frame.
repeated_place_id = df_no_na["place_id"].duplicated()
num_duplicates = repeated_place_id.sum()
print(f"Number of duplicates in 'place_id' column: {num_duplicates}")


Number of duplicates in 'place_id' column: 86043


<IPython.core.display.Javascript object>

In [29]:
# Count rows that are identical across every column (not just place_id).
# If this is 0 while place_id values repeat, the repeated rows must differ in
# at least one other column.
fully_duplicated = df_no_na.duplicated()
num_duplicates = fully_duplicated.sum()
print(f"Number of duplicate observations: {num_duplicates}")


Number of duplicate observations: 0


<IPython.core.display.Javascript object>

In [30]:
# Provisional choice: keep only the first row seen for each place_id.
# The original row labels are retained, so the index is no longer contiguous.
df_no_duplicates = df_no_na.drop_duplicates(subset="place_id", keep="first")


<IPython.core.display.Javascript object>

In [31]:
# Re-count place names on the de-duplicated frame, again limited to rows with
# a non-zero rating count. The Waffle House total looked surprisingly high,
# but according to Google the number is about right.
has_ratings = df_no_duplicates["user_ratings_total"].ne(0)
nonzero_ratings = df_no_duplicates[has_ratings]
name_counts = nonzero_ratings["name"].value_counts()
print(name_counts)


Waffle House                               222
Wells Fargo ATM                            217
Starbucks                                  217
Dollar General                             190
McDonald's                                 184
                                          ... 
South Cobb Regional Library                  1
Northside Branch Library                     1
Joan P. Garner Library at Ponce De Leon      1
Martin Luther King Jr. Branch                1
Tesla                                        1
Name: name, Length: 26503, dtype: int64


<IPython.core.display.Javascript object>

In [32]:
# Size of the de-duplicated frame as (rows, columns)
df_no_duplicates.shape

(45534, 8)

<IPython.core.display.Javascript object>

In [33]:
# Number of rows for each poi_type value after de-duplicating on place_id
df_no_duplicates["poi_type"].value_counts()

store                 4561
atm                   4345
bar                   3706
cafe                  3588
spa                   2890
park                  2564
gym                   2533
transit_station       2422
restaurant            2138
bakery                1916
laundry               1790
storage               1774
beauty_salon          1743
parking               1702
supermarket           1320
clothing_store        1156
university            1032
convenience_store      814
library                591
liquor_store           489
night_club             397
bicycle_store          388
movie_theater          387
bus_station            366
train_station          291
car_rental             253
pet_store              131
real_estate_agency     118
book_store              67
subway_station          58
art_gallery              2
light_rail_station       2
Name: poi_type, dtype: int64

<IPython.core.display.Javascript object>

In [34]:
# POI-type frequencies for de-duplicated places whose rating count is non-zero.
# NOTE: name_counts is reused here and holds poi_type counts, not name counts.
has_ratings = df_no_duplicates["user_ratings_total"].ne(0)
nonzero_ratings = df_no_duplicates[has_ratings]
name_counts = nonzero_ratings["poi_type"].value_counts()
print(name_counts)

store                 3782
bar                   3533
cafe                  3276
spa                   2729
gym                   2169
park                  2151
restaurant            2032
laundry               1686
bakery                1608
beauty_salon          1605
atm                   1508
storage               1395
parking               1165
supermarket           1163
clothing_store        1081
convenience_store      738
university             582
liquor_store           486
library                457
transit_station        404
bicycle_store          362
movie_theater          346
night_club             293
car_rental             244
train_station          185
bus_station            165
pet_store              131
real_estate_agency     100
book_store              64
subway_station          54
art_gallery              2
light_rail_station       1
Name: poi_type, dtype: int64


<IPython.core.display.Javascript object>

Saving to AWS

In [39]:
# Destination in S3: the bucket and the object key for the cleaned POI data
bucket_name = "capstonehaystacks"
file_name = "points-of-interest-google.csv"

# Serialise the de-duplicated DataFrame to CSV text held in memory
# (the row index is not written)
csv_buffer = io.StringIO()
df_no_duplicates.to_csv(csv_buffer, index=False)
csv_text = csv_buffer.getvalue()

# Upload through the S3 resource API and keep the response for reporting
s3R = boto3.resource("s3")
target_object = s3R.Object(bucket_name, file_name)
response = target_object.put(Body=csv_text)

# Print the ETag returned by S3 as confirmation of the upload
print(f"File uploaded to S3 with ETag: {response['ETag']}")


File uploaded to S3 with ETag: "bcb444534e874677527a4f8eec932d2f"


<IPython.core.display.Javascript object>

In [40]:
# Print every object key in the bucket to check that the new file is present
bucket = s3R.Bucket(bucket_name)
for s3_object in bucket.objects.all():
    print(s3_object.key)

GA_LISTINGS_SALES.csv
GA_LISTINGS_SALES_V2.csv
all_zips_grocery_store.json
all_zips_restaurant.json
atlanta_cbsa_zip.csv
atlanta_listings.csv
combined_poi.csv
core_geo_dataset.csv
crime_rating_zipcode.csv
elementary_schools.csv
high_schools.csv
middle_schools.csv
poi_df.csv
points-of-interest-google.csv
points-of-interest-haystacks.csv


<IPython.core.display.Javascript object>

In [44]:
# Remove the old poi_df.csv object, which the newly uploaded file replaces.
# Note: this rebinds file_name, which earlier held the key of the uploaded file.
s3_client = boto3.client("s3")
file_name = "poi_df.csv"

# Ask S3 to delete the object stored under that key in the bucket
response = s3_client.delete_object(Bucket=bucket_name, Key=file_name)

<IPython.core.display.Javascript object>

In [46]:
# Print every object key in the bucket again to check that the old file
# (poi_df.csv) is no longer listed
bucket = s3R.Bucket(bucket_name)
for s3_object in bucket.objects.all():
    print(s3_object.key)

GA_LISTINGS_SALES.csv
GA_LISTINGS_SALES_V2.csv
all_zips_grocery_store.json
all_zips_restaurant.json
atlanta_cbsa_zip.csv
atlanta_listings.csv
core_geo_dataset.csv
crime_rating_zipcode.csv
elementary_schools.csv
high_schools.csv
middle_schools.csv
points-of-interest-google.csv
points-of-interest-haystacks.csv


<IPython.core.display.Javascript object>

In [49]:
# Column names of the de-duplicated frame that was uploaded to S3
df_no_duplicates.columns

Index(['place_id', 'name', 'rating', 'user_ratings_total', 'latitude',
       'longitude', 'poi_type', 'zip_code'],
      dtype='object')

<IPython.core.display.Javascript object>