In [None]:
import numpy as np
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
%matplotlib inline
import pandas as pd
import censusdata
# Notebook-wide pandas display settings, addressed by their full option names
# (abbreviated names such as 'float_format' can become ambiguous if pandas adds
# another option with a similar name).
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
pd.set_option('display.max_colwidth', None)
# '{:f}'.format renders floats in fixed-point notation (six decimals) instead of
# scientific notation.
pd.set_option('display.float_format', '{:f}'.format)

In [None]:
# Search the ACS 5-year (2018) catalogue on the 'concept' field for "median household income".
sample = censusdata.search('acs5', 2018,'concept', 'median household income')

In [None]:
# Number of entries returned by the search above.
len(sample)

In [None]:
# Preview the first ten search results.
sample[0:10]

In [None]:
# Print the listing of ACS 5-year (2018) table B15003.
# NOTE(review): the variables downloaded later in this notebook come from tables B02001 and B19013,
# not B15003 — confirm this is the table that was meant to be inspected.
censusdata.printtable(censusdata.censustable('acs5', 2018, 'B15003'))

In [None]:
# List every state-level geography available for ACS 5-year 2018.
censusdata.geographies(censusdata.censusgeo([('state', '*')]), 'acs5', 2018)

In [None]:
# List all counties of state '04' — the state queried for Phoenix in the download cell below.
# (The Las Vegas download uses state '32', so this is not the Nevada listing.)
censusdata.geographies(censusdata.censusgeo([('state', '04'), ('county', '*')]), 'acs5', 2018)


In [None]:
# Tract-level ACS 5-year (2018) download for the two study counties.
# `col` maps the raw ACS variable codes to the column names used in the rest of the notebook.
col = {'B02001_001E': 'pop_2018', 'B19013_001E': 'median_income'}


def _download_tracts(state_fips, county_fips):
    """Download both ACS variables for every tract of one county and rename them via `col`."""
    tracts = censusdata.censusgeo([('state', state_fips), ('county', county_fips), ('tract', '*')])
    raw = censusdata.download('acs5', 2018, tracts, ['B02001_001E', 'B19013_001E'])
    # reset_index moves the geography index into a regular column named 'index'
    return raw.reset_index().rename(columns=col)


# Clark County (state 32, county 003) — Las Vegas
vegas_census = _download_tracts('32', '003')
# State 04, county 013 — Phoenix
phnx_census = _download_tracts('04', '013')



In [None]:
# Row/column counts of the two downloads, Las Vegas first.
for frame in (vegas_census, phnx_census):
    print(frame.shape)
# ACS variable codes for reference (only B19013_001E and B02001_001E are downloaded above):
# 'Median_Income' = B19013_001E, 'Total_Pop' = B02001_001E, 'White_Pop' = B02001_002E, 'African_American_Pop' = B02001_003E,
#          'Native_American_Pop' = B02001_004E, 'Asian_Pop' = B02001_005E, 'Hawain_Pacisld_Pop'= B02001_006E

In [None]:
# Stack the Las Vegas and Phoenix tract tables into a single frame.
frames = [vegas_census, phnx_census]
census_df = pd.concat(frames)
# Show the first rows, then the combined shape.
for preview in (census_df.head(), census_df.shape):
    print(preview)

In [None]:
# Write the combined census table to disk and immediately read it back.
# NOTE(review): the next cell applies `.str` methods to the former index column, so this round trip
# presumably exists to turn that column into plain strings — confirm.
# NOTE(review): this path starts with 'yelp_data/' while the shapefile cell reads from 'Yelp_data/';
# on a case-sensitive filesystem these are different directories — confirm which spelling is correct.
census_df.to_csv('yelp_data/census_df.csv', index =False)
census_df = pd.read_csv('yelp_data/census_df.csv')

In [None]:
# Split the comma-separated geography label into separate tract and county columns.
census_df = census_df.rename({'index':'census_info'}, axis =1)
info_parts = census_df['census_info'].str.split(',', expand=True)
x = {0: 'census_tract', 1: 'county'}
# Keep the first two pieces by position (instead of dropping columns 2 and 3 by label, which
# breaks if the label ever splits into a different number of pieces), and strip the blank
# that splitting on ',' alone leaves in front of the county name.
tract_county = info_parts[[0, 1]].apply(lambda part: part.str.strip()).rename(columns=x)
census_df = pd.concat([census_df[['pop_2018', 'median_income']], tract_county], axis=1)

In [None]:
# Remove the literal text 'Census Tract ' so only the tract number remains.
# regex=False makes this a plain substring replacement on every pandas version
# (the default value of `regex` changed in pandas 2.0).
census_df['census_tract'] = census_df['census_tract'].str.replace('Census Tract ', '', regex=False)
census_df.head()

In [None]:
# Treat -666666666 as a missing-value sentinel and turn it into NaN in every column.
census_df = census_df.replace(-666666666, np.nan)
# Impute missing median incomes with the column mean. The result is assigned back to the
# column: calling fillna(inplace=True) on a selected column is chained assignment, which
# warns on pandas 2.x and no longer updates the parent frame under copy-on-write.
census_df['median_income'] = census_df['median_income'].fillna(census_df['median_income'].mean())
census_df.describe()

In [None]:
#census_df = census_df[census_df['median_income']!= 'NaN']

In [None]:
# # To check If object column contains mixed data type 
# census_df['median_income'].apply(type).value_counts()

In [None]:
# #getting shape file for vegas
# shp_path_vegas =('Yelp_data/shape/tl_2019_32_tract.shp')
# sf_vegas=shp.Reader(shp_path_vegas)
# #getting shape file for Pheonix
# shp_path_phx =('Yelp_data/shape_phx/tl_2019_04_tract.shp')
# sf_phx=shp.Reader(shp_path_phx)

In [None]:
# #Changing shape file into pandas data file
# def read_shapefile(sf):
#     """
#     Read a shapefile into a Pandas dataframe with a 'coords' 
#     column holding the geometry information. This uses the pyshp
#     package
#     """
#     fields = [x[0] for x in sf.fields][1:]
#     records = sf.records()
#     shps = [s.points for s in sf.shapes()]
#     df = pd.DataFrame(columns=fields, data=records)
#     df = df.assign(coords=shps)
#     return df
# #applying function to get the df
# df_vegas = read_shapefile(sf_vegas)
# df_phx = read_shapefile(sf_phx)


In [None]:
# #checking numbr of shapes in shape files in
# len(sf_vegas.shapes())
# #exploring one of the shape (or records)
# sf_vegas.records()[1]

# Working with shapefiles and GeoDataFrames

In [None]:
# Read the two tract shapefiles (state 32 and state 04) as geopandas GeoDataFrames.
# NOTE(review): these paths start with 'Yelp_data/' while every CSV path in this notebook uses
# 'yelp_data/'; on a case-sensitive filesystem only one spelling can match — confirm.
df_vegas = gpd.read_file('Yelp_data/shape/tl_2019_32_tract.shp')
df_phx = gpd.read_file('Yelp_data/shape_phx/tl_2019_04_tract.shp')

In [None]:
# Load the restaurant table.
# NOTE(review): the same file is read again in the cell that builds the point geometry,
# so one of the two reads looks redundant — confirm.
business_resturants = pd.read_csv('yelp_data/business_resturants.csv')

In [None]:
# Check which class gpd.read_file returned.
type(df_vegas)

In [None]:
# Restrict each state-wide tract file to the study county:
# COUNTYFP '003' in the state-32 file (Las Vegas) and '013' in the state-04 file (Phoenix).
df_vegas = df_vegas.loc[df_vegas['COUNTYFP'].eq('003')]
df_phx = df_phx.loc[df_phx['COUNTYFP'].eq('013')]

In [None]:
# Stack the Las Vegas and Phoenix tract geometries into one frame.
frames_shp = [df_vegas, df_phx]
shape_df = pd.concat(frames_shp)


In [None]:
s={'STATEFP':'state', 'NAME':'census_tract', 'GEOID':'geoid'}
shape_df =shape_df.drop(['TRACTCE','NAMELSAD','MTFCC','FUNCSTAT','ALAND','AWATER', 'COUNTYFP', 'INTPTLAT', 'INTPTLON' ], axis =1).rename(columns=s)
# #changing numbers to state initials
shape_df['state'] = shape_df['state'].map({'32': 'NV', '04': 'AR'})


In [None]:
#Merging income from census and geodata for tract, to preserve the geodata we need to keep deo df at left during merge
income_df = shape_df.merge(census_df, on='census_tract')



In [None]:
#rearranging the column order
c_list=['geoid','state','county', 'census_tract', 'pop_2018','median_income', 'geometry' ]
income_df = income_df[c_list]



In [None]:
#Getting state specific info 
# tract_vegas = income_df[income_df['state']=='NV']
# tract_phnx = income_df[income_df['state']=='AR']

In [None]:
tract_vegas = income_df[income_df['state']=='NV']
# fig, ax =plt.subplots(figsize =(12,10))
# tract_vegas.plot(color="Grey", ax=ax);

In [None]:
tract_phnx = income_df[income_df['state']=='AR']
# fig, ax =plt.subplots(figsize =(12,10))
# tract_phnx.plot(color="Grey", ax=ax);

In [None]:
from shapely.geometry import Point
business_resturants= pd.read_csv('yelp_data/business_resturants.csv')
# Build one point per restaurant from its coordinates (x = longitude, y = latitude).
# gpd.points_from_xy is geopandas' vectorised constructor; it replaces a Python-level
# Point(...) call per row made through DataFrame.apply.
business_resturants['geometry'] = gpd.points_from_xy(
    business_resturants['longitude'].astype(float),
    business_resturants['latitude'].astype(float),
)

In [None]:
# Wrap the restaurant table as a GeoDataFrame carrying the same CRS as income_df,
# so both inputs of the spatial join below agree on their CRS.
# NOTE(review): this only labels the points with income_df's CRS, it does not reproject —
# confirm the restaurant longitude/latitude values are expressed in that CRS.
business_geo = gpd.GeoDataFrame(
    business_resturants,
    geometry=business_resturants['geometry'],
    crs=income_df.crs,
)

In [None]:
# Spatially join every restaurant to the census tract polygon that contains it.
# geopandas renamed sjoin's `op` keyword to `predicate` (0.10) and later removed `op`,
# so try the current spelling first and fall back to the old one for older installs.
try:
    yelp_census = gpd.sjoin(business_geo, income_df, predicate='within')
except TypeError:
    yelp_census = gpd.sjoin(business_geo, income_df, op ='within')
# Drop the join's index_right column and the tract-side copy of `state`
# (the restaurant-side copy remains as `state_left`).
yelp_census= yelp_census.drop([ 'index_right', 'state_right'], axis =1)
# Round price_range to whole numbers.
yelp_census['price_range'] = yelp_census['price_range'].round()

In [None]:
# Export the joined restaurant + census table (row index not written).
yelp_census.to_csv('yelp_data/yelp_compiled.csv', index =False)

In [None]:
# Slimmer copy of yelp_census without the columns listed below; this is the frame
# the mapping cells read.
yelp_census_geo = yelp_census.drop(
    columns=[
        'romantic', 'intimate', 'touristy', 'hipster', 'divey',
        'classy', 'trendy', 'upscale', 'casual', 'new_categories',
    ]
)

# Mapping

In [None]:
# Save the slimmed table used for mapping (row index not written).
yelp_census_geo.to_csv('yelp_data/yelp_census_geo.csv', index =False)

In [None]:
# Reload the mapping table from disk; pd.read_csv returns a plain DataFrame with a fresh 0..n-1 index.
# NOTE(review): the geometry column presumably comes back as text rather than geometry objects — confirm if it is needed later.
yelp_census_geo =pd.read_csv('yelp_data/yelp_census_geo.csv')

In [None]:
# Horizontal bar chart of stars per restaurant name, saved to top20_restaurants.png.
# NOTE(review): `top_restaurants` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel unless it is created elsewhere — confirm and add its definition.
fig, ax = plt.subplots(figsize=(12,10))
sns.barplot(x = 'stars', y = 'name', data=top_restaurants, ax= ax);
plt.savefig('top20_restaurants.png')
plt.show()

In [None]:
import plotly.graph_objs as go
import plotly.express as px
from ipywidgets import widgets

In [None]:

# Read the Mapbox access token from the environment instead of committing it to the notebook.
# Set MAPBOX_ACCESS_TOKEN before starting the kernel; if it is missing, the lookup raises
# KeyError naming the variable.
# NOTE(review): the token that was hard-coded on this line remains in version history and
# should be rotated.
import os
px.set_mapbox_access_token(os.environ["MAPBOX_ACCESS_TOKEN"])

In [None]:
# Show the colour values that make up plotly's sequential Plasma scale.
print(px.colors.sequential.Plasma)

In [None]:
# Income map for Las Vegas: one marker per NV restaurant, coloured by income_range.
vegas_geo = yelp_census_geo.loc[yelp_census_geo['state_left'] == 'NV']
px.scatter_mapbox(
    vegas_geo,
    lat="latitude",
    lon="longitude",
    color="income_range",
    color_continuous_scale=px.colors.sequential.Bluered,
    hover_data=['name', 'census_tract', 'median_income', 'category'],
    size_max=30,
    zoom=10,
    width=1200,
    height=800,
)


In [None]:
# Price-range map for Las Vegas: one marker per NV restaurant, coloured by price_range.
vegas_geo = yelp_census_geo.loc[yelp_census_geo['state_left'] == 'NV']
px.scatter_mapbox(
    vegas_geo,
    lat="latitude",
    lon="longitude",
    color="price_range",
    # color_continuous_scale=px.colors.sequential.Bluered,
    hover_data=['name', 'census_tract', 'median_income', 'category'],
    size_max=30,
    zoom=8,
    width=1200,
    height=800,
)


In [None]:
# Income map for Phoenix: one marker per AZ restaurant, coloured by income_range.
phnx_geo = yelp_census_geo.loc[yelp_census_geo['state_left'] == 'AZ']
px.scatter_mapbox(
    phnx_geo,
    lat="latitude",
    lon="longitude",
    color="income_range",
    color_continuous_scale=px.colors.sequential.Bluered,
    size_max=15,
    zoom=10,
    width=1200,
    height=800,
)


In [None]:
# Price-range map for Phoenix: one marker per AZ restaurant, coloured by price_range.
phnx_geo = yelp_census_geo.loc[yelp_census_geo['state_left'] == 'AZ']
px.scatter_mapbox(
    phnx_geo,
    lat="latitude",
    lon="longitude",
    color="price_range",
    # color_continuous_scale=px.colors.sequential.Bluered,
    hover_data=['name', 'income_range', 'median_income'],
    size_max=30,
    zoom=10,
    width=1200,
    height=800,
)


In [None]:
# tract_vegas.crs

In [None]:
# Build a GeoDataFrame of Las Vegas businesses whose CRS is copied from tract_vegas.
# NOTE(review): `business_vegas` is not defined in any cell visible in this notebook, so this line
# raises NameError on a fresh kernel unless it is created elsewhere — confirm.
# NOTE(review): `vegas_geo` is reassigned from yelp_census_geo in the next code cell, so this
# result appears to be unused — confirm and consider deleting the cell.
vegas_geo = gpd.GeoDataFrame(business_vegas, crs =tract_vegas.crs, geometry =business_vegas['geometry'])

In [None]:
#combining census tract and yelp business geodatas
# yelp_vegas_tract = gpd.sjoin(vegas_geo, tract_vegas, op ='within')
# yelp_vegas_tract.head()

In [None]:
# Category map for Las Vegas: one marker per NV restaurant, coloured by category.
vegas_geo = yelp_census_geo.loc[yelp_census_geo['state_left'] == 'NV']
px.scatter_mapbox(
    vegas_geo,
    lat="latitude",
    lon="longitude",
    color="category",
    hover_data=['name', 'census_tract', 'median_income', 'category'],
    size_max=30,
    zoom=10,
    width=1200,
    height=800,
)


In [None]:
# Price-range map for Las Vegas restaurants.
# The filter previously referenced the misspelled name `yelp_censu_geos` (NameError) and
# indexed `yelp_census` with a mask built from a different frame. Frame and mask now both
# come from yelp_census_geo, as in the other map cells; yelp_census_geo was reloaded from
# CSV, so its index no longer lines up with yelp_census.
vegas_geo = yelp_census_geo[yelp_census_geo['state_left']=='NV']
px.scatter_mapbox(vegas_geo, lat="latitude", lon="longitude", color="price_range", 
                   hover_data= ['name', 'census_tract','median_income', 'category'],size_max=30, zoom=10, width=1200, height=800)


In [None]:
#vegas_geo =yelp_census[yelp_census['state_left']=='NV'].groupby('census_tract')

In [None]:
# Preview the Las Vegas subset that the clustering cells below operate on.
vegas_geo.head()

# K-Means clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow curve: fit K-Means for k = 1..9 on the restaurant coordinates and plot each model's score.
K_clusters = range(1,10)
# Fixed random_state so the curve is reproducible from run to run.
kmeans = [KMeans(n_clusters=i, random_state=42) for i in K_clusters]
# Score on the same two features the final clustering uses (longitude and latitude).
# Previously only latitude was fitted and the longitude frame was built but never used.
coords = vegas_geo[['longitude', 'latitude']]
score = [model.fit(coords).score(coords) for model in kmeans]
# Visualize
plt.plot(K_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

In [None]:
# Cluster the Las Vegas restaurants into 4 groups by their coordinates.
# Work on an explicit copy: vegas_geo comes from boolean filtering of another frame, and
# adding a column to such a slice triggers pandas' SettingWithCopyWarning.
vegas_geo = vegas_geo.copy()
geo = vegas_geo[['longitude','latitude']]
# Fixed random_state so cluster labels are reproducible from run to run.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
# fit_predict fits the model and returns the label of every row in one call, replacing the
# separate fit() followed by predict() on the same data.
vegas_geo['cluster'] = kmeans.fit_predict(geo)


In [None]:
# The cluster column is already assigned in the previous cell, so the duplicate
# kmeans.predict call that used to be here was dropped; just preview the labelled rows.
vegas_geo.head()

In [None]:
# Map of Las Vegas restaurants coloured by their K-Means cluster label.
px.scatter_mapbox(
    vegas_geo,
    lat="latitude",
    lon="longitude",
    color="cluster",
    hover_data=['name', 'census_tract', 'price_range', 'category'],
    zoom=10,
    width=1200,
    height=800,
)