In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from shapely.geometry import Polygon
from tqdm import tqdm

import os

# Print the full path of every file found under the Kaggle input directory
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Let pandas show wide frames: up to 500 columns and 1000 characters per line
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)

import matplotlib.pyplot as plt
# Default figure size as [width, height]; edit the two numbers to resize every plot
plt.rcParams.update({"figure.figsize": [20, 10]})

In [None]:
# Download the geospatial data of Philadelphia's wards, divisions, and neighborhoods
# This is hosted on github, we will use wget to obtain it
!wget https://github.com/azavea/geo-data/archive/refs/heads/master.zip
## PPD 
# !wget https://opendata.arcgis.com/datasets/62ec63afb8824a15953399b1fa819df2_0.zip
    
# "wget" is a terminal command that basically means "download this"
# Remember we put an exclamation point in front when we want to use a terminal command instead of Python command

# Unzip the zip file
!unzip master.zip
# !unzip 62ec63afb8824a15953399b1fa819df2_0.zip

In [None]:
!wget https://opendata.arcgis.com/datasets/62ec63afb8824a15953399b1fa819df2_0.zip
!unzip 62ec63afb8824a15953399b1fa819df2_0.zip

In [None]:
# --- Boundaries extracted from the downloaded geo-data archive ---
neighborhoods = gpd.read_file(
    'geo-data-master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.shp'
)
wards = gpd.read_file(
    'geo-data-master/politcal-wards-divisions/2016/2016_Wards.shp'
)
wards_divisions = gpd.read_file(
    'geo-data-master/politcal-wards-divisions/2016/2016_Ward_Divisions.shp'
)

# --- Tabular data (plain CSV, no geometry yet) ---
tobacco = pd.read_csv(
    '/kaggle/input/phl-public-data/Tobacco_Youth_Sales_Violations.csv'
)
ppd = pd.read_csv(
    '/kaggle/input/phl-public-data/ppd_complaints.csv'
)

# --- Crime incidents, one shapefile per year ---
crime2020 = gpd.read_file(
    '../input/crime-incidents-zipshp/incidents_part1_part2/incidents_part1_part2.shp'
)
crime2019 = gpd.read_file(
    '../input/crime-incidents-zipshp/incidents_part1_part2 (1)/incidents_part1_part2.shp'
)
crime2018 = gpd.read_file(
    '../input/crime-2018/incidents_part1_part2.shp'
)

# --- Other shapefiles counted per neighborhood later on ---
schools = gpd.read_file('../input/school-data-shp/Schools.shp')
picnic_tables = gpd.read_file('../input/picnic-sites/ppr_picnic_sites.shp')
nhood_resources = gpd.read_file('../input/neighborhood-resources/NeighborhoodResources.shp')
playgrounds = gpd.read_file('../input/playgrounds/PPR_Playgrounds.shp')

# District boundaries are read from the current working directory
districts = gpd.read_file('./Boundaries_District.shp')

# All of our datasets contain longitude and latitude so let's start mapping these coordinates to neighborhoods in Philadelphia

In [None]:
def map_neighborhoods(df, neighborhood_df):
    """Label each row of ``df`` with the neighborhood that contains its centroid.

    Parameters
    ----------
    df : DataFrame
        Must have a ``'centroids'`` column whose values provide a
        ``within(polygon)`` method. The frame is modified in place: a
        ``'MAPNAME'`` column is added (or overwritten).
    neighborhood_df : DataFrame
        Must have a ``'geometry'`` column (the neighborhood shapes) and a
        ``'MAPNAME'`` column (their names).

    Returns
    -------
    DataFrame
        The first 10 rows of the updated ``df``, so a notebook cell that ends
        with this call shows a preview of the mapping.

    Notes
    -----
    Neighborhoods are tested in row order and the first match wins. Rows whose
    centroid falls inside no neighborhood get ``None``.
    """
    # Build the (shape, name) pairs once rather than once per point.
    candidates = list(zip(neighborhood_df['geometry'], neighborhood_df['MAPNAME']))

    matched_names = []
    for point_to_check in tqdm(df['centroids'].values):
        # next() stops at the first containing neighborhood; None when nothing matches
        matched_names.append(
            next(
                (name for shape, name in candidates if point_to_check.within(shape)),
                None,
            )
        )

    df['MAPNAME'] = matched_names
    return df.head(10)

In [None]:
# "EPSG:4326" is the standard identifier for plain latitude/longitude coordinates,
# so reproject the neighborhood shapes into it.
neighborhoods_latlong = neighborhoods.to_crs(crs='epsg:4326')


# Tobacco Sales To Minors

In [None]:
tobacco.head()


In [None]:
# Turn each violation's LONGITUDE/LATITUDE pair into a point geometry (EPSG:4326)
tobacco_geometry = gpd.GeoDataFrame(
    tobacco,
    geometry=gpd.points_from_xy(
        tobacco['LONGITUDE'],
        tobacco['LATITUDE'],
        crs='epsg:4326',
    ),
)

In [None]:
tobacco_geometry.head(2)

In [None]:
tobacco_geometry.shape, tobacco_geometry.drop_duplicates(['LONGITUDE','LATITUDE']).shape

In [None]:

# map_neighborhoods() reads a 'centroids' column, so derive it from the geometry first
tobacco_geometry['centroids'] = tobacco_geometry.geometry.centroid
map_neighborhoods(tobacco_geometry, neighborhoods_latlong)

In [None]:
# One row per neighborhood with the number of violation records that fell inside it
num_sales_per_neighborhood = (
    tobacco_geometry
    .groupby('MAPNAME')
    .size()
    .reset_index(name='num_tobacco_sales_to_minors')
)
num_sales_per_neighborhood.head(10)

# PPD Complaints & Districts

In [None]:
ppd

In [None]:

# Each district is assigned to the single neighborhood containing its centroid.
# NOTE(review): this assumes districts and neighborhoods_latlong use the same
# coordinate system — confirm the CRS of Boundaries_District.shp.
districts['centroids'] = districts.geometry.centroid
map_neighborhoods(districts, neighborhoods_latlong)

In [None]:
# base_map = districts['geometry'].plot(color='white', edgecolor='black')
# districts['centroids'].plot(ax=base_map, marker='o', color='red', markersize=5)

In [None]:
ppd

In [None]:
districts.head(1)

In [None]:
ppd['district_occurrence'].drop_duplicates().sort_values()

In [None]:
# Append '00' to every district code in DIST_NUMC.
# Careful: running this cell twice appends the suffix twice.
dist_numc_padded = districts['DIST_NUMC'] + '00'
districts['DIST_NUMC'] = dist_numc_padded

In [None]:
# Rows where ppd records the district as '09' are rewritten to '0900'
is_short_09 = ppd['district_occurrence'] == '09'
ppd.loc[is_short_09, 'district_occurrence'] = '0900'

In [None]:
# Print the shape before and after so added rows or columns are visible at a glance
print(ppd.shape)
ppd = ppd.merge(
    districts,
    how='left',
    left_on='district_occurrence',
    right_on='DIST_NUMC',
)
print(ppd.shape)

In [None]:
# Complaints per neighborhood, using the MAPNAME each complaint got from its district
num_complaints_per_neighborhood = (
    ppd
    .groupby('MAPNAME')
    .size()
    .reset_index(name='num_complaints_per_neighborhood')
)

num_complaints_per_neighborhood

# Crime

In [None]:
crime2020.head(1)
 


In [None]:
crime2019.head(1)

In [None]:
crime2018.head(1)

In [None]:
# The three yearly frames hold the same columns, but some names contain spaces in
# one frame and not in another, which created extra columns when the frames were
# combined. Reuse the 2020 column names for 2019 and 2018 (assigned by position).
for yearly_crime in (crime2019, crime2018):
    yearly_crime.columns = crime2020.columns

In [None]:
# Add the 'centroids' column that map_neighborhoods() expects to each yearly frame
for yearly_crime in (crime2020, crime2019, crime2018):
    yearly_crime['centroids'] = yearly_crime['geometry'].centroid



In [None]:
# Remove incidents without a centroid; they cannot be placed in a neighborhood
crime2020 = crime2020.drop(index=crime2020.index[crime2020['centroids'].isnull()])
crime2019 = crime2019.drop(index=crime2019.index[crime2019['centroids'].isnull()])
crime2018 = crime2018.drop(index=crime2018.index[crime2018['centroids'].isnull()])

In [None]:
# Stack the three yearly frames into one with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
crime = pd.concat([crime2020, crime2019, crime2018], ignore_index=True)

In [None]:
map_neighborhoods(crime,neighborhoods_latlong)

In [None]:
crime.shape

In [None]:
## A little over 3000 rows didn't map to a neighborhood 
crime.loc[crime['MAPNAME'].isnull()]

In [None]:
## Keep only the incidents that were matched to a neighborhood
has_neighborhood = crime['MAPNAME'].notnull()
crime = crime.loc[has_neighborhood]
crime.shape

In [None]:
# Number of recorded incidents per neighborhood
crime_by_nhood = (
    crime
    .groupby('MAPNAME')
    .size()
    .reset_index(name='crimes_per_neighborhood')
)

crime_by_nhood


# Schools

In [None]:
schools.head(1)

In [None]:
# Centroid of each school geometry, stored where map_neighborhoods() looks for it
schools['centroids'] = schools.geometry.centroid


In [None]:
map_neighborhoods(schools,neighborhoods_latlong)

In [None]:
# Number of schools per neighborhood
schools_per_nhood = (
    schools
    .groupby('MAPNAME')
    .size()
    .reset_index(name='schools_per_neighborhood')
)
schools_per_nhood.head(5)

# Picnic Tables

In [None]:
picnic_tables.head(5)

In [None]:
# Centroid of each picnic-site geometry, stored where map_neighborhoods() looks for it
picnic_tables['centroids'] = picnic_tables.geometry.centroid



In [None]:
map_neighborhoods(picnic_tables,neighborhoods_latlong)

In [None]:
# Number of picnic-site records per neighborhood
tables_per_nhood = (
    picnic_tables
    .groupby('MAPNAME')
    .size()
    .reset_index(name='picnic_tables_per_neighborhood')
)

# Neighborhood Resource Groups

In [None]:
nhood_resources.head(5)

In [None]:
# Centroid of each resource-group geometry, stored where map_neighborhoods() looks for it
nhood_resources['centroids'] = nhood_resources.geometry.centroid


In [None]:
map_neighborhoods(nhood_resources,neighborhoods_latlong)

In [None]:
# Number of neighborhood resource groups per neighborhood
resources_per_nhood = (
    nhood_resources
    .groupby('MAPNAME')
    .size()
    .reset_index(name='resource_groups_per_neighborhood')
)

In [None]:
# resources_per_nhood.sort_values(ascending=False, by='resource_groups_per_neighborhood').head(10).plot.bar()

# Playgrounds

In [None]:
playgrounds.head(5)

In [None]:
# Centroid of each playground geometry, stored where map_neighborhoods() looks for it
playgrounds['centroids'] = playgrounds.geometry.centroid

In [None]:
map_neighborhoods(playgrounds,neighborhoods_latlong)

In [None]:
# Number of playgrounds per neighborhood
playgrounds_per_nhood = (
    playgrounds
    .groupby('MAPNAME')
    .size()
    .reset_index(name='playgrounds_per_neighborhood')
)

# Everything on a neighborhood level -- Let's merge all the neighborhood level data into one dataframe

In [None]:
# Start from the complaint counts and outer-join every other per-neighborhood table
# on MAPNAME, so a neighborhood present in any table keeps a row.
nhood_stats = num_complaints_per_neighborhood
for per_nhood_counts in (
    num_sales_per_neighborhood,
    crime_by_nhood,
    schools_per_nhood,
    tables_per_nhood,
    resources_per_nhood,
    playgrounds_per_nhood,
):
    nhood_stats = nhood_stats.merge(per_nhood_counts, on='MAPNAME', how='outer')





In [None]:
# After the outer joins a missing count means "none recorded": store it as integer 0
for count_column in (
    'num_tobacco_sales_to_minors',
    'num_complaints_per_neighborhood',
    'crimes_per_neighborhood',
    'schools_per_neighborhood',
    'picnic_tables_per_neighborhood',
    'resource_groups_per_neighborhood',
    'playgrounds_per_neighborhood',
):
    nhood_stats[count_column] = nhood_stats[count_column].fillna(0).astype(int)

In [None]:
nhood_stats.head(10)

# Add sales per area to dataframe 

In [None]:
# Bring in each neighborhood's Shape_Area.
# Careful: re-running this cell merges Shape_Area a second time.
neighborhood_areas = neighborhoods_latlong[['Shape_Area', 'MAPNAME']]
nhood_stats = nhood_stats.merge(neighborhood_areas, on='MAPNAME', how='left')

# Despite its name, the 'area' column holds tobacco sales divided by Shape_Area
nhood_stats['area'] = (
    nhood_stats['num_tobacco_sales_to_minors'] / nhood_stats['Shape_Area']
)

# The ten neighborhoods with the highest sales-per-area value
top_sales_by_area = nhood_stats.sort_values(by='area', ascending=False).head(10)
top_sales_by_area['MAPNAME']



In [None]:
# MAPNAME is assigned on tobacco_geometry (by map_neighborhoods), never on the raw
# `tobacco` frame, so group the GeoDataFrame. The raw frame only shows that column
# if both objects happen to share storage.
(
    tobacco_geometry
    .groupby('MAPNAME')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .plot.bar(color=['red','blue', 'green', 'yellow', 'cyan', 'lavender', 'black', 'purple', 'gray', 'tan'])
)
plt.title('10 Neighborhoods With Most Tobacco Sales To Minors', fontsize=20)
plt.xlabel('Neighborhood Name', fontsize=18)
plt.ylabel('Number of Violations', fontsize=18)

In [None]:
# The ten LEGAL_NAME values with the most violation records
violations_by_store = tobacco.groupby('LEGAL_NAME').size()
top_ten_stores = violations_by_store.sort_values(ascending=False).head(10)
top_ten_stores.plot.bar()
plt.title('10 Stores With Violations', fontsize=20)
plt.xlabel('Store Legal Name', fontsize=18)

In [None]:
# Share of violations by TOBACCO_TYPE: one wedge per type, sized by its record count
tobacco_type = tobacco['TOBACCO_TYPE'].value_counts()

plt.pie(tobacco_type.values, labels=tobacco_type.index, startangle=90)
plt.title('Cigars make up 98 percent of all violations to minors\n', fontsize=22)
plt.legend()

plt.show()


# Let's explore what the distribution of tobacco sales across neighborhoods looks like

In [None]:
# Histogram of per-neighborhood violation counts (matplotlib's default binning)
plt.hist(nhood_stats['num_tobacco_sales_to_minors'])
plt.title('Distribution of Tobacco Sales To Minors In Neighborhoods', fontsize=20)
# Label the x axis too, so the figure can be read on its own
plt.xlabel('Number of Violations per Neighborhood', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.show()




# Now let's check the rest of our variables 

In [None]:

# One histogram per neighborhood-level count, laid out on a 3x2 grid.
# Each entry is (column in nhood_stats, panel title), listed in grid order.
distribution_panels = [
    ('crimes_per_neighborhood', "CRIMES"),
    ('schools_per_neighborhood', "SCHOOLS"),
    ('picnic_tables_per_neighborhood', "PICNIC TABLES"),
    ('resource_groups_per_neighborhood', "RESOURCE GROUPS"),
    ('num_complaints_per_neighborhood', "PPD COMPLAINTS"),
    ('playgrounds_per_neighborhood', "PLAYGROUNDS"),
]

for panel_number, (column, panel_title) in enumerate(distribution_panels, start=1):
    plt.subplot(3, 2, panel_number)
    plt.hist(nhood_stats[column])
    plt.title(panel_title, fontsize=18)

plt.suptitle("DISTRIBUTIONS OF VARIABLES ACROSS NEIGHBORHOODS\n", fontsize=20)
plt.tight_layout()
plt.show()

# Let's see if we can spot any trends between tobacco sales to minors and the other variables

In [None]:
# Scatter each neighborhood-level count (x) against tobacco sales to minors (y)
# on a 3x2 grid. The y series is defined once, so no panel depends on a value
# left over from the previous one.
tobacco_sales = nhood_stats['num_tobacco_sales_to_minors']

# Each entry is (column in nhood_stats, panel title), listed in grid order.
scatter_panels = [
    ('crimes_per_neighborhood', "CRIMES"),
    ('schools_per_neighborhood', "SCHOOLS"),
    ('picnic_tables_per_neighborhood', "PICNIC TABLES"),
    ('resource_groups_per_neighborhood', "RESOURCE GROUPS"),
    ('num_complaints_per_neighborhood', "PPD COMPLAINTS"),
    ('playgrounds_per_neighborhood', "PLAYGROUNDS"),
]

for panel_number, (column, panel_title) in enumerate(scatter_panels, start=1):
    plt.subplot(3, 2, panel_number)
    plt.scatter(nhood_stats[column], tobacco_sales)
    plt.title(panel_title, fontsize=18)

plt.suptitle("VARIABLES IN RELATION TO TOBACCO SALES TO MINORS\n", fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Correlate the numeric columns only. nhood_stats also holds the string column
# MAPNAME, and pandas >= 2.0 raises if a non-numeric column reaches corr().
# select_dtypes is used because it works on older and newer pandas alike.
ab = nhood_stats.select_dtypes(include='number').corr()

# Rank every variable by its correlation with tobacco sales to minors
ab['num_tobacco_sales_to_minors'].sort_values(ascending=False).reset_index().rename(columns={'index':'observation', 'num_tobacco_sales_to_minors':'correlation to tobacco sales to'})

# Crimes per neighborhood seems to have a strong positive correlation with the sale of tobacco to minors. Let's examine this data more closely

In [None]:
# Shared look for both maps: white neighborhoods with black outlines, small red dots
outline_style = dict(color='white', edgecolor='black')
dot_style = dict(marker='o', color='red', markersize=5)

# Map 1: tobacco violation locations drawn over the neighborhood outlines
base_map = neighborhoods_latlong['geometry'].plot(**outline_style)
tobacco_geometry['centroids'].plot(ax=base_map, **dot_style)
plt.title('Tobacco Sales To Minors', fontsize=20)

# Map 2: crime incident locations drawn over the same outlines
base_map2 = neighborhoods_latlong['geometry'].plot(**outline_style)
crime['centroids'].plot(ax=base_map2, **dot_style)
plt.title('Crimes Per Neighborhood', fontsize=20)

plt.show()

In [None]:
# The ten most frequent (text_gener, neighborhood) combinations
incidents_by_type_and_nhood = crime.groupby(['text_gener', 'MAPNAME']).size()
incidents_by_type_and_nhood.sort_values(ascending=False).head(10).plot.bar()
plt.title('Top 10 Crimes/Neighborhoods', fontsize=20)

# Some summary statistics on the Neighborhood Stats (nhood_stats) dataframe

In [None]:
nhood_stats.describe()

In [None]:
# We need to loop through all of our rows to add markers to the map
# NOTE(review): newer folium releases may no longer accept the built-in name
# "Stamen Terrain" — confirm against the installed folium version.
philly_map = folium.Map(location = [39.952583, -75.165222], tiles = "Stamen Terrain", zoom_start = 11)
# Don't plot all of them... my computer crashes if I try to
# It is over 4,000 markers - and over 1,000 if we use groupby
# Let's plot every 10th marker

# One row per distinct location/retailer combination, with its number of violation records
tobacco_counts = tobacco_geometry.groupby(['LATITUDE','LONGITUDE','LEGAL_NAME','TRADE_NAME','RETAILER_TYPE','ADDRESS_LINE_1']).size().to_frame('num_violations').reset_index(drop=False)

# total must be the length of the frame being iterated (tobacco_counts), otherwise
# the progress bar can never reach 100%
for row_idx, row in tqdm(tobacco_counts.iterrows(), total=len(tobacco_counts)):
    # reset_index() gave tobacco_counts a 0..n-1 index, so this keeps every 10th row
    if row_idx % 10 != 0:
        continue
    lat = row['LATITUDE']
    lon = row['LONGITUDE']
    # Popup text: one line per field, separated by HTML line breaks
    full_text = "<br>".join([
        "Number of Violations: " + str(row['num_violations']),
        "Legal Name: " + str(row['LEGAL_NAME']),
        "Trade Name: " + str(row['TRADE_NAME']),
        "Retailer Type: " + str(row['RETAILER_TYPE']),
        "Address: " + str(row['ADDRESS_LINE_1']),
    ])
    folium.Marker([lat, lon], popup=full_text).add_to(philly_map)
philly_map