In [None]:
!pip install geopandas
!pip install contextily
!pip install seaborn as sns
!pip install folium
!pip install sodapy
!pip install cartopy

import zipfile
import geopandas as gpd
import contextily as ctx
from shapely.geometry import Point
import pandas as pd
import folium
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdates
import seaborn as sns
import plotly.express as px
import pandas as pd
from sodapy import Socrata
import string
import cartopy
import cartopy.crs as crs

In [None]:
# Download dataset "wg3w-h783" from data.sfgov.org through the Socrata API.
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.sfgov.org", None)

# `limit` caps the number of rows requested from the API.
results = client.get("wg3w-h783", limit=1000000)

# Convert to pandas DataFrame
df1 = pd.DataFrame.from_records(results)

# Work on an independent copy: `df = df1` would only create a second name for the
# same object, so the in-place clean-up in later cells would also alter `df1`.
df = df1.copy()

In [None]:
# Inspect the column names of the downloaded frame.
df.columns

In [None]:
# Remove the Socrata computed-region columns and `point`; none of them is
# referenced anywhere else in this notebook.
unused_columns = [
    ':@computed_region_jwn9_ihcz',
    ':@computed_region_26cr_cadq',
    ':@computed_region_qgnn_b9vv',
    ':@computed_region_nqbw_i6c3',
    ':@computed_region_h4ep_8xdi',
    ':@computed_region_n4xg_c4py',
    ':@computed_region_jg9y_a9du',
    'point',
]
# errors='ignore' keeps the cell re-runnable: without it a second execution (or a
# download that lacks one of these columns) raises KeyError.
df.drop(columns=unused_columns, inplace=True, errors='ignore')

In [None]:
# Parse the date-like columns to datetime; values that cannot be parsed become NaT.
for date_col in ('incident_datetime', 'incident_year', 'incident_date', 'report_datetime'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [None]:
# Check the column dtypes after the datetime conversion.
df.dtypes

In [None]:
def o_str(value):
    """Return the string representation of ``value`` (via ``str``)."""
    text = str(value)
    return text

In [None]:
def o_date(value):
    """Return ``value`` as a string.

    Despite the name, no date parsing happens here: the result is ``str(value)``.
    """
    as_text = str(value)
    return as_text

In [None]:
def o_numeric(value):
    """Return ``value`` converted with ``float``; conversion errors propagate."""
    number = float(value)
    return number

In [None]:
# Cast the identifier and coordinate columns to float.
numeric_columns = [
    'incident_id', 'row_id', 'incident_code', 'incident_number',
    'cad_number', 'cnn', 'latitude', 'longitude',
]
for column in numeric_columns:
    df[column] = df[column].apply(o_numeric)

# Cast the descriptive columns to str while leaving missing values missing.
# A plain `apply(str)` turns NaN into the literal text 'nan', which hides the gap
# from the `isnull()` check and the `fillna()` calls in later cells.
text_columns = [
    'report_type_description', 'incident_category', 'incident_subcategory',
    'resolution', 'police_district', 'analysis_neighborhood',
]
for column in text_columns:
    df[column] = df[column].map(o_str, na_action='ignore')

In [None]:
# Distinct values in the `resolution` column.
df.resolution.unique()

In [None]:
# Distinct values in the `police_district` column.
df.police_district.unique()

In [None]:
# Distinct values in the `report_type_description` column.
df.report_type_description.unique()

In [None]:
# Distinct values in the `analysis_neighborhood` column.
df.analysis_neighborhood.unique()

In [None]:
# Fill the missing `filed_online` values with 'In Person': as specified on the
# dataset's website, any row not marked as filed online is assumed to have been
# filed in person.

# Assign the result back rather than calling an in-place method on a column
# selection; the chained in-place form is not guaranteed to update `df`.
df['filed_online'] = df['filed_online'].fillna('In Person')

In [None]:
# Re-check the dtypes after the numeric/string casts above.
df.dtypes

In [None]:
# Count the null values in each column.
df.isnull().sum()

In [None]:
# The original note here said these columns are over 70% null and should be
# dropped; the code keeps them and fills the gaps with placeholders instead.
# Each result is assigned back to `df`: calling `fillna(..., inplace=True)` on a
# column selection is not guaranteed to update the parent frame.

# `cad_number` and `cnn` were cast to float earlier, so use a numeric 0 to keep
# the dtype numeric (a string '0' would turn the column into mixed objects).
df['cad_number'] = df['cad_number'].fillna(0)
df['cnn'] = df['cnn'].fillna(0)

# These columns keep the string placeholder '0' used by the original code.
# NOTE(review): `intersection` and `supervisor_district` are never cast in this
# notebook; presumably they are strings as returned by the API -- confirm.
df['intersection'] = df['intersection'].fillna('0')
df['supervisor_district'] = df['supervisor_district'].fillna('0')
df['analysis_neighborhood'] = df['analysis_neighborhood'].fillna('0')

# Missing coordinates are replaced by the column mean, so such rows are all
# plotted at one average location on the maps further down.
df['latitude'] = df['latitude'].fillna(float(df['latitude'].mean()))
df['longitude'] = df['longitude'].fillna(float(df['longitude'].mean()))

In [None]:
# Reset to a clean 0..n-1 index and show the frame's shape.
# (`df.iloc[:].reindex()` built a new frame and discarded it, so it had no effect.)
df = df.reset_index(drop=True)
df.shape

In [None]:
# Give the category column a display-friendly name; later cells refer to it as
# 'Incident Category'.
df.rename(columns={'incident_category': 'Incident Category'}, inplace = True)

## Count of crimes in San Francisco by incident category

In [None]:
# Bar plot of the number of incidents in each category.

plt.rcParams['figure.figsize'] = (20, 9)
plt.style.use('fast')
# Pass the frame as `data` and name the column via `x=`: a Series passed
# positionally is taken as `data` by current seaborn, not as the x variable.
sns.countplot(data=df, x='Incident Category', palette='hot')

plt.title('Major Crimes in San Francisco', fontweight = 20, fontsize = 20)
plt.xticks(rotation = 90)
plt.show()

## Distribution of crimes in San Francisco as per days of week

In [None]:
# Distribution of crimes in San Francisco as per days of week

day_counts = df['incident_day_of_week'].value_counts()
# Size `explode` from the data: a fixed 7-element tuple raises an error whenever
# the data does not contain exactly seven distinct day values.
day_counts.plot.pie(figsize = (10, 20), explode = [0.1] * len(day_counts), autopct='%1.1f%%')
plt.axis('off')
plt.title('Crime count on each day',fontsize = 20)
# A white disc over the centre turns the pie into a donut chart.
centre_circle = plt.Circle((0,0),0.6,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# (The former `plt.xticks(rotation = 90)` was dropped: the axis is switched off,
# so tick rotation has no visible effect.)
plt.show()

In [None]:

# Get a count of the crimes per police district, returns a series
crime_neighbourhood = df.police_district.value_counts()

# Build a two-column frame: district name and number of crimes.
# Naming both columns explicitly avoids relying on the default column name of
# `value_counts()`, which is 'police_district' in pandas 1.x but 'count' in
# pandas >= 2.0 (where the old rename silently did nothing).
neighbour_df = (
    crime_neighbourhood
    .rename_axis('Neighborhood')
    .reset_index(name='No of Crimes')
)
neighbour_df

## Distribution of the number of crimes that have occurred over the last 4 years

In [None]:
# Count the incidents for every (category, year) pair and flatten the result
# into a plain frame with a `count` column.
x = (
    df.groupby(['Incident Category', 'incident_year'])[['incident_year']]
    .count()
    .rename(columns={"incident_year": "count"})
)
crime_by_years = x.reset_index()
crime_by_years

In [None]:
# Merge spelling variants and near-duplicate labels in both category columns.
# NOTE(review): `crime_by_years` is built in the preceding cell, i.e. before this
# normalisation, so the yearly bar chart still shows the raw labels.
category_fixes = {
    'Motor Vehicle Theft?': 'Motor Vehicle Theft',
    'Other Miscellaneous': 'Other',
    'Other Offenses': 'Other',
    'Weapons Offence': 'Weapons Offense',
}

for label_col in ('Incident Category', 'incident_subcategory'):
    df[label_col] = df[label_col].replace(category_fixes)

In [None]:
# Stacked bar chart: one bar per year, coloured by incident category, using the
# per-(category, year) counts in `crime_by_years`.
fig = px.bar(crime_by_years, x='incident_year', y='count', color='Incident Category', 
            title="Counts of crimes according to categories grouped over the years",
            labels={'incident_year':'Year', 'count':'Count of crimes'})
# The positional argument selects plotly's 'notebook' renderer.
fig.show('notebook')

## Average delay between an incident happening and it being reported

In [None]:
# Delay between the incident and its report, converted from seconds to days and
# rounded to whole days.
report_delay = df['report_datetime'] - df['incident_datetime']
delay_in_days = report_delay.dt.total_seconds() / 60 / 60 / 24
df['time_difference'] = delay_in_days.round(0)
df[['time_difference']]

In [None]:
# Mean reporting delay (in days) for each incident year, as a two-column frame.
x = df.groupby('incident_year')['time_difference'].mean()
time_diff_by_year = x.to_frame().reset_index()
time_diff_by_year

In [None]:
# Line chart of the mean reporting delay per year, from `time_diff_by_year`.
fig = px.line(time_diff_by_year, x='incident_year', y='time_difference',
              title="Average Time difference between incident and report time in days over the years",
              labels={'incident_year':'Year', 'time_difference':'Time difference in days'})
fig.show()

## Bar graph showing the distribution of incidents by time of occurrence

In [None]:
# Use pandas' string dtype so the `.str` accessor can be applied in the next cell.
df['incident_time']=df['incident_time'].astype('string')

In [None]:
# Keep only the part before the first ':' and store it as an integer.
# NOTE(review): presumably the values look like "HH:MM", making this the hour -- confirm.
# This cell overwrites `incident_time`, so it cannot be re-run without first
# re-running the cell above.
hour_part = df['incident_time'].str.split(':', expand=True)[0]
df['incident_time'] = hour_part.astype(int)

In [None]:
# Create the column with the placeholder text 'null'; it is overwritten with the
# real labels once `category` is applied further down.
df['incident_time_category']='null'
df['incident_time_category'].dtypes

In [None]:
# 00.00 - 06.00 : Early Morning, 6.01 - 12.00 : Morning, 12.01 - 18.00 : Evening, 18.01 - 24.00 : Night

In [None]:
def category(x):
    """Map an hour of the day to a time-of-day label.

    ``x`` is the whole hour with the minutes already discarded, so an hour value
    ``h`` stands for the interval ``h:00``-``h:59``. The bins are therefore
    half-open, keeping e.g. 06:30 (hour 6) in 'Morning' rather than 'Early Morning':

        0 <= x < 6    -> 'Early Morning'
        6 <= x < 12   -> 'Morning'
        12 <= x < 18  -> 'Evening'
        18 <= x < 24  -> 'Night'

    Returns ``None`` for any value outside 0-23.
    """
    if 0 <= x < 6:
        return 'Early Morning'
    if 6 <= x < 12:
        return 'Morning'
    if 12 <= x < 18:
        return 'Evening'
    if 18 <= x < 24:
        return 'Night'
    return None

In [None]:
# Label every incident with its time-of-day bucket.
df['incident_time_category'] = df['incident_time'].map(category)

In [None]:
# Non-null counts of every column, grouped by time-of-day bucket.
df_count = df.groupby(['incident_time_category']).count()
df_count

In [None]:
# Count rows per time-of-day bucket. After `count()` every column holds a
# non-null count, and the `incident_time` column is used as the bar height.
df_count = df.groupby(['incident_time_category']).count()
df_num = df_count.reset_index()

# (The former `plt.figure(figsize = (10, 7))` was removed: it opened an empty
# matplotlib figure whose handle was immediately overwritten by the plotly figure.)
fig = px.bar(df_num.sort_values( 'incident_time',ascending = False), 
            x='incident_time_category', 
            y='incident_time',
            labels = {"incident_time_category":"Time when the incident occured", "incident_time":"Number of incidences" }, 
            title = "Distribution of incidences as per the time of its occurance",
            color='incident_time_category', 
            color_discrete_map={'Evening': 'red','Night': 'blue','Morning': 'green','Early Morning': 'orange'})

fig.show()

Most incidents take place in the evening (between 12.01 and 18.00 hours), followed by night (between 18.01 and 24.00 hours), morning (between 06.01 and 12.00 hours) and early morning (between 00.00 and 06.00 hours).

In [None]:
# Count rows for each (category, subcategory) pair; the non-null count of
# `incident_id` is exposed as `count` for the treemap below.
df2 = (
    df.groupby(['Incident Category', 'incident_subcategory'])
    .count()
    .reset_index()
    .rename(columns={'incident_id': 'count'})
)
df2

## Treemap to understand which category has the highest number of crimes

In [None]:
# Treemap with a constant "all" root, then category, then subcategory; tile size
# is taken from the `count` column of `df2`.
fig = px.treemap(df2, path=[px.Constant("all"), 'Incident Category', 'incident_subcategory'], values='count')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

## Incident Hotspots on the map of San Francisco

In [None]:
# Scatter every incident location on a PlateCarree map.
# The projected axes is created once through `subplot_kw`; previously
# `plt.subplots` made a plain axes and `plt.axes(projection=...)` added a second
# axes to the same figure.
fig, ax = plt.subplots(figsize=(35, 35), subplot_kw={'projection': crs.PlateCarree()})
ax.coastlines()
# Extent is [west, east, south, north] in degrees.
ax.set_extent([-122.6, -122.3, 37.68, 37.85])
ax.gridlines(draw_labels=True)

ax.scatter(x=df.longitude, y=df.latitude,
           color="red", s=20, transform=crs.PlateCarree())

plt.show()

In [None]:
# Get a count of the crimes per police district, returns a series
crime_neighbourhood = df.police_district.value_counts()

# Build the two-column frame used by the choropleth: `neighborhood` (district
# name) and `No of Crimes`. Naming both columns explicitly avoids relying on the
# default column name of `value_counts()`, which is 'police_district' in pandas
# 1.x but 'count' in pandas >= 2.0 (where the old rename silently did nothing).
neighbour_df = (
    crime_neighbourhood
    .rename_axis('neighborhood')
    .reset_index(name='No of Crimes')
)
neighbour_df

## San Francisco Map using Folium

In [None]:
# Coordinates used as the map centre (San Francisco latitude and longitude values)
lat = 37.77
long = -122.42

# Create a folium map centred on those coordinates at zoom level 12
sf1_map = folium.Map(location=[lat, long], zoom_start=12)

# Leaving the map as the last expression displays it in the notebook
sf1_map

## Plotting Neighbourhoods on San Francisco Map

In [None]:
# GeoJSON with the district polygons.
sf_geo = r"https://cocl.us/sanfran_geojson"

# Create the map, centered on San Francisco with zoom level 12.
SF_map = folium.Map(location=[37.7749, -122.4194], zoom_start=12)

# Add the choropleth layer. `folium.Choropleth(...).add_to(map)` replaces the
# deprecated `Map.choropleth(...)` method, which recent folium releases no longer provide.
# NOTE(review): `key_on` matches the `neighborhood` values against the GeoJSON's
# DISTRICT property by exact string comparison -- confirm that both use the same
# spelling and letter case, otherwise no polygon is coloured.
folium.Choropleth(
    geo_data=sf_geo,
    name='Choropleth',
    data=neighbour_df,
    columns=['neighborhood', 'No of Crimes'],
    key_on='feature.properties.DISTRICT',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Crimes in SF',
).add_to(SF_map)

SF_map

References

 - https://stackoverflow.com/questions/70471888/text-as-tooltip-popup-or-labels-in-folium-choropleth-geojson-polygons

 - https://stackoverflow.com/questions/46223224/matplotlib-plot-countplot-for-two-or-more-column-on-single-plot

 - https://matplotlib.org/stable/tutorials/introductory/customizing.html

 - https://medium.com/@kvnamipara/a-better-visualisation-of-pie-charts-by-matplotlib-935b7667d77f

 - https://scitools.org.uk/cartopy/docs/latest/matplotlib/feature_interface.html
 
 - https://plotly.com/python/bar-charts/