In [None]:
import pandas as pd
import numpy as np

In [None]:
from bokeh.plotting import figure, show
from bokeh.models import NumeralTickFormatter, ColumnDataSource
from bokeh.palettes import mpl,Spectral8,magma
from bokeh.core.properties import value 
from bokeh.layouts import row,column,WidgetBox

In [None]:
from bokeh.io import output_notebook,push_notebook,show,curdoc
# Configure Bokeh so that show() renders plots inline in this notebook.
output_notebook()

## Data Acquisition and Reading

In [None]:
# Render DataFrames as HTML tables in notebook output.
# Use the exact option key: the dotted spelling only resolved because pandas
# falls back to regex-matching option names, which breaks as soon as a second
# option happens to match the pattern.
pd.set_option('display.notebook_repr_html', True)

In [None]:
!wc -l data.csv

The downloaded data is stored in Data.csv

In [None]:
# The CSV is read in two consecutive pieces that share the same column and
# date settings, so those settings live in one place.
# parse_dates=[[4,5]] combines columns 4 and 5 into one datetime column and
# index_col=0 then uses that combined column as the index.
# NOTE(review): the nested-list form of parse_dates is deprecated in recent
# pandas releases - confirm against the pandas version in use.
csv_path='../Data.csv'
first_part_rows=1107513
read_opts=dict(parse_dates=[[4,5]],index_col=0,usecols=list(range(17)))

# First piece: header line plus the first `first_part_rows` data rows.
df_raw1=pd.read_csv(csv_path,nrows=first_part_rows,**read_opts)
# Second piece: skip the header line and everything already read; what is
# left has no header row of its own, hence header=None.
df_raw2=pd.read_csv(csv_path,header=None,skiprows=first_part_rows+1,**read_opts)

# Give the header-less second piece the column labels of the first one.
col_names=df_raw1.columns.to_list()
df_raw2.columns=col_names
df_raw2.index.name='Date'
df_raw1.index.name='Date'

# Stack both pieces and cache the combined frame on disk.
df_raw=pd.concat([df_raw1,df_raw2])
df_raw.to_pickle(path='Data.pkl')

In [None]:
# Optional shortcut: reload the cached frame written above instead of re-parsing the CSV.
#df_raw=pd.read_pickle('Data.pkl')

In [None]:
# Preview the first rows of the combined frame.
df_raw.head()

## Data cleaning

In [None]:
# Count the missing values in each column.
df_raw.isnull().sum()

In [None]:
# Show the rows that have no PdDistrict value.
df_raw[df_raw['PdDistrict'].isnull()]

In [None]:
# Fill the missing PdDistrict values with 'SOUTHERN'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which can operate on a temporary copy (and
# always does under pandas Copy-on-Write), silently leaving df_raw unchanged.
df_raw['PdDistrict']=df_raw['PdDistrict'].fillna('SOUTHERN')

In [None]:
# Convert the text columns to Title Case, one column at a time.
for _text_col in ('Category','Descript','PdDistrict','Resolution'):
    df_raw[_text_col]=df_raw[_text_col].str.title()

In [None]:
# Keep only the columns listed here and persist that subset as a pickle.
app_columns=['Category','DayOfWeek','PdDistrict','Resolution','PdId']
df_app=df_raw[app_columns]
df_app.to_pickle(path='appdata.pkl')

## Data Analysis and Visualization

#### The complete user-interactive visualization is available in the main.py file

#### Bar Graphs for different variables

In [None]:
def create_group_bar(var):
    """
    Show a bar graph of the number of crimes for each value of a column.

    Rows of the global ``df_raw`` are grouped by ``var``, the non-null
    ``PdId`` entries of each group are counted, and the bars are drawn in
    descending order of that count.

    Parameters
    ----------
    var : str
        Name of the ``df_raw`` column to group by (e.g. ``'DayOfWeek'``).

    Returns
    -------
    None
        The figure is rendered with ``show``; nothing is returned.
    """

    df_grouped=df_raw.groupby(var)[['PdId']].count().sort_values('PdId',ascending=False)
    x=df_grouped.index.tolist()

    # One palette colour per bar.
    my_palette=magma(len(df_grouped))
    bar=figure(title=f'Bar graph of crimes wrt {var}', height=400, width=700, x_range=x)
    bar.vbar(x=x, top=df_grouped['PdId'], bottom=0, width=0.5, color=my_palette)
    bar.xaxis.axis_label=f'{var}'
    bar.yaxis.axis_label='Number of crimes'
    # Bokeh interprets a numeric orientation as radians, so a 45 degree tilt
    # is pi/4 (the literal 45 would be taken as 45 radians).
    bar.xaxis.major_label_orientation=np.pi/4
    # Thousands separators on the count axis.
    bar.yaxis.formatter=NumeralTickFormatter(format='0,0')
    show(bar)

In [None]:
# Crime counts grouped by day of the week.
create_group_bar('DayOfWeek')

In [None]:
# Crime counts grouped by crime category.
create_group_bar('Category')

In [None]:
# Crime counts grouped by police district.
create_group_bar('PdDistrict')

In [None]:
# Crime counts grouped by resolution.
create_group_bar('Resolution')

#### Some numbers w.r.t. 'Resolution'
##### 'Resolution' is broadly categorized into 'None', 'Adult' and 'Juvenile' 

In [None]:
# Flag the rows whose Resolution text contains 'None' and report what share
# of all rows they make up.
none_mask=df_raw['Resolution'].str.contains('None')
none_count=len(df_raw[none_mask])
total_count=len(df_raw)
none_reso=(none_count/total_count)*100
print(f'The percentage of None resolutions is {none_reso:0.2f}')

In [None]:
# Rows whose Resolution text contains 'Juvenile'.
juv_mask=df_raw['Resolution'].str.contains('Juvenile')
df_juv=df_raw[juv_mask]
# Every row that is neither 'Juvenile' nor 'None' is treated as adult.
df_adult=df_raw[~juv_mask & ~none_mask]

# Juvenile rows expressed as a percentage of adult rows.
juv_count=len(df_juv)
adult_count=len(df_adult)
juv_reso=(juv_count/adult_count) *100
print(f'The percentage of crimes committed by juveniles wrt adults are {juv_reso:0.2f}')

In [None]:
# Rows with a 'None' resolution.
df_none=df_raw[none_mask]

def pct_func_none(g):
    """Return the group's non-null PdId count as a percentage of all df_none rows."""
    return (g['PdId'].count()/len(df_none)) * 100

# One percentage per district; to_frame() leaves the single column labelled 0.
none_pct_by_district=df_none.groupby('PdDistrict').apply(pct_func_none)
df_none_pct=none_pct_by_district.to_frame()

In [None]:
def pct_func_all(g):
    """Return the group's non-null PdId count as a percentage of all df_raw rows."""
    return (g['PdId'].count()/len(df_raw)) * 100

# Share of all crimes per district, next to the share of 'None'-resolution crimes.
all_pct_by_district=df_raw.groupby('PdDistrict').apply(pct_func_all)
df_dist_pct=all_pct_by_district.to_frame().rename(columns={0:'All_crime'})
df_dist_pct['Crime_none_reso']=df_none_pct[0]
# From this line on df_dist_pct is a Styler (two-decimal display), not a DataFrame.
df_dist_pct=df_dist_pct.style.format("{:.2f}")

#### Percentage of all crimes per district, and percentage of crimes with 'None' Resolution per district

In [None]:
# Display the per-district percentage table.
df_dist_pct

#### Adult Vs Juvenile

In [None]:
# Outer join of the adult and juvenile rows on the timestamp index.
# It is kept unchanged but no longer feeds df_monthly.
df_adjuv=pd.merge(df_adult, df_juv, how='outer', right_index=True, left_index=True, suffixes=('_adult','_juv')).fillna(0)

# Monthly number of crimes per group.
# Count the PdId entries of each group per month instead of summing the merged
# frame: PdId is an identifier, so its sum is not a crime count, and the
# index-on-index outer join repeats rows whenever a timestamp occurs in both
# groups. Months present in only one group are filled with 0.
df_monthly=pd.concat(
    [df_adult['PdId'].resample('M').count().rename('PdId_adult'),
     df_juv['PdId'].resample('M').count().rename('PdId_juv')],
    axis=1,
).sort_index().fillna(0)

In [None]:
# Monthly adult vs juvenile crime counts on a shared datetime axis.
line_adjuv=figure(title='Line graph of Adults Vs Juvenile crimes',height=500, width=750,x_axis_type='datetime')
x = df_monthly.index.tolist()
# Label both lines so the two colours can be told apart in the plot.
line_adjuv.line(x, df_monthly['PdId_adult'], line_color='red', legend='Adult')
line_adjuv.line(x, df_monthly['PdId_juv'], line_color='green', legend='Juvenile')
line_adjuv.xaxis.axis_label='Date'
line_adjuv.yaxis.axis_label='Number of crimes'
show(line_adjuv)

#### None vs Adult vs Juvenile crime among selected districts and categories

In [None]:
def _ranked_labels(column):
    """Return the labels of `column` as a Series, ordered by descending non-null PdId count."""
    counts=df_raw.groupby(column)['PdId'].count()
    return pd.Series(counts.sort_values(ascending=False).index)

# Districts and categories ranked from most to fewest recorded crimes.
dist=_ranked_labels('PdDistrict')
cat=_ranked_labels('Category')
print(dist)
print(cat)

In [None]:
# Restrict to the districts at positions 0 and 4 of `dist` and the categories
# at positions 0, 3 and 5 of `cat` (both are ordered by descending crime count).
df_naj=df_raw[(df_raw['PdDistrict'].isin(dist[[0,4]])) & (df_raw['Category'].isin(cat[[0,3,5]]))]
n_mask=df_naj['Resolution'].str.contains('None')
j_mask=df_naj['Resolution'].str.contains('Juvenile')
# Monthly counts for 'None', adult (neither 'None' nor 'Juvenile') and juvenile rows.
df_n=df_naj[n_mask].resample('M')[['PdId']].count()
df_a=df_naj[~((n_mask) | (j_mask))].resample('M')[['PdId']].count()
df_j=df_naj[j_mask].resample('M')[['PdId']].count()

x = df_n.index.tolist()
line_naj=figure(title='Line graph of None Vs Adults Vs Juvenile crimes',height=500, width=750, 
                x_axis_type='datetime')
# Each subset is resampled on its own, so its month range (and length) can
# differ from df_n's; plot every line against its own index rather than
# reusing df_n's x values for all three.
line_naj.line(x, df_n['PdId'], color='Red', legend='None')
line_naj.line(df_a.index.tolist(), df_a['PdId'], color='Green', legend='Adult')
line_naj.line(df_j.index.tolist(), df_j['PdId'], color='Blue', legend='Juvenile')
line_naj.xaxis.axis_label='Date'
line_naj.yaxis.axis_label='Number of Crimes'
show(line_naj)

#### Districts vs Crime Categories

In [None]:
# The ten categories and five districts with the most recorded crimes.
top10_category=df_raw.groupby('Category')['PdId'].count().nlargest(10).index.tolist()
top5_district=df_raw.groupby('PdDistrict')['PdId'].count().nlargest(5).index.tolist()

# Crime counts with categories as rows and districts as columns.
df_catdist=df_raw[(df_raw['Category'].isin(top10_category)) & 
                  (df_raw['PdDistrict'].isin(top5_district))
                 ].groupby(['Category','PdDistrict'])['PdId'].count().unstack()

x=df_catdist.index.tolist()
catdist_line=figure(title='Top 10 crimes in the top 5 districts of SF', width=700, height=500, x_range=x)
# One colour per district column (at most five columns, see top5_district).
color=['red','blue','green','orange','gold']
for i,name in enumerate(df_catdist.columns.tolist()):
    catdist_line.line(x, df_catdist[name], color=color[i], line_width=2, legend=name)
# Bokeh interprets a numeric orientation as radians, so a 45 degree tilt is
# pi/4 (the literal 45 would be taken as 45 radians).
catdist_line.xaxis.major_label_orientation=np.pi/4
catdist_line.yaxis.formatter=NumeralTickFormatter(format='0,0')
show(catdist_line)