In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

# Apply the base style first, then layer the explicit overrides on top of it.
# NOTE(review): assumes the 'ggplot' style sheet does not set figure.figsize -- confirm.
plt.style.use('ggplot')
plt.rcParams.update({"figure.figsize": (12, 8), 'font.size': 12})

In [None]:
# NOTE(review): the directory name is spelled 'sanfranciso' in the original path and is
# kept verbatim -- presumably it matches the dataset slug; confirm before "fixing" it.
DATA_PATH = '../input/sanfranciso-crime-dataset/Police_Department_Incidents_-_Previous_Year__2016_.csv'

df = pd.read_csv(DATA_PATH)
df

In [None]:
# These values are in 'Location' column.
# drop(..., errors='ignore') keeps the cell re-runnable: `del df['X']` raises
# KeyError the second time the cell is executed.
df = df.drop(columns=['X', 'Y'], errors='ignore')

In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Give missing districts an explicit label so they are still counted
# (value_counts and groupby skip NaN by default).
df['PdDistrict'] = df['PdDistrict'].fillna('_MISSING_')

The first two rows have the same incident number but different descriptions.
Let's take a look at this incident:

In [None]:
# All rows recorded under incident number 120058272.
df[df['IncidntNum']==120058272]

In [None]:
# Rows that share an incident number are counted once (a NaN, if present, counts as one value).
unique_incidents = df['IncidntNum'].nunique(dropna=False)
print("Number of incidents, excluding combined: ", unique_incidents)

# NOTE(review): the CSV is read without parse_dates, so min/max compare the Date
# column in whatever dtype read_csv inferred (likely strings) -- confirm the
# ordering is chronological.
first_date, last_date = df['Date'].min(), df['Date'].max()
print("Start date: {}, end date: {}".format(first_date, last_date))

# What

**Incident types:**

In [None]:
# Incident counts per category, most frequent first.
df.Category.value_counts()

In [None]:
def hbar(data, textdist, p=None, col='r', top=15):
    """
    Horizontal bar chart of ``data`` with a value label next to the bars.

    Parameters:
    data: object exposing ``.plot.barh`` (e.g. a pandas Series of counts).
        The last rows end up as the last patches, which are the ones labelled.
    textdist: horizontal gap, in x-axis units, between a bar's end and its label.
    p: percent if p=1 else num. With ``p=1`` each label is the bar's share of
        the sum of ALL bars (not only the labelled ones); otherwise the label
        is the bar's value truncated to an integer.
    col: bar colour (default 'r', the previously hard-coded value).
    top: how many bars to label, counted from the end of ``data`` (default 15).

    Returns nothing, so a notebook cell ending in ``hbar(...)`` shows only the figure.
    """
    ax = data.plot.barh(color=col)

    # Work on the returned Axes instead of pyplot's "current axes" state, and
    # widen the x-axis by 5% so labels next to the longest bars are not clipped.
    xmin, xmax = ax.get_xlim()
    ax.set_xlim(xmin, xmax + 0.05 * (xmax - xmin))

    bars = list(ax.patches)
    total = sum(bar.get_width() for bar in bars)

    # bars[-0:] would select every bar, so a non-positive `top` is handled explicitly.
    labelled = bars[-top:] if top > 0 else []

    for bar in labelled:
        width = bar.get_width()
        if p == 1:
            # Guard against an all-zero series instead of dividing by zero.
            share = (width / total) * 100 if total else 0.0
            label = str(round(share, 2)) + '%'
        else:
            label = str(int(width))

        # Anchor the label at the bar's vertical centre rather than its lower edge.
        ax.text(width + textdist, bar.get_y() + bar.get_height() / 2, label,
                color='black', va='center')
                    

In [None]:
# value_counts() is descending; reversing it puts the most frequent categories last,
# which are the bars hbar labels. p=1 -> labels show the share of all incidents.
hbar(df.Category.value_counts()[::-1], 400, p=1)

# When

**The worst day of the week:**

In [None]:
def bbar(data, textdist, col, p=None):
    """
    Vertical bar chart of ``data`` with a value label above every bar.

    Parameters:
    data: object exposing ``.plot.bar`` (e.g. a pandas Series of counts).
    textdist: vertical gap, in y-axis units, between a bar's top and its label.
    col: bar colour.
    p: percent if p=1 else num. With ``p=1`` each label is the bar's share of
        the sum of all bars; otherwise the label is the bar's value truncated
        to an integer.

    Returns nothing, so a notebook cell ending in ``bbar(...)`` shows only the figure.
    """
    ax = data.plot.bar(color=col)

    # Work on the returned Axes instead of pyplot's "current axes" state, and
    # add 5% headroom so the labels above the tallest bars are not clipped.
    ymin, ymax = ax.get_ylim()
    ax.set_ylim(ymin, ymax + 0.05 * (ymax - ymin))

    bars = list(ax.patches)
    total = sum(bar.get_height() for bar in bars)

    for bar in bars:
        height = bar.get_height()
        if p == 1:
            # Guard against an all-zero series instead of dividing by zero.
            share = (height / total) * 100 if total else 0.0
            label = str(round(share, 2)) + '%'
        else:
            label = str(int(height))

        # Centre the label over the bar rather than starting it at the bar's left edge.
        ax.text(bar.get_x() + bar.get_width() / 2, height + textdist, label,
                fontsize=12, color='black', ha='center')

In [None]:
# Share of incidents per weekday, most frequent first (value_counts order).
bbar(df['DayOfWeek'].value_counts(), 400, col = 'lightskyblue', p=1)

**Parts of the Day:**

- Morning     4 am to 12 pm (noon)

Early morning    4 to 7 am
Late morning     11 am to 12pm
 

- Afternoon     12 pm to 5 pm

Early afternoon   1 to 3pm
Late afternoon    4 to 5pm
 

- Evening     5 pm to 11 pm

Early evening   5 to 7 pm

- Night         11 pm to 4 am

In [None]:
# The ten most frequent values of the Time column.
df.Time.value_counts().head(10)

In [None]:
# Time values are strings; the hour is the part before the first ':'.
sample_time = df['Time'].loc[0]
print(sample_time, ' -> ', sample_time.split(':')[0])

In [None]:
def daytime(x):
    """
    Map a time string to its part-of-day label.

    Only the text before the first ':' (the hour) is inspected, and hours are
    matched as two-character strings ('04', '17', ...). If the hour matches no
    bucket, that hour text is returned unchanged instead of a label.
    """
    hour = x.split(':')[0]

    buckets = (
        ('Early Morning', ('04', '05', '06', '07')),
        ('Morning', ('08', '09', '10', '11')),
        ('Afternoon', ('12', '13', '14', '15', '16')),
        ('Early Evening', ('17', '18')),
        ('Evening', ('19', '20', '21', '22')),
        ('Night', ('23', '00', '01', '02', '03')),
    )

    for label, hours in buckets:
        if hour in hours:
            return label

    # No bucket matched: hand back the hour text as-is.
    return hour

In [None]:
# Replace each time string with its part-of-day label. Re-running is harmless:
# daytime() returns text it does not recognise (such as an existing label) unchanged.
df['Time'] = df['Time'].map(daytime)

In [None]:
# Share of incidents per part of day (Time now holds the daytime() labels).
bbar(df.Time.value_counts(), 400, 'lightskyblue', p=1)

Is ```12pm - 5pm``` the most dangerous time? Let's check which types of crime occur during this part of the day.

In [None]:
# Category counts for incidents in the 'Afternoon' bucket, least frequent first.
afternoon_categories = df.loc[df['Time'] == 'Afternoon', 'Category']
hbar(afternoon_categories.value_counts()[::-1], 100)

Perhaps this is because people are more likely to use public transport during the day, where larceny and theft often occur. Shoplifting also falls into this category.

In [None]:
# Category counts for incidents in the 'Evening' bucket, least frequent first.
evening_categories = df.loc[df['Time'] == 'Evening', 'Category']
hbar(evening_categories.value_counts()[::-1], 100)

The percentage of ```vehicle thefts``` has increased significantly compared to the afternoon.

In [None]:
# Category counts for incidents in the 'Night' bucket, least frequent first.
night_categories = df.loc[df['Time'] == 'Night', 'Category']
hbar(night_categories.value_counts()[::-1], 100)

Night is a favorite time of ```burglars```.

All in all, ```12pm-11pm``` is the time with the most crimes. The results also depend on how the times of day are defined.

**Day of week and daytime:**

In [None]:
# Incident counts per weekday (rows) and part of day (columns).
weekday_order = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
daypart_order = ['Early Morning', 'Morning', 'Afternoon','Early Evening','Evening','Night']

g = df.groupby('DayOfWeek')['Time'].value_counts().unstack(fill_value=0)
g = g.reindex(index=weekday_order, columns=daypart_order)

# Integer grid coordinates: one x position per part of day, one y position per weekday.
x = np.arange(len(daypart_order))
y = np.arange(len(weekday_order))
X, Y = np.meshgrid(x, y)

fig, ax = plt.subplots()

# NOTE(review): contourf interpolates between grid points, so only the values at
# the tick intersections are measured counts; the bands in between are not.
cs = ax.contourf(X, Y, g.values, cmap='viridis')
fig.colorbar(cs)

ax.set_xticks(x)
ax.set_xticklabels(g.columns.values, rotation=90)
ax.set_yticks(y)
ax.set_yticklabels(g.index.values)

plt.show()

**Max/min day**

In [None]:
# Non-null counts per date; the IncidntNum column serves as the per-day tally.
w = df.groupby('Date').count()

# Date(s) with the highest tally.
is_busiest = w['IncidntNum'] == w['IncidntNum'].max()
w[is_busiest]

In [None]:
# Date(s) with the lowest tally in the per-date count table `w`.
is_quietest = w['IncidntNum'] == w['IncidntNum'].min()
w[is_quietest]

**Most crimes were committed on January 1; December 25 had the fewest.**

# Where

**Most dangerous districts:**

In [None]:
# Plot without "_MISSING_". Drop the placeholder by label: slicing off the last row
# ([:-1]) only removes it while '_MISSING_' happens to be the rarest value, and
# would silently hide a real district otherwise (or when nothing was missing).
district_counts = df.PdDistrict.value_counts().drop('_MISSING_', errors='ignore')
bbar(district_counts, 400, col='indigo', p=1)

The ```southern``` district has the highest number of incidents.

In [None]:
# Category counts for incidents in the SOUTHERN district, least frequent first.
southern_categories = df.loc[df['PdDistrict'] == 'SOUTHERN', 'Category']
hbar(southern_categories.value_counts()[::-1], 200)

In [None]:
# Incident counts per value of the Time column within the SOUTHERN district.
southern_dayparts = df.loc[df['PdDistrict'] == 'SOUTHERN', 'Time']
bbar(southern_dayparts.value_counts(), 200, col='indigo')

**Each category and its "favourite" district (or one of its favourites):**

In [None]:
# Preview of the category x district count table (first five categories).
df.groupby('Category')['PdDistrict'].value_counts().unstack(fill_value=0).head()

In [None]:
# Category x district incident counts, built once and reused below
# (previously this table was rebuilt on every loop iteration).
cat_by_dist = df.groupby('Category')['PdDistrict'].value_counts().unstack(fill_value=0)

# Column position -> district name; kept in case a later cell still reads `dists`.
dists = dict(enumerate(cat_by_dist.columns))

# District with the most incidents for each category. idxmax returns the first
# column holding the row maximum, so ties resolve to the first district in
# column order -- the same choice the positional argmax made.
top_district = cat_by_dist.idxmax(axis=1)

# Keep categories in order of first appearance in df.
catdist = {category: top_district[category] for category in df.Category.unique()}

catdist

In [None]:
import plotly.express as px

# One row per (Category, PdDistrict) pair with its number of non-null IncidntNum entries.
category_district_counts = (
    df.groupby(['Category', 'PdDistrict'])['IncidntNum']
    .count()
    .reset_index()
)

# Inner ring: category; outer ring: district; wedge size: incident count.
fig = px.sunburst(
    category_district_counts,
    path=['Category', 'PdDistrict'],
    values='IncidntNum',
    height=700,
)
fig.show()

In most cases, the largest number of crimes occurs in the ```Southern``` district. However, some types of crime occur more often in other districts: ```arson``` and ```vehicle theft``` are more common in the ```Bayview``` district.