### Import the libraries.

In [None]:
import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium import plugins
import plotly.express as px

from sklearn.neighbors import KNeighborsClassifier

from wordcloud import WordCloud

!pip install alphashape
import alphashape

%matplotlib inline
sns.set()

### Get the DataFrame

In [None]:
# Timestamp layout passed to the parser for the OCCURRED_ON_DATE column.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'


def dateparse(x):
    """Parse one timestamp string laid out as DATE_FORMAT into a datetime.

    No longer used for loading; kept in case a later cell still calls it.
    """
    return datetime.datetime.strptime(x, DATE_FORMAT)


# `date_parser=` is deprecated since pandas 2.0, so the column is read as text
# and converted in one vectorised call with an explicit format instead.
data = pd.read_csv('../input/crimes-in-boston/crime.csv', encoding='latin-1')
data['OCCURRED_ON_DATE'] = pd.to_datetime(data['OCCURRED_ON_DATE'], format=DATE_FORMAT)

### Let's look at the data

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(5)

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

Let's count the null values in each column and look at the summary statistics.

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Descriptive statistics (count, mean, spread, quartiles) for the numeric columns.
data.describe()

Let's look at the number of unique values in each column.

In [None]:
# Report how many distinct values each column holds.
for col_name in data.columns:
    n_distinct = len(data[col_name].unique())
    print(f'{col_name}: {n_distinct}')

Let's look at the distinct values of a few columns: district, shooting flag, year and UCR part.

In [None]:
# Print the distinct values of a handful of columns.
columns_to_inspect = ('DISTRICT', 'SHOOTING', 'YEAR', 'UCR_PART')
for col_name in columns_to_inspect:
    distinct_values = data[col_name].unique()
    print(f'{col_name}: {distinct_values}')

### Data cleanup

Drop the unwanted columns.

In [None]:
# Columns that the analysis shown in this notebook does not use.
unused_columns = ['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_DESCRIPTION', 'Location']
data.drop(columns=unused_columns, inplace=True)

Rename the columns to make the data more legible.

In [None]:
# Mapping from the original column names to the shorter names used below.
rename = {'OFFENSE_CODE_GROUP': 'Group',
          'DISTRICT': 'District',
          'REPORTING_AREA': 'Area',
          'SHOOTING': 'If_shooting',
          'OCCURRED_ON_DATE': 'Date',
          'YEAR': 'Year',
          'MONTH': 'YMonth',
          'DAY_OF_WEEK': 'WDay',
          'HOUR': 'DHour',
          'UCR_PART': 'UCR_part',
          'STREET': 'Street',
          'Long': 'Lon'}

# Only the column labels need changing. The earlier `index=str` argument also
# passed every row label through str(), turning the integer index into strings.
data = data.rename(columns=rename)

Let's turn the weekday into an ordered categorical so that it sorts from Monday to Sunday.

In [None]:
# Weekday names in calendar order; the ordered categorical makes sorting and
# grouping follow this order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data['WDay'] = pd.Categorical(data['WDay'], categories=weekday_order, ordered=True)

Fix the coordinates: replace the -1 placeholder values with missing values.

In [None]:
# Turn the -1 placeholder into a real missing value (NaN) so the columns stay numeric.
# `replace(-1, None)` is avoided on purpose: depending on the pandas version it either
# forward-fills the previous row's value or stores None objects. The in-place call on
# `data.Lat` also acted on an intermediate Series that is not written back to the frame
# under copy-on-write, so the result is assigned back explicitly.
for coord in ('Lat', 'Lon'):
    data[coord] = data[coord].replace(-1, np.nan)

Derive the day-of-year and day-of-month columns from the date.

In [None]:
# Calendar components taken from the incident timestamp.
occurred = data['Date'].dt
data['YDay'] = occurred.dayofyear  # day of the year
data['Mday'] = occurred.day        # day of the month

Here's what the cleaned-up data looks like.

In [None]:
# Preview the first five rows after the cleanup steps.
data.head(5)

In [None]:
# Column labels after renaming and adding the derived date columns.
data.columns

### Data Visualisation

#### Districts data

In [None]:
# Keep only the rows that have a district label and both coordinates.
district_columns = ['District', 'Lat', 'Lon']
data_counties = data[district_columns].dropna()

Let's fit a nearest-neighbours classifier that predicts the district from the coordinates.

In [None]:
# Fit a k-nearest-neighbours model (k=500, all CPU cores) mapping coordinates to district.
coords_train = data_counties[['Lat', 'Lon']]
labels_train = data_counties['District']

neigh = KNeighborsClassifier(n_jobs=-1, n_neighbors=500)
neigh.fit(coords_train, labels_train)

Assign each point the district predicted by the classifier.

In [None]:
# Overwrite each row's district with the label the fitted model predicts for its coordinates.
predicted_district = neigh.predict(data_counties[['Lat', 'Lon']])
data_counties['District'] = predicted_district

Build the district borders: one GeoJSON polygon per district, taken from the alpha shape of its points.

In [None]:
# Group by a scalar key rather than a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples such as ('A1',) when iterating, and those would no
# longer match the plain 'District' values used as map locations.
district_groups = data_counties.groupby('District')
geojson = {'type': 'FeatureCollection', 'features': []}

for district, data_district in district_groups:
    # The alpha parameter is scaled with the square root of the district's point count.
    alpha = np.sqrt(data_district.shape[0]) * 1.5
    # Points are passed as (Lon, Lat) pairs.
    outline = alphashape.alphashape(data_district[['Lon', 'Lat']].values, alpha=alpha)
    # NOTE(review): assumes the result is a single polygon exposing `.exterior` - confirm.
    hull_curr = list(outline.exterior.coords)
    geojson['features'].append({'type': 'Feature',
                                'geometry': {
                                    'type': 'Polygon',
                                    'coordinates': [hull_curr]
                                },
                                'properties': {'district': district}})

Let's look at the resulting borders on a map.

In [None]:
# The polygons come from `geojson` and are matched on the district label, so one row
# per district is enough. Passing every incident row only inflates the figure payload.
districts_unique = data_counties[['District']].drop_duplicates()

fig = px.choropleth_mapbox(districts_unique, geojson=geojson, color='District',
                           locations='District', featureidkey='properties.district',
                           center={'lat': 42.315, 'lon': -71.1},
                           mapbox_style='carto-positron', zoom=10.5,
                           opacity=0.5)
# Remove the margins around the map.
fig.update_layout(margin={'r': 0, 't': 0, 'l': 0, 'b': 0})
fig.show()

#### Distributions of crimes

In [None]:
# Bar chart drawing helper
def bar_chart(x_vals, y_vals, title=None, x_label=None, y_label=None, if_plot_vals=False):
    """Draw a 12x8 bar chart with one bar per entry and show it.

    Args:
        x_vals: tick labels, one per bar.
        y_vals: bar heights, in the same order as ``x_vals``.
        title: figure title; skipped when falsy.
        x_label: x-axis label; skipped when falsy.
        y_label: y-axis label; skipped when falsy.
        if_plot_vals: when true, write each bar's value at the top of the bar.
    """
    positions = np.arange(len(x_vals))

    plt.figure(figsize=(12, 8))
    plt.bar(positions, y_vals, align='center', alpha=0.6)
    plt.xticks(positions, x_vals)

    # Apply only the captions that were actually supplied.
    captions = ((plt.title, title), (plt.xlabel, x_label), (plt.ylabel, y_label))
    for apply_caption, caption in captions:
        if caption:
            apply_caption(caption)

    if if_plot_vals:
        for bar_pos, bar_height in zip(positions, y_vals):
            plt.text(bar_pos, bar_height, bar_height, ha='center')

    plt.show()

All crimes by year

In [None]:
# Number of incidents per year, as a two-column frame (Year, Counts).
data_year = data.groupby('Year').size().reset_index(name='Counts')

bar_chart(x_vals=data_year['Year'],
          y_vals=data_year['Counts'],
          title='All crimes each year',
          x_label='Year',
          y_label='Counts')

In [None]:
# Restrict the following plots to the incidents recorded in 2016.
is_2016 = data['Year'] == 2016
data_2016 = data.loc[is_2016]

In [None]:
# Month number -> three-letter abbreviation used for the tick labels.
month_abbr = dict(enumerate(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], start=1))

data_month = data_2016.groupby(['YMonth']).size().reset_index(name='Counts')
# Assign the relabelled column back explicitly: an in-place replace on
# `data_month.YMonth` works on an intermediate Series and is not reliably
# propagated to the frame (it is a no-op under copy-on-write).
data_month['YMonth'] = data_month['YMonth'].map(month_abbr)

bar_chart(data_month.YMonth, data_month.Counts, 'Crimes each month (2016)', 'Month', 'Counts')

In [None]:
# Number of 2016 incidents for each day of the year, drawn as a line.
data_yday = data_2016.groupby('YDay').size().reset_index(name='Counts')

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=data_yday, x='YDay', y='Counts', ax=ax)
ax.set_title('Crimes each day (2016)')
ax.set_xlabel('Day')

In [None]:
# WDay is a categorical column. Pin observed=False so that every weekday category
# keeps a bar (even with zero incidents) instead of depending on the pandas default,
# which is deprecated and scheduled to change.
data_wday = data_2016.groupby('WDay', observed=False).size().reset_index(name='Counts')

bar_chart(data_wday.WDay, data_wday.Counts, 'Crimes each week day (2016)', 'Day', 'Count')

In [None]:
# Number of 2016 incidents for each hour of the day.
data_hour = data_2016.groupby('DHour').size().reset_index(name='Counts')

bar_chart(x_vals=data_hour['DHour'],
          y_vals=data_hour['Counts'],
          title='Crimes each hour (2016)',
          x_label='Hour',
          y_label='Count')

In [None]:
# Horizontal count plot of the 2016 incidents per UCR part, most frequent first.
ucr_part_order = data_2016['UCR_part'].value_counts().index
sns.catplot(data=data_2016, y='UCR_part', kind='count',
            order=ucr_part_order, height=7, aspect=1.5)

In [None]:
# Horizontal count plot of the 2016 incidents per district, most frequent first.
district_order = data_2016['District'].value_counts().index
sns.catplot(data=data_2016, y='District', kind='count',
            order=district_order, height=8, aspect=1.5)