# Importing dataset and libraries

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

def _read_crimes_csv(path):
    """Read one crimes CSV file, skipping malformed rows.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of ``on_bad_lines``. The current keyword is tried first and the
    legacy one is used only when pandas rejects the new keyword.
    """
    try:
        return pd.read_csv(path, on_bad_lines="skip")
    except TypeError:
        # Older pandas does not know ``on_bad_lines`` and raises TypeError.
        return pd.read_csv(path, error_bad_lines=False)


# One CSV per period; the three frames are stacked row-wise.
dataset = pd.concat(
    [
        _read_crimes_csv("../input/crimes-in-chicago/Chicago_Crimes_2005_to_2007.csv"),
        _read_crimes_csv("../input/crimes-in-chicago/Chicago_Crimes_2008_to_2011.csv"),
        _read_crimes_csv("../input/crimes-in-chicago/Chicago_Crimes_2012_to_2017.csv"),
    ]
)
dataset.info()

Initial dataset shape.

In [None]:
# Report the (rows, columns) size of the combined frame, then preview it.
print('Dataset shape ->', dataset.shape)
dataset.head(n=5)

Number of missing values per column.

In [None]:
# Count the missing (NaN) entries in each column of the raw dataset.
dataset.isna().sum()

Selecting a few columns to be used during the exploratory data analysis.

Removing the year 2017 because its data is incomplete.

In [None]:
# Keep every row whose Year is not 2017, then discard the columns that the
# exploratory analysis does not use.
data_ead = dataset.loc[dataset["Year"] != 2017].drop(
    columns=[
        "Unnamed: 0",
        "ID",
        "IUCR",
        "Beat",
        "Case Number",
        "District",
        "Ward",
        "Updated On",
        "Year",
        "Community Area",
        "X Coordinate",
        "Y Coordinate",
        "Latitude",
        "Longitude",
        "Location",
        "FBI Code",
    ]
)

Converting the Date column to datetime...

In [None]:
# Parse the 12-hour timestamp strings and use them as the frame's index so
# the data can be resampled by time later on.
data_ead["Date"] = pd.to_datetime(data_ead["Date"], format="%m/%d/%Y %I:%M:%S %p")
data_ead = data_ead.set_index("Date")

## Number of cases by type

In this part, let's try to identify the trend and its high points and low points.

### Primary Type

In [None]:
# Function to plot the serie value_counts
def plot_counts(serie, title, top_n=15):
    """Show a bar chart with the most frequent values of ``serie``.

    Parameters
    ----------
    serie : pandas.Series
        Values to be counted.
    title : str
        Title placed on the figure.
    top_n : int, default 15
        How many of the most frequent values are plotted.
    """
    # value_counts() is sorted by descending frequency, so head() keeps the
    # ``top_n`` most common values.
    df = pd.DataFrame(serie.value_counts().head(top_n))
    df.columns = ["Freq"]
    df["Type"] = df.index
    fig = px.bar(
        df,
        y="Freq",
        x="Type",
        text="Freq",
        color="Freq",
        color_continuous_scale=px.colors.sequential.Blugrn,
    )
    fig.update_traces(texttemplate="%{text:.2s}", textposition="outside")
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode="hide")
    fig.update_layout(title_text=title)
    fig.show()


# Most frequent crime categories.
plot_counts(serie=data_ead["Primary Type"], title="Kind of Crimes")

THEFT, BATTERY, CRIMINAL DAMAGE and NARCOTICS represent more than 65% of the crimes. 

### Location Description
Where do those crimes happen? 

In [None]:
# Most frequent places where the crimes were reported.
plot_counts(serie=data_ead["Location Description"], title="Location of Crimes")

STREET and SIDEWALK represent more than 1/3 of the crimes. 

## Number of cases per Year x Months
Let's see the number of crimes per Year and Months...

In [None]:
# Number of records in each calendar month (month-end bins).
aux = data_ead.resample("M").size().to_frame(name="Number of cases")
# Keep month and year as strings so they can be used as categorical labels.
aux["Month"] = aux.index.month.astype(str)
aux["Year"] = aux.index.year.astype(str)
# Zero-pad the month so the "YYYY-MM" labels sort chronologically.
aux["Year-Month"] = aux["Year"] + "-" + aux["Month"].str.zfill(2)
aux.head()

### by Heatmap

In [None]:
# Reshape the monthly counts into a Month x Year grid. Keyword arguments are
# used for pivot() because the positional form was removed in pandas 2.0.
aux_csm = aux[["Month", "Year", "Number of cases"]]
aux_csm = (
    aux_csm.pivot(index="Month", columns="Year", values="Number of cases")
    .fillna(0)
    .astype(int)
)
# Month is stored as a string; cast it to int so the rows sort 1..12 rather
# than lexicographically ("1", "10", "11", ...).
aux_csm.index = aux_csm.index.astype(int)
aux_csm = aux_csm.sort_index()
# Replace the sorted month numbers with their abbreviated names.
aux_csm.index = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.figure(figsize=(20, 10))
_ = sns.heatmap(
    aux_csm,
    annot=True,
    fmt="d",
    linewidths=0.5,
    cmap="Blues",
).set_title('Number of crimes in chicago per Year x Months')

The years between 2006 and 2010 were a tough period: 2008 was the worst year, with a mean of 71004 crimes per month, and 2015 was the best year, with a mean of 21916 crimes per month.

### by Bars 

In [None]:
# One bar per "YYYY-MM" label, coloured by the number of cases.
fig = px.bar(
    aux,
    x="Year-Month",
    y="Number of cases",
    color="Number of cases",
    color_continuous_scale=px.colors.sequential.Blugrn,
    hover_data=["Year", "Month", "Number of cases"],
    width=1400,
    height=600,
)
fig.update_layout(title_text='Crime counts per year-month in bars')
fig.show()

A similar view of the data, this time using bars. Again, the 'tough time' from 2006 to 2010 is visible. 



### Aggregate by Months


In [None]:
# Total number of cases per calendar month, summed over all years.
agg_months = aux.groupby(['Month'])['Number of cases'].sum().reset_index()
# Month is stored as a string; cast it to int so the rows sort 1..12.
agg_months.Month = agg_months.Month.astype(int)
agg_months.sort_values('Month', inplace=True)
# Rows are now in calendar order, so the names can be assigned positionally.
agg_months.Month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig = px.bar(
    agg_months,
    x="Month",
    y="Number of cases",
    color="Number of cases",
    text="Number of cases",
    height=600,
    width=1400,
    color_continuous_scale=px.colors.sequential.Blugrn,
)
fig.update_traces(texttemplate="%{text:.2s}", textposition="outside")
fig.update_layout(title_text="Crime counts per year-month in stacked bars (Month)")
fig.show()

It looks like crimes tend to increase during the summer in Chicago.

### Aggregate by Year

In [None]:
# Total number of cases per year, drawn as one bar per year.
fig = px.bar(
    aux.groupby('Year')['Number of cases'].sum().reset_index(),
    x="Year",
    y="Number of cases",
    text="Number of cases",
    color="Number of cases",
    color_continuous_scale=px.colors.sequential.Blugrn,
    width=1400,
    height=600,
)
fig.update_traces(textposition="outside", texttemplate="%{text:.2s}")
fig.update_layout(title_text="Crime counts per year-month in stacked bars (Years)")
fig.show()

Aggregated by year, the bars again show the 'tough time' from 2006 to 2010. 


## Map plot

Is there a specific area or location where crimes tend to happen? 

In [None]:
# Take an explicit copy so the Year column is added to an independent frame
# rather than to a selection of `dataset` (avoids chained assignment).
map_marks = dataset[['Latitude', 'Longitude']].copy()
# Year is derived from the parsed Date string of each record.
map_marks['Year'] = pd.to_datetime(dataset.Date, format="%m/%d/%Y %I:%M:%S %p").dt.year
# Rows without coordinates (or without a year) cannot be placed on the map.
map_marks = map_marks.dropna()
map_marks.head()

In [None]:
# Plot a random sample of 5000 of the 2016 points on an OpenStreetMap
# background, with the figure margins removed.
fig = px.scatter_mapbox(
    map_marks.loc[map_marks["Year"] == 2016].sample(n=5000),
    lat="Latitude",
    lon="Longitude",
    color_discrete_sequence=["fuchsia"],
    zoom=9,
    height=650,
)
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()


It looks like there is crime everywhere in Chicago. The centre shows more points, but it is also a more crowded place.

## Arrested?
What is the arrest rate per crime type?

In [None]:
# Mean of the Arrest flag per crime type, i.e. the share of records of that
# type that have Arrest set.
aux = data_ead.groupby('Primary Type')['Arrest'].mean().reset_index()
# Scale to percent first and round afterwards: rounding before multiplying
# by 100 reintroduces float noise such as 12.340000000000002.
aux['Arrest (%)'] = (aux['Arrest'] * 100).round(2)
aux.sort_values('Arrest', inplace=True)
fig = px.bar(
    aux,
    orientation='h',
    x="Arrest (%)",
    y="Primary Type",
    color="Arrest (%)",
    text="Arrest (%)",
    height=800,
    color_continuous_scale=px.colors.sequential.Blugrn,
)
fig.update_traces(texttemplate="%{text:.4s}", textposition="outside")
fig.update_layout(title_text="Arrested Rate in % per Type Crime in Chicago.")

fig.show()

The “VIOLATION” crimes show a high arrest rate, since the officer must be present at the location to report the violation. 

# Model - Forecast

Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

In [None]:
# Time-series view of the full dataset: drop the columns not needed for the
# forecast, then index the remaining rows by their parsed timestamp.
data_time = dataset.drop(
    columns=[
        "Unnamed: 0",
        "ID",
        "IUCR",
        "Beat",
        "Case Number",
        "District",
        "Ward",
        "Updated On",
        "Year",
        "Community Area",
        "X Coordinate",
        "Y Coordinate",
        "Latitude",
        "Longitude",
        "Location",
        "FBI Code",
    ]
)
data_time["Date"] = pd.to_datetime(data_time["Date"], format="%m/%d/%Y %I:%M:%S %p")
data_time = data_time.set_index("Date")
data_time.sample(n=3)

In [None]:
# Monthly record counts in the two-column layout Prophet expects:
# "ds" holds the month-end timestamp and "y" the number of records.
data_model = data_time.resample("M").size().rename_axis("ds").reset_index(name="y")
print(data_model.shape)
data_model.head()

### What is the period?

First of all, let's plot the whole series to see the data behaviour.

In [None]:
# Line chart of the monthly counts over the whole period.
fig = px.line(data_model, x="ds", y="y")
fig.update_layout(
    title="Chicago crimes from 2005 to 2016 (counted per Month).",
    xaxis_title="Date",
    yaxis_title="Crimes",
    font={"size": 14},
)

This time series contains the total number of crimes per month.

### Modeling

In [None]:
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics
from fbprophet.plot import plot_plotly, plot_components_plotly
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
import itertools


def create_param_combinations(**param_dict):
    """Return a DataFrame with one row per combination of the given values.

    Each keyword argument names a parameter and supplies the iterable of
    candidate values for it. The result has one column per keyword (in the
    order given) and one row for every element of their Cartesian product.
    """
    column_names = list(param_dict)
    combinations = list(itertools.product(*param_dict.values()))
    return pd.DataFrame(combinations, columns=column_names)


def train_return_results(params, train_start=75, horizon=6):
    """Fit Prophet with ``params`` and score it on a hold-out tail.

    The model is trained on ``data_model[train_start:-horizon]`` and then
    forecasts ``horizon`` further monthly periods, which are compared with the
    last ``horizon`` observed values of ``data_model``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``Prophet``.
    train_start : int, default 75
        Positional index of the first ``data_model`` row used for training.
    horizon : int, default 6
        Number of trailing rows held out for evaluation.

    Returns
    -------
    float
        Root mean squared error of the forecast on the held-out rows.

    Raises
    ------
    ValueError
        If ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        # ``[-0:]`` would select the whole series rather than an empty tail.
        raise ValueError("horizon must be at least 1")

    model = Prophet(**params)
    model.fit(data_model[train_start:-horizon])

    future = model.make_future_dataframe(periods=horizon, freq="M")
    forecast = model.predict(future)

    mse = mean_squared_error(data_model["y"][-horizon:], forecast["yhat"][-horizon:])
    # Callers store this score as an RMSE, so take the square root; the
    # ranking of parameter sets is the same as with the plain MSE.
    return float(np.sqrt(mse))


# Candidate values for each Prophet hyper-parameter explored by the search.
param_grid = dict(
    changepoint_prior_scale=[0.005, 0.05, 0.5, 5],
    changepoint_range=[0.8, 0.9],
    seasonality_prior_scale=[0.1, 1, 10.0],
    holidays_prior_scale=[0.1, 1, 10.0],
    seasonality_mode=["multiplicative", "additive"],
    # "growth": ["linear", "logistic"],
    yearly_seasonality=[5, 10, 20],
)

# One row per combination of the values above.
all_params = create_param_combinations(**param_grid)

# Score every combination with train_return_results, one joblib task per
# row, using all available cores.
rmses = Parallel(n_jobs=-1, verbose=10)(
    delayed(train_return_results)(dict(zip(all_params.columns, row)))
    for row in all_params.to_numpy()
)

In [None]:
# Attach each combination's score, then keep the parameters of the row with
# the lowest score (the score itself is removed from the resulting dict).
all_params["rsme"] = rmses
best_paramers = dict(all_params.sort_values(by="rsme").iloc[0].drop(labels="rsme"))
print(best_paramers)

In [None]:
# Refit with the selected parameters on every row from position 75 onwards.
model = Prophet(**best_paramers)
model.fit(data_model.iloc[75:])

# Extend the timeline by five monthly periods and predict over all of it.
future = model.make_future_dataframe(freq="M", periods=5)
forecast = model.predict(future)

## Result plot

In [None]:
# Plot the fitted model and its forecast with Prophet's Plotly helper.
plot_plotly(model, forecast)

The model fitted this part of the series well: the seasonality is clear in the graph, and we can also see a decreasing trend.

In [None]:
# Plot the forecast components with Prophet's Plotly helper.
plot_components_plotly(model, forecast)

There is a downward trend, and the seasonality is strong in the first part of the year.

In [None]:
# Last six forecast rows: point estimate and uncertainty bounds, rounded.
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(6).round()

In [None]:
# Summarise the furthest forecast point (the last row of the forecast).
last_forecast = forecast.iloc[-1]
# Take the month label from the forecast date itself instead of hard-coding
# it, so the sentence stays correct if the data or the horizon changes.
forecast_month = f"{last_forecast['ds'].month_name()} {last_forecast['ds'].year}"
yhat = str(int(last_forecast['yhat']))
yhat_lower = str(int(last_forecast['yhat_lower']))
yhat_upper = str(int(last_forecast['yhat_upper']))
print(
    f"Following the model, for {forecast_month} we have {yhat} crimes and "
    f"considering the error margin it's possible to have a value between "
    f"{yhat_lower} ~ {yhat_upper}."
)