In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import random
import tqdm
import seaborn as sns
from keras.utils.vis_utils import plot_model
from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import LSTM
from keras.layers import TimeDistributed
from fbprophet import Prophet 

# **Reading Data**

In [None]:
# Load the four CSV extracts; the file names indicate the year range each one covers.
# error_bad_lines=False is passed so that malformed rows do not abort the read.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour of
# on_bad_lines='skip' — confirm which pandas version this notebook targets.
chicago_df_1 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2001_to_2004.csv', error_bad_lines=False)
chicago_df_2 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2005_to_2007.csv', error_bad_lines=False)
chicago_df_3 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2008_to_2011.csv', error_bad_lines=False)
chicago_df_4 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2012_to_2017.csv', error_bad_lines=False)

# Neural Network Models

In [None]:
# Stack the four extracts row-wise into a single frame (each keeps its own row labels).
df = pd.concat([chicago_df_1, chicago_df_2, chicago_df_3, chicago_df_4], ignore_index=False, axis=0)
# Drop the columns listed here; every label appears exactly once, and the result is
# rebound to `df` instead of mutating the frame in place.
df = df.drop(columns=[
    'Unnamed: 0', 'Case Number', 'IUCR', 'X Coordinate', 'Y Coordinate',
    'Updated On', 'Year', 'FBI Code', 'Beat', 'Ward', 'Community Area',
    'Location', 'District', 'Latitude', 'Longitude',
])

In [None]:
# Index every record by its timestamp. The explicit format matches the one used for the
# same column in the Prophet section and avoids per-value format inference.
df.index = pd.DatetimeIndex(pd.to_datetime(df.Date, format='%m/%d/%Y %I:%M:%S %p'))
# Count the records falling in each calendar month; reset_index turns the month index
# back into a regular column.
data = df.resample('M').size().reset_index()
data

In [None]:
# Render the Date column as MM/DD/YYYY strings. Going through pd.to_datetime first keeps
# the cell re-runnable: on a second run the column already holds strings, on which the
# .dt accessor alone would fail.
data["Date"] = pd.to_datetime(data["Date"], format="%m/%d/%Y").dt.strftime("%m/%d/%Y")

In [None]:
# Give the two columns explicit names, then show the dtype of each column.
data.columns = ['Date', 'Crime Count']
data.dtypes

In [None]:
# Display the current contents of `data`.
data

In [None]:
# Turn the 'Crime Count' column into an independent 2-D array with one value per row
# (shape: number of rows x 1), then display it.
ip = np.asarray(data['Crime Count'].values).reshape(-1, 1).copy()
ip

In [None]:
def train_test_splitting(data, n_test):
    """Split a sequence into a leading train part and a trailing test part.

    Args:
        data: Any sliceable object with a length (list, NumPy array, ...).
        n_test: Number of trailing items to hold out; must be >= 0.

    Returns:
        A ``(train, test)`` tuple of slices of ``data``. If ``n_test`` is at
        least ``len(data)``, ``train`` is empty and ``test`` is all of ``data``.

    Raises:
        ValueError: If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError('n_test must be non-negative, got %r' % (n_test,))
    # Slice from an explicit split position instead of using ``-n_test``:
    # ``data[:-0]`` is empty and ``data[-0:]`` is everything, which would swap
    # the two parts when n_test == 0.
    split = max(len(data) - n_test, 0)
    return data[:split], data[split:]

# reframe a sequence as rows of lagged inputs followed by outputs
def series_to_supervised(data, n_in=1, n_out=1):
    """Build a supervised-learning matrix from a sequence.

    Each output row holds the ``n_in`` preceding observations (oldest first)
    followed by the ``n_out`` observations starting at the current step.
    Rows that would contain a missing value at either end are removed.

    Returns:
        A NumPy array (the ``.values`` of the assembled frame).
    """
    frame = pd.DataFrame(data)
    # lagged copies: t-n_in, ..., t-1
    lagged = [frame.shift(lag) for lag in range(n_in, 0, -1)]
    # current and future copies: t, t+1, ..., t+n_out-1
    ahead = [frame.shift(-step) for step in range(n_out)]
    supervised = pd.concat(lagged + ahead, axis=1).dropna()
    return supervised.values
 
# prediction quality: root mean squared error and R2
def measure_error(actual, predicted):
    """Return ``(rmse, r2)`` for ``predicted`` measured against ``actual``."""
    mse = mean_squared_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    return sqrt(mse), r2

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    """Fit once on the training part, then forecast the test part step by step.

    The module-level ``model_fit`` and ``model_predict`` that exist when this
    function is called are the ones used. After every forecast the true
    observation is appended to the history, so each prediction sees all
    earlier actual values.

    Returns:
        ``(rmse, r2, model)`` where ``model`` is whatever ``model_fit`` returned.
    """
    train, test = train_test_splitting(data, n_test)
    model = model_fit(train, cfg)

    # the history starts as the training observations
    history = list(train)
    predictions = []
    for actual in test:
        predictions.append(model_predict(model, history, cfg))
        # reveal the true value before forecasting the next step
        history.append(actual)

    rmse, r2 = measure_error(test, predictions)
    print(' RMSE: %.3f \t R2 score: %.3f' % (rmse, r2))
    return rmse, r2, model
 
# repeat evaluation of a config
def repeat_evaluate(data, config, n_test, n_repeats=30):
    """Run ``walk_forward_validation`` ``n_repeats`` times and collect the results.

    Returns:
        A list with one ``walk_forward_validation`` result per repeat.
    """
    scores = []
    for _ in range(n_repeats):
        scores.append(walk_forward_validation(data, n_test, config))
    return scores
 
# summarize model performance
def summarize_scores(name, scores):
    """Print mean and standard deviation of RMSE and R2, then box-plot both.

    Args:
        name: Label printed above the summary and used in the plot titles.
        scores: Sequence of per-run results whose first two items are the
            RMSE and the R2 score; any further items are ignored.
    """
    rmse_scores = [run[0] for run in scores]
    r2_scores = [run[1] for run in scores]
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    print('%s' % name)
    print('RMSE: %.3f (+/- %.3f)' % (rmse_mean, rmse_std))
    print('R2: %.3f (+/- %.3f)' % (r2_mean, r2_std))
    # One box-and-whisker plot per metric. Each figure is titled and labelled;
    # otherwise the two plots cannot be told apart when skimming the output.
    for metric, values in (('RMSE', rmse_scores), ('R2', r2_scores)):
        plt.boxplot(values)
        plt.title('%s: %s over %d runs' % (name, metric, len(values)))
        plt.ylabel(metric)
        plt.show()

## Multi-Layer Perceptron Model

In [None]:
# NOTE(review): no code visible in this notebook reads this module-level `model`;
# model_fit below builds its own Sequential — confirm whether this line is still needed.
model = Sequential()
# train the multi-layer perceptron
def model_fit(train, config):
    """Build and fit a dense network on lagged windows of ``train``.

    Args:
        train: Training observations, passed to ``series_to_supervised``.
        config: ``(n_input, n_nodes, n_epochs, n_batch)``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    n_input, n_nodes, n_epochs, n_batch = config
    supervised = series_to_supervised(train, n_in=n_input)
    # every column except the last is an input lag; the last column is the target
    features, target = supervised[:, :-1], supervised[:, -1]

    mlp_model = Sequential()
    for layer in (
        Dense(n_nodes, activation='relu', input_dim=n_input),
        Dense(3),
        Dense(1),
    ):
        mlp_model.add(layer)
    mlp_model.compile(loss='mse', optimizer='adam')

    mlp_model.fit(features, target, epochs=n_epochs, batch_size=n_batch, verbose=0)
    return mlp_model
 
# one-step forecast with an already fitted model
def model_predict(model, history, config):
    """Predict the next value from the last ``n_input`` entries of ``history``.

    ``config`` must have four items; only the first (``n_input``) is used.
    Returns the first row of ``model.predict``'s output.
    """
    n_input, _n_nodes, _n_epochs, _n_batch = config
    # the model takes a single row holding the n_input most recent observations
    window = np.array(history[-n_input:])
    x_input = window.reshape(1, n_input)
    forecast = model.predict(x_input, verbose=0)
    return forecast[0]


In [None]:
# define config: the MLP model_fit unpacks it as (n_input, n_nodes, n_epochs, n_batch)
config = [24, 250, 100, 100]
n_test = 12
# repeated walk-forward evaluation (repeat_evaluate's default is 30 repeats)
scores = repeat_evaluate(ip, config, n_test)
# keep the fitted model from the first repeat (third item of each result)
mlp_model = scores[0][2]
# summarize scores
summarize_scores('mlp', scores)

In [None]:
# Write a diagram of the MLP to mlp_model.png, including layer shapes and names.
plot_model(mlp_model, to_file='mlp_model.png', show_shapes=True, show_layer_names=True)

## Convolutional Neural Network Model

In [None]:
def model_fit(train, config):
    """Build and fit a 1-D convolutional network on lagged windows of ``train``.

    Args:
        train: Training observations, passed to ``series_to_supervised``.
        config: ``(n_input, n_filters, n_kernel, n_epochs, n_batch)``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    n_input, n_filters, n_kernel, n_epochs, n_batch = config
    supervised = series_to_supervised(train, n_in=n_input)
    features, target = supervised[:, :-1], supervised[:, -1]
    # append a trailing axis of length 1 to match input_shape=(n_input, 1)
    features = features.reshape(features.shape + (1,))

    def conv(**extra):
        # all three convolutions share filters, kernel size and activation
        return Conv1D(filters=n_filters, kernel_size=n_kernel, activation='relu', **extra)

    model = Sequential()
    for layer in (
        conv(input_shape=(n_input, 1)),
        conv(),
        MaxPooling1D(pool_size=2),
        conv(),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(1),
    ):
        model.add(layer)
    model.compile(loss='mse', optimizer='adam')

    model.fit(features, target, epochs=n_epochs, batch_size=n_batch, verbose=0)
    return model

# one-step forecast with an already fitted model
def model_predict(model, history, config):
    """Predict the next value from the last ``n_input`` entries of ``history``.

    ``config`` must have five items; only the first (``n_input``) is used.
    Returns the first row of ``model.predict``'s output.
    """
    n_input, _n_filters, _n_kernel, _n_epochs, _n_batch = config
    # shape the window as (1, n_input, 1) before handing it to the model
    window = np.array(history[-n_input:])
    x_input = window.reshape((1, n_input, 1))
    forecast = model.predict(x_input, verbose=0)
    return forecast[0]

In [None]:
n_test = 12
# define config: the CNN model_fit unpacks it as (n_input, n_filters, n_kernel, n_epochs, n_batch)
config = [24, 256, 3, 100, 100]
# repeated walk-forward evaluation (repeat_evaluate's default is 30 repeats)
scores = repeat_evaluate(ip, config, n_test)
# keep the fitted model from the first repeat (third item of each result)
cnn_model = scores[0][2]
# summarize scores
summarize_scores('cnn', scores)

In [None]:
# Write a diagram of the CNN to cnn_model.png, including layer shapes and names.
plot_model(cnn_model, to_file='cnn_model.png', show_shapes=True, show_layer_names=True)

## Recurrent Neural Network Model with LSTM

In [None]:
# lagged differences of a sequence
def difference(data, interval):
    """Return ``data[i] - data[i - interval]`` for every ``i`` from ``interval`` up to the end."""
    deltas = []
    for idx in range(interval, len(data)):
        deltas.append(data[idx] - data[idx - interval])
    return deltas
 
# train the LSTM network
def model_fit(train, config):
    """Build and fit an LSTM network on lagged windows of ``train``.

    Args:
        train: Training observations.
        config: ``(n_input, n_nodes, n_epochs, n_batch, n_diff)``. When
            ``n_diff`` is positive the series is differenced at that lag
            before the windows are built.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    n_input, n_nodes, n_epochs, n_batch, n_diff = config
    if n_diff > 0:
        train = difference(train, n_diff)
    supervised = series_to_supervised(train, n_in=n_input)
    features, target = supervised[:, :-1], supervised[:, -1]
    # append a trailing axis of length 1 to match input_shape=(n_input, 1)
    features = features.reshape(features.shape + (1,))

    model = Sequential()
    for layer in (
        LSTM(n_nodes, activation='relu', input_shape=(n_input, 1)),
        Dense(n_nodes, activation='relu'),
        Dense(n_nodes, activation='relu'),
        Dense(1),
    ):
        model.add(layer)
    model.compile(loss='mse', optimizer='adam')

    model.fit(features, target, epochs=n_epochs, batch_size=n_batch, verbose=0)
    return model
 
# one-step forecast with an already fitted model
def model_predict(model, history, config):
    """Predict the next value from the last ``n_input`` entries of ``history``.

    ``config`` must have five items; the first (``n_input``) and the last
    (``n_diff``) are used. When ``n_diff`` is positive, ``history`` is
    differenced at that lag before the window is taken, and the observation
    ``n_diff`` steps back is added to the model output.
    """
    n_input, _n_nodes, _n_epochs, _n_batch, n_diff = config
    correction = 0.0
    if n_diff > 0:
        # remember the raw observation n_diff steps back before differencing
        correction = history[-n_diff]
        history = difference(history, n_diff)
    window = np.array(history[-n_input:]).reshape((1, n_input, 1))
    forecast = model.predict(window, verbose=0)
    return correction + forecast[0]

In [None]:
n_test = 12
# define config: the LSTM model_fit unpacks it as (n_input, n_nodes, n_epochs, n_batch, n_diff)
config = [24, 50, 100, 100, 12]
# repeated walk-forward evaluation (repeat_evaluate's default is 30 repeats)
scores = repeat_evaluate(ip, config, n_test)
# keep the fitted model from the first repeat (third item of each result)
rnn_model = scores[0][2]
# summarize scores
summarize_scores('lstm', scores)

In [None]:
# Write a diagram of the LSTM model to rnn_model.png, including layer shapes and names.
plot_model(rnn_model, to_file='rnn_model.png', show_shapes=True, show_layer_names=True)

## CNN with LSTM

In [None]:
# train the CNN-LSTM network
def model_fit(train, config):
    """Build and fit a CNN-LSTM network on lagged windows of ``train``.

    Args:
        train: Training observations, passed to ``series_to_supervised``.
        config: ``(n_seq, n_steps, n_filters, n_kernel, n_nodes, n_epochs,
            n_batch)``. Each sample uses ``n_seq * n_steps`` lagged values.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    n_seq, n_steps, n_filters, n_kernel, n_nodes, n_epochs, n_batch = config
    n_input = n_seq * n_steps
    supervised = series_to_supervised(train, n_in=n_input)
    features, target = supervised[:, :-1], supervised[:, -1]
    # cut every window of n_input lags into n_seq sub-sequences of n_steps values
    features = features.reshape((features.shape[0], n_seq, n_steps, 1))

    layers = (
        TimeDistributed(Conv1D(filters=n_filters, kernel_size=n_kernel, activation='relu', input_shape=(None,n_steps,1))),
        TimeDistributed(Conv1D(filters=n_filters, kernel_size=n_kernel, activation='relu')),
        TimeDistributed(MaxPooling1D(pool_size=2)),
        TimeDistributed(Flatten()),
        LSTM(n_nodes, activation='relu'),
        Dense(n_nodes, activation='relu'),
        Dense(n_nodes, activation='relu'),
        Dense(1),
    )
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='mse', optimizer='adam')

    model.fit(features, target, epochs=n_epochs, batch_size=n_batch, verbose=0)
    return model
 
# one-step forecast with an already fitted model
def model_predict(model, history, config):
    """Predict the next value from the last ``n_seq * n_steps`` entries of ``history``.

    ``config`` must have seven items; only the first two (``n_seq``,
    ``n_steps``) are used. Returns the first row of ``model.predict``'s output.
    """
    n_seq, n_steps, _n_filters, _n_kernel, _n_nodes, _n_epochs, _n_batch = config
    n_input = n_seq * n_steps
    # shape the window as (1, n_seq, n_steps, 1) before handing it to the model
    window = np.array(history[-n_input:])
    x_input = window.reshape((1, n_seq, n_steps, 1))
    forecast = model.predict(x_input, verbose=0)
    return forecast[0]

In [None]:
n_test = 12
# define config: the CNN-LSTM model_fit unpacks it as
# (n_seq, n_steps, n_filters, n_kernel, n_nodes, n_epochs, n_batch)
config = [3, 12, 64, 3, 100, 200, 100]
# repeated walk-forward evaluation (repeat_evaluate's default is 30 repeats)
scores = repeat_evaluate(ip, config, n_test)
# keep the fitted model from the first repeat (third item of each result)
cnn_lstm_model = scores[0][2]
# summarize scores
summarize_scores('cnn-lstm', scores)

In [None]:
# Write a diagram of the CNN-LSTM model to cnn_lstm_model.png, including layer shapes and names.
plot_model(cnn_lstm_model, to_file='cnn_lstm_model.png', show_shapes=True, show_layer_names=True)

# Prophet

In [None]:
# Training frame: the first three extracts stacked row-wise (this rebinds `df`).
df = pd.concat([chicago_df_1, chicago_df_2, chicago_df_3], ignore_index=False, axis=0)
# Held-out frame. NOTE: this binds a second name to the chicago_df_4 object rather than
# copying it, so any in-place change made through test_df also changes chicago_df_4.
test_df = chicago_df_4

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Preview the first rows of the held-out frame.
test_df.head()

In [None]:
# (rows, columns) of the training frame and of the held-out frame.
df.shape, test_df.shape

In [None]:
# Number of missing values in each column of the training frame.
df.isna().sum()

In [None]:
# Drop only the columns listed below; every other column (for example Date and
# Primary Type, which later cells read) is kept.
# drop() is called without inplace=True so that chicago_df_4, which test_df refers to,
# is left unmodified, and errors='ignore' lets this cell be re-run after the columns
# have already been removed.
_unused_columns = [
    'Unnamed: 0', 'Case Number', 'IUCR', 'X Coordinate', 'Y Coordinate',
    'Updated On', 'Year', 'FBI Code', 'Beat', 'Ward', 'Community Area',
    'Location', 'District', 'Latitude', 'Longitude',
]
df = df.drop(columns=_unused_columns, errors='ignore')
test_df = test_df.drop(columns=_unused_columns, errors='ignore')

In [None]:
# Parse the Date strings of both frames into timestamps using an explicit
# month/day/year 12-hour-clock format.
df.Date = pd.to_datetime(df.Date, format='%m/%d/%Y %I:%M:%S %p')
test_df.Date = pd.to_datetime(test_df.Date, format='%m/%d/%Y %I:%M:%S %p')

In [None]:
# Index both frames by their Date values so they can be resampled by time below.
df.index = pd.DatetimeIndex(df.Date)
test_df.index = pd.DatetimeIndex(test_df.Date)

In [None]:
# Number of training records for each distinct 'Primary Type' value.
df['Primary Type'].value_counts()

In [None]:
# Monthly record counts for the training frame and the held-out frame. They are returned
# as one tuple so that both are shown: a notebook cell only displays the value of its
# last expression, so a separate first line would be computed and then discarded.
df.resample('M').size(), test_df.resample('M').size()

In [None]:
# Line plot of the number of training records per month, with title and axis labels.
plt.plot(df.resample('M').size())
plt.title('Crimes Count Per Month')
plt.xlabel('Months')
plt.ylabel('Number of Crimes')

## Model

In [None]:
# Monthly record counts of the training frame as a two-column table.
prophet = df.resample('M').size().reset_index()
prophet.columns = ['Date', 'Crime Count']
prophet

In [None]:
# Wrap the table in a DataFrame and display it.
# NOTE(review): `prophet` comes from reset_index() above, so it is presumably already a
# DataFrame and this wrapper may be redundant — confirm.
prophet_df = pd.DataFrame(prophet)
prophet_df

In [None]:
# Rename the columns to 'ds' (date) and 'y' (value); this frame is what gets passed to
# Prophet's fit() below.
prophet_df_final = prophet_df.rename(columns={'Date':'ds', 'Crime Count':'y'})

In [None]:
# Create a Prophet model with default settings and fit it on the monthly training table.
prop = Prophet()
prop.fit(prophet_df_final)

In [None]:
# Build the frame of dates to predict for, then predict.
# NOTE(review): 1858 equals the number of days from 2012-01-01 through 2017-01-31,
# presumably chosen to span the held-out file — confirm.
# NOTE(review): the model is fit on monthly totals while these future dates are daily;
# see Prophet's guidance on non-daily data before interpreting the daily values.
future = prop.make_future_dataframe(periods=1858)  #periods = no. of days for prediction
forecast = prop.predict(future)

In [None]:
# Display the frame returned by prop.predict.
forecast

In [None]:
# Keep only the forecast rows dated after the last training date. Selecting by date
# instead of a fixed row offset stays correct if the number of training months changes.
# The copy makes preds_df independent of `forecast`, so later edits to it are safe.
preds_df = forecast[forecast['ds'] > prophet_df_final['ds'].max()].copy()
preds_df

In [None]:
# Add a real 'Date' column and index the predictions by it. Plain attribute assignment
# (preds_df.Date = ...) on a frame with no 'Date' column only sets a Python attribute,
# and pandas warns about it; assign() creates the column and keeps the cell re-runnable.
# No format string is passed because 'ds' is converted directly by to_datetime.
preds_df = preds_df.assign(Date=pd.to_datetime(preds_df['ds']))
preds_df.index = pd.DatetimeIndex(preds_df['Date'])

In [None]:
# The model was fit on monthly totals stamped by resample('M'), so every yhat is already
# on the monthly scale. Take the forecast for the last date in each month instead of
# summing the values within a month and rescaling by an arbitrary constant.
preds = preds_df.yhat.resample('M').last()

In [None]:
# Actual monthly counts from the held-out frame.
targets = test_df.resample('M').size()
# Pair actuals and predictions by month rather than by position, keeping only the months
# present in both, so a stray or missing month cannot silently shift the comparison.
targets, aligned_preds = targets.align(preds, join='inner')
reqd = np.asarray(targets.values)
ans = np.asarray(aligned_preds.values)

In [None]:
# Report RMSE and R2 of the predictions (ans) against the actual counts (reqd).
print('RMSE: %.3f' % sqrt(mean_squared_error(reqd, ans)))
print('R2 score: %.3f' % r2_score(reqd, ans))

In [None]:
# Plot the forecast with Prophet's plot method.
# NOTE(review): the fitted y values are monthly record counts, so the 'Crime Rate'
# axis label may be misleading — confirm the intended wording.
figure = prop.plot(forecast, xlabel='Date', ylabel='Crime Rate')

In [None]:
# Plot the forecast's components with Prophet's plot_components method (rebinds `figure`).
figure = prop.plot_components(forecast)