In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# lazily build the full path of every file found below the input directory, then print each one
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Time series

A time series is simply a series of data points ordered in time. As continuous monitoring and data collection become more common, the need for competent time series analysis with both statistical and machine learning techniques will increase.

A dataset in which only one variable is observed at each point in time is called a ‘Univariate Time Series’; if two or more variables are observed at each point in time, it is called a ‘Multivariate Time Series’.

In this notebook, we will focus on a univariate time series: forecasting the monthly number of reported crimes in Chicago with Facebook Prophet and Auto ARIMA in Python.

In [None]:
# load libraries 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import plotly.express as px



Load and combine the data

In [None]:
# Load the four CSV extracts of the Chicago crimes dataset (one per range of years).
# `on_bad_lines='warn'` skips malformed rows and reports them, which is what
# `error_bad_lines=False` did; that keyword was deprecated in pandas 1.3 and removed in pandas 2.0.
chicago_df_1 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2001_to_2004.csv', on_bad_lines='warn')
chicago_df_2 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2005_to_2007.csv', on_bad_lines='warn')
chicago_df_3 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2008_to_2011.csv', on_bad_lines='warn')
chicago_df_4 = pd.read_csv('/kaggle/input/crimes-in-chicago/Chicago_Crimes_2012_to_2017.csv', on_bad_lines='warn')

In [None]:
# stack the four extracts row-wise into a single frame, keeping each file's own row labels
yearly_frames = [chicago_df_1, chicago_df_2, chicago_df_3, chicago_df_4]
df = pd.concat(yearly_frames, axis=0, ignore_index=False)

In [None]:
df.shape

Exploring the dataset

In [None]:
# preview the first rows of the combined dataset (no train/test split has been made yet)
df.head()

In [None]:
# keep only the columns used in the rest of the notebook
needed_columns = ['ID', 'Date', 'Primary Type', 'Location Description', 'Arrest', 'Domestic']
df = df[needed_columns]

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# parse the 'Date' strings (month/day/year, 12-hour clock with AM/PM) into datetime values
date_format = '%m/%d/%Y %I:%M:%S %p'
df['Date'] = pd.to_datetime(df['Date'], format=date_format)

In [None]:
# index the rows by their timestamp (the 'Date' column itself is kept) so they can be resampled by time
df.index = pd.DatetimeIndex(df['Date'])

In [None]:
# verify the change
df.head()

In [None]:
# summary: dimensions, column names, which columns contain nulls, distinct values per column
row_count, column_count = df.shape
print("Rows     : ", row_count)
print("Columns  : ", column_count)
print("\nFeatures : \n\n", list(df.columns))
print("\nMissing values : \n\n", df.isna().any())
print("\nUnique values :  \n\n", df.nunique())

In [None]:
# Examine the null records of Location Description
df[df["Location Description"].isnull()]


In [None]:
# drop every row that has a missing value in ANY of the selected columns
# (not only the rows whose 'Location Description' is missing)
df = df.dropna()
# show the count of null records in each column (all zeros after the drop)
df.isnull().sum()

# Data Analysis & Visualization

Plot the top 10 primary types

In [None]:
# Bar chart of the 10 most frequent primary crime types, each bar annotated with its count.
top_primary_types = df['Primary Type'].value_counts().iloc[:10].index

fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x='Primary Type', data=df, order=top_primary_types, palette='RdBu_r', ax=ax)

# label each bar with its count: centred above the bar and formatted as an integer,
# so a float bar height is never printed as e.g. "123.0"
for bar in ax.patches:
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
            f'{int(bar.get_height()):,}',
            ha='center', va='bottom', fontsize=15, color='dimgrey')

ax.set_title('Top 10 primary crime types')
ax.set_ylabel('Number of Crimes')
# the category names are long; rotate them so neighbouring tick labels do not overlap
ax.tick_params(axis='x', rotation=45)
plt.show()


Plot the top 15 location descriptions

In [None]:
# Horizontal bar chart of the 15 most frequent location descriptions, each bar annotated with its count.
top_locations = df['Location Description'].value_counts().iloc[:15].index

fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(y='Location Description', data=df, order=top_locations, palette='RdBu_r', ax=ax)

# label each bar with its count: placed just past the bar end, vertically centred on the bar,
# and formatted as an integer so a float bar width is never printed as e.g. "123.0"
for bar in ax.patches:
    ax.text(bar.get_width(), bar.get_y() + bar.get_height() / 2,
            f' {int(bar.get_width()):,}',
            ha='left', va='center', fontsize=15, color='dimgrey')

ax.set_title('Top 15 location descriptions')
ax.set_xlabel('Number of Crimes')
plt.show()


In [None]:
# DataFrame.resample groups the rows into fixed time bins based on the DatetimeIndex.

# one bin per calendar year; .size() is the number of records that fall into each bin
crimes_per_year = df.resample('Y').size()

plt.plot(crimes_per_year)
plt.title('Crimes Count Per Year')
plt.xlabel('Years')
plt.ylabel('Number of Crimes')

In [None]:
# one bin per calendar month; .size() is the number of records that fall into each bin
crimes_per_month = df.resample('M').size()

plt.plot(crimes_per_month)
plt.title('Crimes Count Per Month')
plt.xlabel('Months')
plt.ylabel('Number of Crimes')

Preparing the data for the model

In [None]:
# count the records in each calendar month across all years ...
monthly_counts = df.resample('M').size()
# ... and turn the result into a two-column frame: the month and its crime count
ts_df = monthly_counts.reset_index()
ts_df.columns = ['Date', 'Crime Count']

In [None]:
ts_df.head()

In [None]:
# plot interactive line chart with a range slider and quick-zoom buttons
fig = px.line(ts_df, x='Date', y='Crime Count', title='Crime count')

# Each button zooms the x-axis back `count` years from the end of the range,
# so `count` must agree with the number of years shown in the button's label.
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=[
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(count=3, label="3y", step="year", stepmode="backward"),
            dict(count=5, label="5y", step="year", stepmode="backward"),
            dict(step="all"),
        ]
    ),
)
fig.show()

In [None]:
# Make 'Date' the index of the monthly series.
# NOTE: not idempotent - re-running this cell raises KeyError because 'Date' is then no longer a column.
ts_df = ts_df.set_index('Date')

Split the series into train and test sets: the model is fitted on the training set, and its forecast is then evaluated against the held-out test set.

In [None]:
# Chronological hold-out split: the first 181 monthly rows are used for fitting,
# the remaining rows are held out for evaluation.
# .copy() makes both frames independent of ts_df, so the error columns that are
# added to `test` later in the notebook do not trigger SettingWithCopyWarning.
train = ts_df.iloc[:181].copy()
test = ts_df.iloc[181:].copy()
print(train.shape)
print(test.shape)

In [None]:
# quick visual check of the split: the training rows followed by the held-out rows
plt.plot(train)
plt.plot(test)

# Build Fb prophet model

Prophet is open source software released by Facebook’s Core Data Science team.

Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects.

Prophet works best with time series that have strong seasonal effects and several seasons of historical data.

For more information, please check this out: https://research.fb.com/prophet-forecasting-at-scale/ https://facebook.github.io/prophet/docs/quick_start.html#python-api

In [None]:
# The package was renamed from `fbprophet` to `prophet` in its 1.0 release;
# try the current name first and fall back to the legacy one on older environments.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

In [None]:
# move the 'Date' index of the training data back into an ordinary column
prophet_df = train.reset_index(drop=False)
prophet_df.head()

* Rename the columns as ds and y for the model

In [None]:
# map the notebook's column names onto the 'ds' / 'y' names used for the model
prophet_column_names = {'Date': 'ds', 'Crime Count': 'y'}
prophet_df = prophet_df.rename(columns=prophet_column_names)
prophet_df.head()

In [None]:
# fit a Prophet model, constructed with its default settings, on the training rows
m = Prophet()
m.fit(prophet_df)


In [None]:
# Forecasting into the future: extend the training dates by one month-end ('M') step
# per held-out row, so the forecast horizon always matches the size of the test set
future = m.make_future_dataframe(periods=len(test), freq='M')
forecast = m.predict(future)

In [None]:
forecast.head()

In [None]:
figure = m.plot(forecast, xlabel='Date', ylabel='Crime Rate')

In [None]:
plot = m.plot_components(forecast)

In [None]:
forecast_df = pd.DataFrame(forecast)
forecast_df.head()

In [None]:
# keep only the timestamp and the point forecast, then index the frame by timestamp
forecast_df = forecast_df.loc[:, ['ds', 'yhat']].set_index('ds')
forecast_df.head()

In [None]:
# overlay the training data, the held-out data and the Prophet forecast on one figure
plt.figure(figsize=(20, 10))
for series, series_label in [(train, 'Train'), (test, 'Test'), (forecast_df, 'Forecast')]:
    plt.plot(series, label=series_label)
plt.title('Plotting Train vs Test vs Predicted Crime rate')
plt.xlabel('Year')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
test['Crime Count']

In [None]:
forecast_df['yhat'][181:]

In [None]:
# calculate the Prophet forecast error on the held-out rows
# align the forecast to the test dates by index label rather than by a hard-coded row position
prophet_forecast = forecast_df['yhat'].reindex(test.index)
test['fbprophet_error'] = test['Crime Count'] - prophet_forecast

# RMSE is expressed in the unit of the series (crimes per month); MAPE is a percentage
rmse = np.sqrt(np.mean(test.fbprophet_error**2)).round(2)
mape = np.round(np.mean(np.abs(100 * test.fbprophet_error / test['Crime Count'])), 0)

# the series holds crime counts, not money, so no currency symbol is printed
print('RMSE =', rmse)
print('MAPE =', mape, '%')

# Build Auto Arima Model

ARIMA is an acronym for Auto Regressive (AR) Integrated (I) Moving Average (MA) which indicates that an ARIMA model has three components to it.

In [None]:
!pip install pmdarima

In [None]:
import pmdarima as pm

In [None]:
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(train)

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
# partial autocorrelation of the training series for the first 20 lags;
# the trailing ';' suppresses the returned Figure, which the notebook would otherwise render a second time
plot_pacf(train, lags=20);

Here, `auto_arima` searches over candidate (p, d, q) orders and their seasonal counterparts (seasonal period `m=12`), starting from p = 0 and q = 3 with `max_order=5`, and keeps the best model it finds; the differencing order d is chosen with the ADF test (`test='adf'`). The function has many other parameters; to learn more about them, visit this link [[here]](https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html)

![](https://miro.medium.com/max/1148/1*64ZOjhR_jsZ9D-E51MKvBQ.png)
* Auto-Regressive (p) -> Number of autoregressive terms.
* Integrated (d) -> Number of nonseasonal differences needed for stationarity.
* Moving Average (q) -> Number of lagged forecast errors in the prediction equation.

In [None]:
# stepwise seasonal ARIMA search on the training rows; trace=True prints every candidate tried
model = pm.auto_arima(
    train,
    start_p=0,
    start_q=3,
    max_order=5,
    m=12,
    seasonal=True,
    test='adf',
    stepwise=True,
    error_action='ignore',
    trace=True,
)

In [None]:
model.summary()

The output above shows that the final model fitted was an ARIMA(0,1,3) estimator, where the values of the parameters p, d, and q were zero, one, and three, respectively. 

In [None]:
# create a dataframe of ARIMA forecasts labelled with the test dates
# forecast exactly as many periods as the test set holds instead of a hard-coded 12
forecast_horizon = len(test)
# np.asarray strips any index the model attaches to its output, so the values are
# assigned to the test dates by position instead of being re-aligned by label
arima_forecast = np.asarray(model.predict(n_periods=forecast_horizon))
prediction = pd.DataFrame(arima_forecast, index=test.index, columns=['Predicted Crime Count'])

In [None]:
# print predictions
prediction

In [None]:
test

In [None]:
# overlay the training data, the held-out counts and the ARIMA forecast on one figure
arima_plot_series = [
    (train, 'Train'),
    (test['Crime Count'], 'Test'),
    (prediction, 'Prediction'),
]
for series, series_label in arima_plot_series:
    plt.plot(series, label=series_label)
plt.title('Plotting Train vs Test vs Predicted Crime rate')
plt.xlabel('Year')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# calculate the ARIMA forecast error on the held-out rows (both series share the test dates as index)
test['arima_error'] = test['Crime Count'] - prediction['Predicted Crime Count']

# RMSE is expressed in the unit of the series (crimes per month); MAPE is a percentage
rmse = np.sqrt(np.mean(test.arima_error**2)).round(2)
mape = np.round(np.mean(np.abs(100 * test.arima_error / test['Crime Count'])), 0)

# the series holds crime counts, not money, so no currency symbol is printed
print('RMSE =', rmse)
print('MAPE =', mape, '%')

In [None]:
# draw the diagnostic plots of the fitted ARIMA model
# NOTE(review): the result is presumably bound to `out` only so the notebook does not echo it - confirm
out = model.plot_diagnostics()

# **Hello! I Hope you are well, if you find this notebook helpful please upvote and support my work.**