In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Kaggle starter snippet: walk the '/kaggle/input' directory tree and print
# the full path of every file found in it (prints nothing if the directory
# is absent, since os.walk then yields no entries).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# default code, prefer not to touch it
        
%matplotlib inline             
from statsmodels.tsa.seasonal import seasonal_decompose # seasonal decompose
from statsmodels.tsa.arima_model import ARMA # ARMA
from pandas.plotting import autocorrelation_plot #Autocorrelation
from statsmodels.tsa.stattools import adfuller # for Dickey-Fuller test
from sklearn import metrics 
import statsmodels.api as sm 
import scipy.stats as scs
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import datetime # manipulating date formats
import matplotlib.pyplot as plt # basic plotting
import math



import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for
# the rest of the session, which also hides deprecation notices emitted by
# the libraries used in the cells below.
warnings.filterwarnings("ignore") # turn off warnings

In [None]:
# Load the shootings records from the dataset CSV (relative path).
df = pd.read_csv("../input/us-police-shootings/shootings.csv")

# Parse the 'YYYY-MM-DD' date strings in a single vectorised call instead of
# running strptime row by row.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")

# Every row gets an integer 1 so that summing this column over a time bin
# yields the number of rows in that bin.
df["counter"] = 1

# check: DataFrame.info() prints its report itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
df.info()


In [None]:
# Keep only the two columns the rest of the notebook works with. Selecting
# them by name (rather than dropping the other fourteen in place) makes the
# cell idempotent: the previous in-place drop raised KeyError when the cell
# was run a second time.
# NOTE(review): equivalent to the old drop only if the CSV has no columns
# beyond the fourteen that were listed there plus 'date' — confirm.
df = df[['date', 'counter']].copy()

In [None]:
# Bucket the rows into consecutive 5-day bins keyed on the 'date' column and
# sum each bin; the result is indexed by bin start date.
five_day_bins = pd.Grouper(key='date', freq='5D')
wdf = df.groupby(five_day_bins).sum()

In [None]:
# Line chart of the binned series, drawn through an explicit Axes object
# instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_xlabel('date')
ax.set_ylabel('Shootings')
ax.plot(wdf)

In [None]:
# Trailing 12-bin mean and standard deviation on one chart. The rolling
# window object is built once and reused for both statistics.
fig, ax = plt.subplots(figsize=(16, 6))
trailing_window = wdf.rolling(window=12, center=False)
ax.plot(trailing_window.mean(), label='Rolling Mean');
ax.plot(trailing_window.std(), label='Rolling sd');
ax.legend();

In [None]:
def test_stationarity(timeseries):
    
    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)

# adfuller is documented as taking a 1-D series, so hand it the 'counter'
# column explicitly instead of the whole one-column DataFrame.
test_stationarity(wdf['counter'])

In [None]:
# Seasonal decomposition of the 5-day-binned series.
# A cycle length of 12 bins covers 12 * 5 = 60 days, not a year.
# NOTE(review): an annual cycle would be about 73 bins (365 / 5) — confirm
# which cycle length was intended before interpreting the seasonal panel.
decomposed_shootings_volume = sm.tsa.seasonal_decompose(wdf, freq=12)

# The result's plot() builds its own figure, so the size is applied to that
# figure; calling plt.figure() beforehand only left an empty extra figure.
figure = decomposed_shootings_volume.plot()
figure.set_size_inches(16, 30)
plt.show()

In [None]:
# Fit an order-(4, 0, 2) model on the full binned series.
# The bare name `ARIMA` is never imported in this notebook, so it raised
# NameError; reach the class through the already-imported statsmodels.tsa.api
# alias instead, the same way the grid-search cells reach smt.ARMA.
model = smt.ARIMA(wdf, order=(4, 0, 2))
result = model.fit()
print(result.summary())

In [None]:
# Draw the model's prediction plot on an Axes we create ourselves, so the
# figure size actually applies to it. The earlier plt.figure(figsize=(1, 6))
# was never drawn on and left an empty figure behind.
# NOTE(review): (1, 6) is assumed to be a typo for the (16, 6) used by the
# other charts — confirm.
fig, ax = plt.subplots(figsize=(16, 6))
result.plot_predict(ax=ax)

# Residual sum of squares between fitted and observed values.
# skipna=False keeps a NaN result visible if the two series do not line up.
squared_errors = (result.fittedvalues - wdf['counter']) ** 2
ax.set_title('RSS: %.4f' % squared_errors.sum(skipna=False))

In [None]:
# Refit the order-(4, 0, 2) model on positions 1..389 of the binned series;
# the following cells call predict(end=399), i.e. past the end of this sample.
# NOTE(review): the slice starts at position 1, so the first bin is left out
# as well — confirm that is intended.
training_bins = wdf.iloc[1:390]

# The bare name `ARIMA` is never imported in this notebook (NameError); use
# the already-imported statsmodels.tsa.api alias, as the smt.ARMA calls do.
model = smt.ARIMA(training_bins, order=(4, 0, 2))
result = model.fit()
print(result.summary())

In [None]:
# Chart the model's predictions up to position 399 on a wide figure.
fig, ax = plt.subplots(figsize=(30, 6))
predicted = result.predict(end=399)
ax.plot(predicted)

In [None]:
# Overlay the observed binned series and the model's predictions (up to
# position 399) on the same wide Axes.
fig, ax = plt.subplots(figsize=(30, 6))
ax.plot(wdf)
predicted = result.predict(end=399)
ax.plot(predicted)


In [None]:
# Grid search over ARMA(p, q) orders with p, q in 0..4, keeping the order
# with the smallest residual sum of squares (RSS) against the observed series.
best_rss = np.inf
best_order = None
best_mdl = None
failed_orders = []  # (order, exception name) for every fit that raised

rng = range(5)
for i in rng:
    for j in rng:
        try:
            # Keep the fitted model: previously it was discarded and the RSS
            # was computed from the unrelated global `result`, so every order
            # received the same score.
            tmp_mdl = smt.ARMA(wdf, order=(i, j)).fit(method='mle', trend='nc')
            squared_errors = (tmp_mdl.fittedvalues - wdf['counter']) ** 2
            # skipna=False: a NaN score (misaligned series) must not win.
            tmp_rss = squared_errors.sum(skipna=False)
        except Exception as err:
            # Best effort: some orders cannot be fitted. Record them instead
            # of swallowing the error silently, and let KeyboardInterrupt
            # through (a bare `except:` also caught that).
            failed_orders.append(((i, j), type(err).__name__))
            continue
        if tmp_rss < best_rss:
            best_rss = tmp_rss
            best_order = (i, j)
            best_mdl = tmp_mdl


print('RSS: {:6.5f} | order: {}'.format(best_rss, best_order))
if failed_orders:
    print('skipped orders (fit failed): {}'.format(failed_orders))

In [None]:
# Grid search over ARMA(p, q) orders with p, q in 0..4, keeping the fitted
# model with the lowest AIC.
best_aic = np.inf
best_order = None
best_mdl = None
failed_orders = []  # (order, exception name) for every fit that raised

rng = range(5)
for i in rng:
    for j in rng:
        try:
            tmp_mdl = smt.ARMA(wdf, order=(i, j)).fit(method='mle', trend='nc')
            tmp_aic = tmp_mdl.aic
        except Exception as err:
            # Best effort: some orders cannot be fitted. Record them instead
            # of swallowing the error silently, and let KeyboardInterrupt
            # through (a bare `except:` also caught that).
            failed_orders.append(((i, j), type(err).__name__))
            continue
        if tmp_aic < best_aic:
            best_aic = tmp_aic
            best_order = (i, j)
            best_mdl = tmp_mdl


print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
if failed_orders:
    print('skipped orders (fit failed): {}'.format(failed_orders))

In [None]:
# Autocorrelation chart of the binned series; the Axes returned by pandas is
# kept in a variable and the figure is rendered explicitly.
acf_axes = autocorrelation_plot(wdf)
plt.show()