In [1]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import statsmodels.tsa.api as tsa
from pmdarima.model_selection import train_test_split
from pmdarima.arima.utils import ndiffs, nsdiffs

In [2]:
# Read each CSV member of the archive, then stack the pieces into one frame
with zipfile.ZipFile('info/Chicago_Crime_2001-2022 (1).zip') as zf:
    dfs = []
    for file_name in zf.namelist():
        # Archive members that are not CSV files are skipped
        if not file_name.endswith('.csv'):
            continue
        with zf.open(file_name) as f:
            dfs.append(pd.read_csv(f))

# ignore_index=True renumbers the rows 0..n-1 across all parts
combined_df = pd.concat(dfs, ignore_index=True)

# Persist the merged data (without the row index) so later cells can reload it
combined_df.to_csv('combined_data.csv', index=False)

print(combined_df)

               ID                    Date        Primary Type  \
0         1326041  01/01/2001 01:00:00 AM             BATTERY   
1         1319931  01/01/2001 01:00:00 PM             BATTERY   
2         1324743  01/01/2001 01:00:00 PM            GAMBLING   
3         1310717  01/01/2001 01:00:00 AM     CRIMINAL DAMAGE   
4         1318099  01/01/2001 01:00:00 AM             BATTERY   
...           ...                     ...                 ...   
7713104  12938029  12/31/2022 12:50:00 PM             ASSAULT   
7713105  12937822  12/31/2022 12:50:00 PM             ASSAULT   
7713106  12937583  12/31/2022 12:52:00 AM             BATTERY   
7713107  12938420  12/31/2022 12:52:00 PM             ASSAULT   
7713108  12939906  12/31/2022 12:59:00 PM  DECEPTIVE PRACTICE   

                                     Description  \
0                                         SIMPLE   
1                                         SIMPLE   
2                            ILLEGAL ILL LOTTERY   
3          

In [3]:
# Reload the merged dataset from the CSV written to disk
df = pd.read_csv('combined_data.csv')
df

Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
0,1326041,01/01/2001 01:00:00 AM,BATTERY,SIMPLE,RESIDENCE,False,False,1624,16.0,,41.957850,-87.749185
1,1319931,01/01/2001 01:00:00 PM,BATTERY,SIMPLE,RESIDENCE,False,True,825,8.0,,41.783892,-87.684841
2,1324743,01/01/2001 01:00:00 PM,GAMBLING,ILLEGAL ILL LOTTERY,STREET,True,False,313,3.0,,41.780412,-87.611970
3,1310717,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2424,24.0,,42.012391,-87.678032
4,1318099,01/01/2001 01:00:00 AM,BATTERY,SIMPLE,RESIDENCE PORCH/HALLWAY,False,True,214,2.0,,41.819538,-87.620020
...,...,...,...,...,...,...,...,...,...,...,...,...
7713104,12938029,12/31/2022 12:50:00 PM,ASSAULT,SIMPLE,APARTMENT,False,False,1014,10.0,24.0,41.855911,-87.719966
7713105,12937822,12/31/2022 12:50:00 PM,ASSAULT,AGGRAVATED - HANDGUN,APARTMENT,False,True,733,7.0,6.0,41.766546,-87.645669
7713106,12937583,12/31/2022 12:52:00 AM,BATTERY,SIMPLE,BARBERSHOP,False,False,1135,11.0,28.0,41.868829,-87.686098
7713107,12938420,12/31/2022 12:52:00 PM,ASSAULT,SIMPLE,COMMERCIAL / BUSINESS OFFICE,False,False,1432,14.0,32.0,41.930693,-87.685657


In [None]:
# The Date strings shown above all look like "01/01/2001 01:00:00 AM"
# (month/day/year with a 12-hour clock and AM/PM marker). Stating the format
# explicitly means pandas does not have to guess it for millions of rows, and a
# row that deviates from it raises instead of being parsed with a different
# (possibly day/month-swapped) interpretation.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')

# Confirm the column is now datetime64 and check dtypes / non-null counts
df.info()

In [None]:
# Move the Date column into the index so the frame can be resampled by time.
# Note: running this cell a second time fails, because "Date" is no longer a column.
df = df.set_index(keys="Date")
df

In [None]:
# Count incidents per month for every crime type: group by type, resample the
# Date index to monthly bins, count rows, then pivot the types into columns
# (rows = month, columns = Primary Type).
ts = (
    df.groupby("Primary Type")
      .resample("M")
      .size()
      .unstack(level=0)
)

# Months in which a crime type has no entry become NaN after the pivot; a
# missing count is treated as zero incidents. (The original referenced an
# undefined name `ts_m` here, which raised NameError on a fresh kernel.)
ts = ts.fillna(0)

In [None]:
# Fill any remaining gaps in the THEFT column by linear interpolation.
# NOTE(review): the previous cell calls fillna(0) on the whole table, so there
# are presumably no NaNs left for this to fill — confirm whether it is still needed.
ts['THEFT'] = ts['THEFT'].interpolate(method='linear')

In [None]:
# From here on `ts` is the single monthly THEFT series rather than the full table
ts = ts.loc[:, 'THEFT']
ts.plot();

In [None]:
# Decompose the series (additive model, the library default) and draw the panels
decomp = tsa.seasonal_decompose(ts, model='additive')
fig = decomp.plot()

# Widen the figure and re-pack the subplots so labels do not overlap
fig.set_size_inches((12, 5))
fig.tight_layout()

In [None]:
# Estimate how many non-seasonal differences the series needs
d = ndiffs(ts)
print(f'd = {d}')

In [None]:
# Estimate how many seasonal differences are needed, using a 12-step season
D = nsdiffs(ts, m=12)
print(f'D = {D}')

In [None]:
# First difference of the series; the leading NaN produced by diff is dropped
ts_diff = ts.diff(periods=1).dropna()

In [None]:
# Plot the differenced series with the project helper, passing a season length of 12.
# NOTE(review): plot_acf_pacf is neither defined nor imported in the cells visible
# here — confirm it is defined before this cell, otherwise a fresh
# "Restart & Run All" stops with NameError at this point.
plot_acf_pacf(ts_diff, annotate_seas=True, m = 12);

In [None]:
# Hold out 25% of the observations as the test set
train, test = train_test_split(ts, test_size=0.25)

# Draw both pieces on one axes so the split point is visible
ax = train.plot(label='Train')
test.plot(label='Test', ax=ax)
ax.legend();

In [None]:
# SARIMA specification and fit.
# d and D are set by hand here, replacing the values that the ndiffs / nsdiffs
# cells stored under the same names.

# Non-seasonal orders: AR, differencing, MA
p, d, q = 1, 1, 1

# Seasonal orders: AR, differencing, MA, and the season length
P, D, Q, m = 1, 0, 1, 12

sarima = tsa.ARIMA(
    train,
    order=(p, d, q),
    seasonal_order=(P, D, Q, m),
).fit()

In [None]:
# Forecast one step per held-out observation and collect the result as a DataFrame
forecast_df = sarima.get_forecast(steps=len(test)).summary_frame()

# NOTE(review): plot_forecast and regression_metrics_ts are neither defined nor
# imported in the cells visible here — confirm they exist before this cell runs
# on a fresh kernel.
plot_forecast(train, test, forecast_df)

# Compare the held-out values against the forecast's "mean" column
regression_metrics_ts(test, forecast_df["mean"])

In [None]:
# Display the fitted model's summary as the cell output
sarima.summary()

In [None]:
# Draw the fitted model's diagnostic plots
fig = sarima.plot_diagnostics()

# Enlarge the figure and re-pack the panels so titles and ticks do not collide
fig.set_size_inches((10, 6))
fig.tight_layout()