In [1]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import statsmodels.tsa.api as tsa
from pmdarima.model_selection import train_test_split
from pmdarima.arima.utils import ndiffs, nsdiffs

In [2]:
# Open the ZIP file
with zipfile.ZipFile('info/Chicago_Crime_2001-2022 (1).zip') as zf:
    file_list = zf.namelist()
    
    dfs = []
    
    for file_name in file_list:
        if file_name.endswith('.csv'):
            with zf.open(file_name) as f:
                df = pd.read_csv(f)
            dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

combined_df.to_csv('combined_data.csv', index=False)

print(combined_df)

               ID                    Date        Primary Type  \
0         1326041  01/01/2001 01:00:00 AM             BATTERY   
1         1319931  01/01/2001 01:00:00 PM             BATTERY   
2         1324743  01/01/2001 01:00:00 PM            GAMBLING   
3         1310717  01/01/2001 01:00:00 AM     CRIMINAL DAMAGE   
4         1318099  01/01/2001 01:00:00 AM             BATTERY   
...           ...                     ...                 ...   
7713104  12938029  12/31/2022 12:50:00 PM             ASSAULT   
7713105  12937822  12/31/2022 12:50:00 PM             ASSAULT   
7713106  12937583  12/31/2022 12:52:00 AM             BATTERY   
7713107  12938420  12/31/2022 12:52:00 PM             ASSAULT   
7713108  12939906  12/31/2022 12:59:00 PM  DECEPTIVE PRACTICE   

                                     Description  \
0                                         SIMPLE   
1                                         SIMPLE   
2                            ILLEGAL ILL LOTTERY   
3          

In [3]:
df= pd.read_csv('combined_data.csv')
df

Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
0,1326041,01/01/2001 01:00:00 AM,BATTERY,SIMPLE,RESIDENCE,False,False,1624,16.0,,41.957850,-87.749185
1,1319931,01/01/2001 01:00:00 PM,BATTERY,SIMPLE,RESIDENCE,False,True,825,8.0,,41.783892,-87.684841
2,1324743,01/01/2001 01:00:00 PM,GAMBLING,ILLEGAL ILL LOTTERY,STREET,True,False,313,3.0,,41.780412,-87.611970
3,1310717,01/01/2001 01:00:00 AM,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2424,24.0,,42.012391,-87.678032
4,1318099,01/01/2001 01:00:00 AM,BATTERY,SIMPLE,RESIDENCE PORCH/HALLWAY,False,True,214,2.0,,41.819538,-87.620020
...,...,...,...,...,...,...,...,...,...,...,...,...
7713104,12938029,12/31/2022 12:50:00 PM,ASSAULT,SIMPLE,APARTMENT,False,False,1014,10.0,24.0,41.855911,-87.719966
7713105,12937822,12/31/2022 12:50:00 PM,ASSAULT,AGGRAVATED - HANDGUN,APARTMENT,False,True,733,7.0,6.0,41.766546,-87.645669
7713106,12937583,12/31/2022 12:52:00 AM,BATTERY,SIMPLE,BARBERSHOP,False,False,1135,11.0,28.0,41.868829,-87.686098
7713107,12938420,12/31/2022 12:52:00 PM,ASSAULT,SIMPLE,COMMERCIAL / BUSINESS OFFICE,False,False,1432,14.0,32.0,41.930693,-87.685657


In [4]:
df['Date']= pd.to_datetime(df['Date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7713109 entries, 0 to 7713108
Data columns (total 12 columns):
 #   Column                Dtype         
---  ------                -----         
 0   ID                    int64         
 1   Date                  datetime64[ns]
 2   Primary Type          object        
 3   Description           object        
 4   Location Description  object        
 5   Arrest                bool          
 6   Domestic              bool          
 7   Beat                  int64         
 8   District              float64       
 9   Ward                  float64       
 10  Latitude              float64       
 11  Longitude             float64       
dtypes: bool(2), datetime64[ns](1), float64(4), int64(2), object(3)
memory usage: 603.2+ MB


In [5]:
df= df.set_index("Date")
df

Unnamed: 0_level_0,ID,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2001-01-01 01:00:00,1326041,BATTERY,SIMPLE,RESIDENCE,False,False,1624,16.0,,41.957850,-87.749185
2001-01-01 13:00:00,1319931,BATTERY,SIMPLE,RESIDENCE,False,True,825,8.0,,41.783892,-87.684841
2001-01-01 13:00:00,1324743,GAMBLING,ILLEGAL ILL LOTTERY,STREET,True,False,313,3.0,,41.780412,-87.611970
2001-01-01 01:00:00,1310717,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2424,24.0,,42.012391,-87.678032
2001-01-01 01:00:00,1318099,BATTERY,SIMPLE,RESIDENCE PORCH/HALLWAY,False,True,214,2.0,,41.819538,-87.620020
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 12:50:00,12938029,ASSAULT,SIMPLE,APARTMENT,False,False,1014,10.0,24.0,41.855911,-87.719966
2022-12-31 12:50:00,12937822,ASSAULT,AGGRAVATED - HANDGUN,APARTMENT,False,True,733,7.0,6.0,41.766546,-87.645669
2022-12-31 00:52:00,12937583,BATTERY,SIMPLE,BARBERSHOP,False,False,1135,11.0,28.0,41.868829,-87.686098
2022-12-31 12:52:00,12938420,ASSAULT,SIMPLE,COMMERCIAL / BUSINESS OFFICE,False,False,1432,14.0,32.0,41.930693,-87.685657


In [6]:
ts = df.groupby('Primary Type').resample("Y").sum()
ts

  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary Type').resample("Y").sum()
  ts = df.groupby('Primary T

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
Primary Type,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ARSON,2001-12-31,1635830865,181,60,1247579,11870.0,103.0,42047.307400,-88118.160187
ARSON,2002-12-31,2288357434,156,72,1287303,12169.0,15924.0,40922.433732,-85752.010876
ARSON,2003-12-31,2703076345,140,53,1128192,10627.0,19981.0,39863.754410,-83554.299139
ARSON,2004-12-31,2646181864,109,37,992253,9509.0,17373.0,32384.111791,-67867.324658
ARSON,2005-12-31,2850867321,121,40,843632,7930.0,15043.0,28785.106692,-60323.991343
...,...,...,...,...,...,...,...,...,...
WEAPONS VIOLATION,2018-12-31,62064458971,3803,38,5412225,52864.0,108302.0,227616.296667,-477270.082962
WEAPONS VIOLATION,2019-12-31,74471980828,4223,49,6100494,59551.0,122194.0,264977.937719,-555625.593886
WEAPONS VIOLATION,2020-12-31,102154848702,5448,35,8108066,79120.0,163732.0,351911.729617,-737889.451103
WEAPONS VIOLATION,2021-12-31,111493649779,5560,58,8995601,87874.0,180363.0,374011.308683,-784180.519071


In [7]:
ts['Arrest'] = ts['Arrest'].interpolate()

In [8]:
# Resample to monthly
ts = df.resample("Y").mean()

# Select just the 'Primary Type' column from the resampled DataFrame
primary_type_resampled = ts['Primary Type']

# Display the resulting Series
print(primary_type_resampled.head())

  ts = df.resample("Y").mean()


KeyError: 'Primary Type'

In [None]:
# Define the series
ts = ts['ID']
# Plot
ts.plot();

In [None]:
decomp = tsa.seasonal_decompose(ts)
fig = decomp.plot()
fig.set_size_inches(12,5)
fig.tight_layout()

In [None]:
d = ndiffs(df)
print (f'd = {d}')

In [None]:
D = nsdiffs(ts, m =12)
print(f'D = {D}')

In [None]:
ts_diff = ts.diff().dropna()

In [None]:
plot_acf_pacf(ts_diff, annotate_seas=True, m = 12);

In [None]:
train, test = train_test_split(ts, test_size=.25)
ax = train.plot(label='Train')
test.plot(ax=ax, label='Test')
ax.legend();

In [None]:
# Fitting a SARIMA model

# Orders for non seasonal components
p = 1  # nonseasonal AR
d = 1  # nonseasonal differencing
q = 1  # nonseasonal MA

# Orders for seasonal components
P = 1  # Seasonal AR
D = 0  # Seasonal differencing
Q = 1  # Seasonal MA
m = 12 # Seasonal period

sarima = tsa.ARIMA(train, order = (p,d,q), seasonal_order=(P,D,Q,m)).fit()

In [None]:
# Obtain summary of forecast as dataframe
forecast_df = sarima.get_forecast(len(test)).summary_frame()

plot_forecast(train, test, forecast_df)

regression_metrics_ts(test, forecast_df["mean"])

In [None]:
sarima.summary()

In [None]:
fig = sarima.plot_diagnostics()
fig.set_size_inches(10,6)
fig.tight_layout()