# Data Collection

In [1]:
# Retrieve the Arrested Dataset from the Toronto Open Data API
#
# Produces `resource_dump_data`: the CSV text of the package's datastore-active
# resource. If several resources are datastore-active, the last one that
# downloads successfully is kept.

import requests

base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# Seconds to wait on the server; without a timeout a stalled connection would
# hang the notebook indefinitely.
REQUEST_TIMEOUT = 60

# Datasets are called "packages". Each package can contain many "resources"
url = base_url + "/api/3/action/package_show"
params = { "id": "police-annual-statistical-report-arrested-and-charged-persons"}
response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
package = response.json()

# Reset first so a failed download cannot silently reuse a value left over in
# the kernel from an earlier run.
resource_dump_data = None

# To get resource data:
for resource in package["result"]["resources"]:

    # For datastore_active resources:
    if resource.get("datastore_active"):

        # To get all records in CSV format:
        url = base_url + "/datastore/dump/" + resource["id"]
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            # An HTTP 4xx/5xx body is not data; HTTPError is a
            # RequestException, so it is reported by the handler below.
            response.raise_for_status()
            resource_dump_data = response.text
            # Success message
            print("Data retrieved successfully.")
        except requests.exceptions.RequestException as e:
            # Error message
            print(f"Error retrieving data: {e}")

# Stop here with a clear message rather than letting the next cell fail on a
# missing or empty variable.
if resource_dump_data is None:
    raise RuntimeError("No datastore-active resource could be downloaded.")

Data retrieved successfully.


In [2]:
# Load the downloaded CSV text into a pandas DataFrame

from io import StringIO
import pandas as pd

# Wrap the raw CSV string in an in-memory text buffer so read_csv can parse it
csv_file = StringIO(resource_dump_data)
data = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first five rows
data.head(n=5)

Unnamed: 0,_id,ARREST_YEAR,DIVISION,HOOD_158,NEIGHBOURHOOD_158,SEX,AGE_COHORT,AGE_GROUP,CATEGORY,SUBTYPE,ARREST_COUNT
0,1,2019,D14,83,Dufferin Grove (83),Female,25 to 34,Adult,Other Criminal Code Violations,Other,1
1,2,2022,D12,30,Brookhaven-Amesbury (30),Male,<18,Youth,Crimes Against the Person,Assaults,2
2,3,2018,D14,165,Harbourfront-CityPlace (165),Male,18 to 24,Adult,Other Criminal Code Violations,Other,1
3,4,2015,D22,18,New Toronto (18),Male,25 to 34,Adult,Controlled Drugs and Substances Act,Other,3
4,5,2014,D52,78,Kensington-Chinatown (78),Male,25 to 34,Adult,Other Criminal Code Violations,Other,46


# Exploratory data analysis (EDA)

In [3]:
# List every column with its dtype and non-null count, plus the frame's memory usage.
# verbose=True asks for the full per-column summary rather than a shortened one.
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129374 entries, 0 to 129373
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   _id                129374 non-null  int64 
 1   ARREST_YEAR        129374 non-null  int64 
 2   DIVISION           129374 non-null  object
 3   HOOD_158           129374 non-null  object
 4   NEIGHBOURHOOD_158  129374 non-null  object
 5   SEX                129374 non-null  object
 6   AGE_COHORT         129374 non-null  object
 7   AGE_GROUP          129374 non-null  object
 8   CATEGORY           129374 non-null  object
 9   SUBTYPE            129374 non-null  object
 10  ARREST_COUNT       129374 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 10.9+ MB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the DataFrame columns.
# Only the three int64 columns appear in the result; the _id figures look like those of a
# running row number — presumably not analytically meaningful.
data.describe()

Unnamed: 0,_id,ARREST_YEAR,ARREST_COUNT
count,129374.0,129374.0,129374.0
mean,64687.5,2017.840184,4.658579
std,37347.2012,2.57916,9.880257
min,1.0,2014.0,1.0
25%,32344.25,2016.0,1.0
50%,64687.5,2018.0,2.0
75%,97030.75,2020.0,4.0
max,129374.0,2022.0,494.0


In [5]:
# Print the DataFrame's shape as a (rows, columns) tuple
print("the dimension:", data.shape)

the dimension: (129374, 11)


In [6]:
# Take the first row (position 0) as a Series to inspect one complete record,
# then display it as the cell's output
data_values = data.iloc[0]
data_values

_id                                               1
ARREST_YEAR                                    2019
DIVISION                                        D14
HOOD_158                                         83
NEIGHBOURHOOD_158               Dufferin Grove (83)
SEX                                          Female
AGE_COHORT                                 25 to 34
AGE_GROUP                                     Adult
CATEGORY             Other Criminal Code Violations
SUBTYPE                                       Other
ARREST_COUNT                                      1
Name: 0, dtype: object

In [7]:
# Report how many distinct values each categorical (object-dtype) column holds
categorical_cols = data.select_dtypes(include=['object']).columns
# One nunique() call over the categorical sub-frame yields a per-column count
for col, unique_count in data[categorical_cols].nunique().items():
    print("Unique count of column '{}': {}".format(col, unique_count))

Unique count of column 'DIVISION': 17
Unique count of column 'HOOD_158': 159
Unique count of column 'NEIGHBOURHOOD_158': 159
Unique count of column 'SEX': 3
Unique count of column 'AGE_COHORT': 8
Unique count of column 'AGE_GROUP': 3
Unique count of column 'CATEGORY': 7
Unique count of column 'SUBTYPE': 17


###### We can observe that this dataset is aggregated; thus, minor adjustments will be necessary.

In [8]:
# Count the rows recorded for each 'ARREST_YEAR', ordered chronologically by year.
# These are row counts, not arrest totals — each row carries its own ARREST_COUNT.
data['ARREST_YEAR'].value_counts().sort_index()

ARREST_YEAR
2014    15484
2015    15138
2016    15609
2017    15350
2018    14620
2019    13658
2020    12782
2021    12912
2022    13821
Name: count, dtype: int64

# Data Cleaning and Transformation

In [9]:
# Tabulate, per column, how many values are null and what share of the rows that is

data1 = data.copy()

# Count the nulls once, then derive the percentage from that same count
data_null_count = data1.isna().sum()
data_null_percentage = (data_null_count / data1.shape[0]) * 100
null_values = pd.DataFrame({
    'Null Count': data_null_count,
    'Null Percentage': data_null_percentage
})
null_values

Unnamed: 0,Null Count,Null Percentage
_id,0,0.0
ARREST_YEAR,0,0.0
DIVISION,0,0.0
HOOD_158,0,0.0
NEIGHBOURHOOD_158,0,0.0
SEX,0,0.0
AGE_COHORT,0,0.0
AGE_GROUP,0,0.0
CATEGORY,0,0.0
SUBTYPE,0,0.0


###### The dataset is already fairly clean: no column contains null values.


In [12]:
# Export data1 to an Excel workbook, leaving out the index column (index=False).
# NOTE(review): in the cells visible here data1 is an unmodified copy of `data`; the jump in
# execution count suggests intermediate cells were removed — confirm nothing is missing.
# NOTE(review): the relative '../Data' directory presumably has to exist already — verify
# before running on a fresh checkout.
data1.to_excel('../Data/TOR_arrested.xlsx', index=False) 