# imports

In [1]:
from pathlib import Path

import pandas as pd
import requests

# acquire data

In [2]:
# Pull the complaint records from Maryland's open-data portal as JSON.
# Note: the last two parameters of the API URL given in the Google Doc had to be
# stripped, and the limit was also raised (to 3000 here).
api_url = "https://opendata.maryland.gov/api/id/crti-ybyp.json?$select=*&$order=`:id`+ASC&$limit=3000&$offset=0"
# A timeout keeps the cell from hanging indefinitely if the portal never answers.
response = requests.get(api_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error body as data.
response.raise_for_status()
data = response.json()

In [3]:
# Peek at the first two records to see what fields each one carries.
data[:2]

[{'sno': '1',
  'complaint': '232216',
  'complaint_description': 'Odor Complaint',
  'complaint_type': 'Odor',
  'recieved_date': '2024-03-01',
  'incident_date': '2024-03-01',
  'county': 'Dorchester',
  'incident_closed_date': '2024-03-01',
  'incident_status_desc': 'Incident Closed - No further action',
  'incident_zip': '21835'},
 {'sno': '2',
  'complaint': '232215',
  'complaint_description': 'Odor Complaint',
  'complaint_type': 'Odor',
  'recieved_date': '2024-03-01',
  'incident_date': '2024-03-01',
  'county': 'Dorchester',
  'incident_closed_date': '2024-03-01',
  'incident_status_desc': 'Incident Closed - No further action',
  'incident_zip': '21835'}]

In [4]:
# Load the list of JSON records into a DataFrame (one row per record).
df = pd.DataFrame(data=data)

In [5]:
# Display the frame to eyeball its rows and columns.
df

Unnamed: 0,sno,complaint,complaint_description,complaint_type,recieved_date,incident_date,county,incident_closed_date,incident_status_desc,incident_zip
0,1,232216,Odor Complaint,Odor,2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835
1,2,232215,Odor Complaint,Odor,2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835
2,3,232214,Odor Complaint,Odor,2024-02-26,2024-02-26,Dorchester,2024-02-26,Incident Closed - No further action,21835
3,4,232205,Neighbor's fireplace is producing large amount...,"Air, Smoke",2024-02-29,2024-02-29,Carroll,,Referred to Outside Agency,
4,5,232204,Concern of loud boom noise in the Cheverly area,Other,2024-02-28,2024-02-28,Prince George's,2024-03-01,Incident Closed-Managed,
...,...,...,...,...,...,...,...,...,...,...
1502,1503,8058,Boat company sandblasting boats and blowing wh...,Fugitive Dust/Particulate Matter,2021-01-11,2021-01-11,Queen Anne's,2022-03-08,Incident Closed - No further action,
1503,1504,8057,Smoke and ash from neighbor's open burning in ...,Air,2021-01-07,2021-01-07,Anne Arundel,2021-01-07,Incident Closed-No Violation Observed,20751
1504,1505,8056,Smoke from neighbor's wood burning stove.,Air,2021-01-07,2021-01-04,Charles,,Under Investigation,
1505,1506,8051,Concern of materials being burned in fireplace...,Air,2021-01-04,2020-12-28,Frederick,2021-01-15,Incident Closed-No Violation Observed,21770


# clean data

In [6]:
# First things first: keep a pristine snapshot of the frame that we will never manipulate.
backup_df = df.copy(deep=True)

In [9]:
# The columns come in as "object" dtype (meaning strings), so turn the three date
# columns into real datetimes. ("recieved_date" is the spelling used in the data itself.)
for date_column in ("recieved_date", "incident_date", "incident_closed_date"):
    df[date_column] = pd.to_datetime(df[date_column])

# Show the data types for each column to confirm the conversion took effect.
df.dtypes

sno                              object
complaint                        object
complaint_description            object
complaint_type                   object
recieved_date            datetime64[ns]
incident_date            datetime64[ns]
county                           object
incident_closed_date     datetime64[ns]
incident_status_desc             object
incident_zip                     object
dtype: object

# analyze data

In [31]:
# The summaries below only look at complaints received in the last year,
# i.e. strictly after this cutoff date.
year_ago_date = pd.Timestamp("2023-03-09")

received_after_cutoff = df["recieved_date"] > year_ago_date
last_year_df = df.loc[received_after_cutoff].copy()

# How many of those complaints came from each county?
last_year_df.loc[:, "county"].value_counts()

county
Baltimore City        101
Anne Arundel           82
Prince George's        75
Frederick              62
Baltimore              57
Montgomery             35
Cecil                  25
Dorchester             17
Harford                17
Howard                 15
Allegany               15
Washington             14
Charles                12
Carroll                 9
Worcester               7
Not Yet Determined      7
Wicomico                7
St. Mary's              6
Garrett                 5
Caroline                3
Somerset                2
Queen Anne's            2
Kent                    1
Statewide               1
Talbot                  1
Name: count, dtype: int64

In [32]:
# Which complaint types show up most often? Tally them, most frequent first.
last_year_df.loc[:, "complaint_type"].value_counts()

complaint_type
Air                                                                                 163
Odor                                                                                140
Other                                                                               113
Fugitive Dust/Particulate Matter                                                     35
Smoke                                                                                28
Air, Fumes, Odor                                                                     20
Open Burning                                                                         12
Fumes                                                                                11
Air, Odor                                                                            11
Open Burning, Smoke                                                                   7
Air, Odor, Smoke                                                                      4
Air, Fugitive Dus

In [38]:
# Wait, though. A combined type such as "Air, Smoke" should count toward both categories.
# The data's bunched up, so we need to make the table longer; the first step is to split
# the comma-separated string into a list of individual types.
# (Important caveat: keep in mind what each row represents — you could fall into a trap
# with this transformation.)
# The split reads the untouched strings in `df` (matched on the row index) rather than
# this column itself, so re-running the cell rebuilds the same lists instead of
# turning already-split values into NaN.
last_year_df["complaint_type"] = df.loc[last_year_df.index, "complaint_type"].str.split(", ")

Unnamed: 0,sno,complaint,complaint_description,complaint_type,recieved_date,incident_date,county,incident_closed_date,incident_status_desc,incident_zip
0,1,232216,Odor Complaint,Odor,2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835.0
1,2,232215,Odor Complaint,Odor,2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835.0
2,3,232214,Odor Complaint,Odor,2024-02-26,2024-02-26,Dorchester,2024-02-26,Incident Closed - No further action,21835.0
3,4,232205,Neighbor's fireplace is producing large amount...,Air,2024-02-29,2024-02-29,Carroll,NaT,Referred to Outside Agency,
3,4,232205,Neighbor's fireplace is producing large amount...,Smoke,2024-02-29,2024-02-29,Carroll,NaT,Referred to Outside Agency,
4,5,232204,Concern of loud boom noise in the Cheverly area,Other,2024-02-28,2024-02-28,Prince George's,2024-03-01,Incident Closed-Managed,


In [41]:
# Inspect the first five rows; complaint_type now holds lists.
last_year_df.head(5)

Unnamed: 0,sno,complaint,complaint_description,complaint_type,recieved_date,incident_date,county,incident_closed_date,incident_status_desc,incident_zip
0,1,232216,Odor Complaint,[Odor],2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835.0
1,2,232215,Odor Complaint,[Odor],2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835.0
2,3,232214,Odor Complaint,[Odor],2024-02-26,2024-02-26,Dorchester,2024-02-26,Incident Closed - No further action,21835.0
3,4,232205,Neighbor's fireplace is producing large amount...,"[Air, Smoke]",2024-02-29,2024-02-29,Carroll,NaT,Referred to Outside Agency,
4,5,232204,Concern of loud boom noise in the Cheverly area,[Other],2024-02-28,2024-02-28,Prince George's,2024-03-01,Incident Closed-Managed,


In [40]:
# Try "explode" on just those five rows: each list element gets a row of its own.
last_year_df.head(5).explode(column="complaint_type")

Unnamed: 0,sno,complaint,complaint_description,complaint_type,recieved_date,incident_date,county,incident_closed_date,incident_status_desc,incident_zip
0,1,232216,Odor Complaint,Odor,2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835.0
1,2,232215,Odor Complaint,Odor,2024-03-01,2024-03-01,Dorchester,2024-03-01,Incident Closed - No further action,21835.0
2,3,232214,Odor Complaint,Odor,2024-02-26,2024-02-26,Dorchester,2024-02-26,Incident Closed - No further action,21835.0
3,4,232205,Neighbor's fireplace is producing large amount...,Air,2024-02-29,2024-02-29,Carroll,NaT,Referred to Outside Agency,
3,4,232205,Neighbor's fireplace is producing large amount...,Smoke,2024-02-29,2024-02-29,Carroll,NaT,Referred to Outside Agency,
4,5,232204,Concern of loud boom noise in the Cheverly area,Other,2024-02-28,2024-02-28,Prince George's,2024-03-01,Incident Closed-Managed,


In [42]:
# Do the same for the whole last-year frame: one row per complaint/type pair.
complaints_by_type = last_year_df.explode(column="complaint_type", ignore_index=False)

In [43]:
# Count how many times each individual type of complaint was lodged.
# The complaints-by-type dataset stays as it is for now, but note that we _could_ export
# one CSV per complaint type, if for example that would help reporters on different beats
# focus on their relevant parts of the same data.
last_year_complaint_frequency = complaints_by_type.loc[:, "complaint_type"].value_counts()

last_year_complaint_frequency

complaint_type
Air                                   220
Odor                                  186
Other                                 118
Smoke                                  56
Fugitive Dust/Particulate Matter       45
Fumes                                  39
Open Burning                           29
Asbestos Complaint                      3
Air Pollutant Release                   2
Noise Complaint                         2
Non-Tidal Wetlands/Waterway             2
Suspected Operation without Permit      1
ARA AQCP non-regulated entity           1
Name: count, dtype: int64

In [50]:
# The county column contains some NaN values, plus three labels that should be screened
# out as well: "Not Yet Determined", "Outside of Maryland" and "Statewide".
# (last_year_df["county"].isna().sum() and last_year_df["county"].unique() are the quick
# checks for spotting these.)
# Keep only the rows whose county is present and is not one of those labels.
county_values = last_year_df["county"]
has_expected_county = county_values.notnull() & ~county_values.isin(
    ["Not Yet Determined", "Outside of Maryland", "Statewide"]
)
last_year_df = last_year_df[has_expected_county]

In [51]:
# Set aside the rows with a missing or unexpected county value and export them to CSV
# files, so that we have the option of looking through them one by one at some point.
# These are taken from backup_df (the untouched copy of the download), not from the
# last-year subset.
null_county_rows = backup_df[backup_df['county'].isnull()]
wrong_county_rows = backup_df[backup_df['county'].isin(["Not Yet Determined", "Outside of Maryland", "Statewide"])]

# to_csv does not create a missing folder, so make sure the output directory exists first.
Path("exported_data").mkdir(parents=True, exist_ok=True)

# Export each one separately.
null_county_rows.to_csv("exported_data/null_counties.csv")
wrong_county_rows.to_csv("exported_data/wrong_counties.csv")

In [52]:
# Back on the "one row = one complaint" dataset: how many complaints came in per county?
last_year_by_county = last_year_df.loc[:, "county"].value_counts()

last_year_by_county

county
Baltimore City     101
Anne Arundel        82
Prince George's     75
Frederick           62
Baltimore           57
Montgomery          35
Cecil               25
Harford             17
Dorchester          17
Allegany            15
Howard              15
Washington          14
Charles             12
Carroll              9
Worcester            7
Wicomico             7
St. Mary's           6
Garrett              5
Caroline             3
Queen Anne's         2
Somerset             2
Talbot               1
Kent                 1
Name: count, dtype: int64

# export data

In [53]:
# Write out our main dataset as a CSV: no index column, and only the columns
# listed here, in this order.
df.to_csv(
    "exported_data/complaints.csv",
    index=False,
    columns=[
        "complaint",
        "incident_date",
        "county",
        "incident_zip",
        "complaint_type",
        "complaint_description",
        "recieved_date",
        "incident_status_desc",
        "incident_closed_date",
    ],
)


In [54]:
# The two tallies (reports per county and frequency of report type) are effectively
# pivot tables, so export each of them as a CSV too.
last_year_complaint_frequency.to_csv(path_or_buf="exported_data/complaint_frequency.csv")
last_year_by_county.to_csv(path_or_buf="exported_data/county_frequency.csv")