# Chicago Noise Complaints ETL

### ETL process

#### Extract
* Import csv file into Pandas DataFrame.

#### Transform
* Drop columns not required.
* Rename a column.
* Add new columns and calculate their values.
* Drop columns no longer required.
* Drop complaints dated earlier than 1/1/16 (keeping only complaints within approximately last 5 years).
* Get zip code, full address, latitude, and longitude from Google Maps geocoding API.
* Drop columns no longer needed.
* Group data by zip code and get count of noise complaints for each zip code.

#### Load
* Export data to PostgreSQL database.

In [1]:
# Import dependencies.
import pandas as pd
import requests
import datetime as dt
import json

# Import credentials from file.

# Google maps geocoding API key.
from credentials import gmaps_key

# Postgres database credentials.
from credentials import pgadmin_username
from credentials import pgadmin_password

## Extract

### Import cvs file into Pandas DataFrame.

In [2]:
# Read in csv file and store as Pandas dataframe.
noise_complaints_df = pd.read_csv("../Resources/Noise_Complaints_chicago.csv")

noise_complaints_df.head(1)

Unnamed: 0,COMPLAINT ID,COMPLAINT TYPE,STREET NUMBER FROM,STREET NUMBER TO,DIRECTION,STREET NAME,STREET TYPE,INSPECTOR,COMPLAINT DATE,COMPLAINT DETAIL,INSPECTION LOG,DATA SOURCE,Modified Date
0,DOECOMP2,Noise Complaint,1,,S,STATE,ST,10,08/23/1993,STREET MUSIC HEAVY NOISE POLLUTION REFERRED TO...,MORE INFORMATION MAY BE AVAILABLE IN THE CDPH ...,HISTORIC DEPT. OF ENVIRONMENT,01/01/2012


## Transform

### Drop columns not required.

In [3]:
# Drop COMPLAINT ID, COMPLAINT TYPE, STREET NUMBER TO, INSPECTOR, INSPECTION LOG, DATA SOURCE, and Modified Date columns.
(noise_complaints_df.drop(columns=['COMPLAINT ID', 'COMPLAINT TYPE', 'STREET NUMBER TO', 'INSPECTOR', 'COMPLAINT DETAIL', 
                                   'INSPECTION LOG', 'DATA SOURCE', 'Modified Date'], inplace=True))

noise_complaints_df.head(1)

Unnamed: 0,STREET NUMBER FROM,DIRECTION,STREET NAME,STREET TYPE,COMPLAINT DATE
0,1,S,STATE,ST,08/23/1993


### Rename a column.
#### Column names that may be exported to PostgreSQL database in later ETL step are formatted in lower-case and with underscores to comply with PostgreSQL import requirements.

In [4]:
# Rename STREET NUMBER FROM column to simply STREET NUMBER for readability as this column will be used to represent 
# point locations rather than ranges.

noise_complaints_df.rename(columns = {'STREET NUMBER FROM':'street_number'}, inplace = True)

noise_complaints_df.head(1)

Unnamed: 0,street_number,DIRECTION,STREET NAME,STREET TYPE,COMPLAINT DATE
0,1,S,STATE,ST,08/23/1993


### Add new columns and calculate their values.

In [5]:
# Add columns to store formatted date, zip code, full address, latitude, and longitude.

# Add DATE column derived from COMPLAINT DATE column. This column holds dates in Pandas DateTime format and 
# will be used to select noise complaints within recent years.
noise_complaints_df["date"] = pd.to_datetime(noise_complaints_df["COMPLAINT DATE"])

# Add ZIP CODE column. Will be populated with results of API call and will be used to join PostgresSQL tables.
noise_complaints_df["zip_code"] = ""

# Add CITY column needed for Google Maps API call.
noise_complaints_df["city"] = "Chicago"

# Add STATE column needed for Google Maps API call.
noise_complaints_df["state"] = "IL"

# Add partial address column and compute contents from existing columns. This column will be sent to Google Maps API call.
noise_complaints_df["partial_address"] = (noise_complaints_df["street_number"].map(str)+" "+noise_complaints_df["DIRECTION"]
                                          +" "+noise_complaints_df['STREET NAME']+" "+noise_complaints_df['STREET TYPE']
                                          +", "+noise_complaints_df['city']+", "+noise_complaints_df['state'])

# Add NUM COMPLAINTS column to be filled in later groupby/aggregation step.
noise_complaints_df["num_complaints"] = ""

noise_complaints_df.head(1)

Unnamed: 0,street_number,DIRECTION,STREET NAME,STREET TYPE,COMPLAINT DATE,date,zip_code,city,state,partial_address,num_complaints
0,1,S,STATE,ST,08/23/1993,1993-08-23,,Chicago,IL,"1 S STATE ST, Chicago, IL",


### Drop complaint date and address component columns as they are no longer needed.

In [7]:
# Drop COMPLAINT DATE, STREET NUMBER, DIRECTION, STREET NAME, STREET TYPE, CITY, and STATE columns.
(noise_complaints_df.drop(columns=['COMPLAINT DATE', 'street_number', 'DIRECTION', 'STREET NAME', 'STREET TYPE', 'city', 
                                   'state'], inplace=True))

noise_complaints_df.head(0)

Unnamed: 0,date,zip_code,partial_address,num_complaints


### Drop complaints dated earlier than 1/1/2016, keeping only complaints within approximately the last 5 years.

In [23]:
# Format 1/1/2016 as datetime object so it can be used in date comparison step.
date = dt.datetime.strptime('1/1/2016', '%m/%d/%Y')

# Select rows in which FORMATTED DATE is >= 1/1/2016. Store results in new DataFrame.
last_5_years_df = noise_complaints_df.loc[noise_complaints_df["date"] >= date]

# Sort ascending and print to check that prior dates were successfully removed.
last_5_years_df.sort_values("date")

# View row count to compare with row count following geocoding step.
print(last_5_years_df)

           date zip_code                     partial_address num_complaints
195  2016-01-25                1456 N DAYTON ST, Chicago, IL               
196  2016-03-10                 19 S WABASH AVE, Chicago, IL               
197  2016-05-26             1500 N CLYBOURN AVE, Chicago, IL               
198  2016-06-16                640 N WABASH AVE, Chicago, IL               
199  2016-06-28              1035 N DEARBORN ST, Chicago, IL               
...         ...      ...                                 ...            ...
9235 2020-01-02               3631 N HALSTED ST, Chicago, IL               
9236 2020-07-20           5100 N RAVENSWOOD AVE, Chicago, IL               
9237 2020-11-16            4335 W KAMERLING AVE, Chicago, IL               
9238 2020-02-04                1743 N MOZART ST, Chicago, IL               
9239 2020-03-12                  1756 W 18TH ST, Chicago, IL               

[1152 rows x 4 columns]


### Get zip code from Google Maps geocoding API.

In [31]:
test_df = last_5_years_df.head(50)
test_df

Unnamed: 0,date,zip_code,partial_address,num_complaints
195,2016-01-25,60642.0,"1456 N DAYTON ST, Chicago, IL",
196,2016-03-10,60603.0,"19 S WABASH AVE, Chicago, IL",
197,2016-05-26,60610.0,"1500 N CLYBOURN AVE, Chicago, IL",
198,2016-06-16,60611.0,"640 N WABASH AVE, Chicago, IL",
199,2016-06-28,60610.0,"1035 N DEARBORN ST, Chicago, IL",
200,2016-07-06,60610.0,"1340 N ASTOR ST, Chicago, IL",
201,2016-07-25,60633.0,"12600 S TORRENCE AVE, Chicago, IL",
202,2016-08-03,60640.0,"1470 W RASCHER AVE, Chicago, IL",
203,2016-09-13,60612.0,"2201 W SUPERIOR ST, Chicago, IL",
204,2016-10-31,60611.0,"105 E DELAWARE PL, Chicago, IL",


In [32]:
for index, row in test_df.iterrows():
    
    try:
        
        # Send partial addresses through Google Maps geocoding api call.
        address = row["partial_address"]
        url = ('https://maps.googleapis.com/maps/api/geocode/json?address={0}&key={1}').format(address, gmaps_key)
        response = requests.get(url).json()
        
        # Extract zip code from json.
        zip_code = response["results"][0]["address_components"][7]["short_name"]
        
        # Store zip code in appropriate DataFrame cell.
        test_df.at[index, "zip_code"] = zip_code
    
    # Index errors occur in API call when 
    except(IndexError): pass
    
print(test_df)

          date zip_code                     partial_address num_complaints
195 2016-01-25    60642       1456 N DAYTON ST, Chicago, IL               
196 2016-03-10    60603        19 S WABASH AVE, Chicago, IL               
197 2016-05-26    60610    1500 N CLYBOURN AVE, Chicago, IL               
198 2016-06-16    60611       640 N WABASH AVE, Chicago, IL               
199 2016-06-28    60610     1035 N DEARBORN ST, Chicago, IL               
200 2016-07-06    60610        1340 N ASTOR ST, Chicago, IL               
201 2016-07-25    60633   12600 S TORRENCE AVE, Chicago, IL               
202 2016-08-03    60640     1470 W RASCHER AVE, Chicago, IL               
203 2016-09-13    60612     2201 W SUPERIOR ST, Chicago, IL               
204 2016-10-31    60611      105 E DELAWARE PL, Chicago, IL               
205 2016-11-29    60609         1133 W 35TH ST, Chicago, IL               
206 2017-02-14    60611     626 N MICHIGAN AVE, Chicago, IL               
207 2017-03-09    60622  

In [33]:
# Loop over last_5_years_df and make calls to the Google Maps geocoding API to retreive zip code.

for index, row in last_5_years_df.iterrows():
    
    try:
        
        # Send partial addresses through Google Maps geocoding api call.
        address = row["partial_address"]
        url = ('https://maps.googleapis.com/maps/api/geocode/json?address={0}&key={1}').format(address, gmaps_key)
        response = requests.get(url).json()
        
        # Extract zip code from API response.
        zip_code = response["results"][0]["address_components"][7]["short_name"]
        
        # Store zip code in appropriate DataFrame cell.
        last_5_years_df.at[index, "zip_code"] = zip_code

    # Index errors occur in the parsing step when the API returns an empty json. This occurs when a partial_address
    # sent in the API call is either blank or has typos. It also occurs when a ligitimate partial_address is sent
    # but doesn't exist in the Google Maps database.Excepting these errors results in 30 blank rows in the DataFrame 
    # which we remove in a later step.
    except(IndexError): pass

In [35]:
# Print DataFrame after geocoding step to ensure that zip code was filled in and all rows are still present.
print(last_5_years_df)

           date zip_code                     partial_address num_complaints
195  2016-01-25    60642       1456 N DAYTON ST, Chicago, IL               
196  2016-03-10    60603        19 S WABASH AVE, Chicago, IL               
197  2016-05-26    60610    1500 N CLYBOURN AVE, Chicago, IL               
198  2016-06-16    60611       640 N WABASH AVE, Chicago, IL               
199  2016-06-28    60610     1035 N DEARBORN ST, Chicago, IL               
...         ...      ...                                 ...            ...
9235 2020-01-02    60613      3631 N HALSTED ST, Chicago, IL               
9236 2020-07-20    60640  5100 N RAVENSWOOD AVE, Chicago, IL               
9237 2020-11-16    60651   4335 W KAMERLING AVE, Chicago, IL               
9238 2020-02-04    60647       1743 N MOZART ST, Chicago, IL               
9239 2020-03-12    60608         1756 W 18TH ST, Chicago, IL               

[1152 rows x 4 columns]


### Drop columns no longer needed.

In [36]:
# Drop all columns except zip_code and num_complaints.
last_5_years_df = last_5_years_df[['zip_code', 'num_complaints']]

last_5_years_df.head()

Unnamed: 0,zip_code,num_complaints
195,60642,
196,60603,
197,60610,
198,60611,
199,60610,


### Aggregate data to zip codes.

In [37]:
# Group data by zip code and compute number of complaints in each zip code.
gb_df = last_5_years_df.groupby('zip_code').agg({"num_complaints": "count"}).reset_index()

gb_df.head()

Unnamed: 0,zip_code,num_complaints
0,,30
1,60601.0,7
2,60602.0,4
3,60603.0,4
4,60604.0,1


### Check for typos and missing data

In [38]:
# Sort zip_df to check for typos and missing data. 3 errors are present: a blank, '10301', and 'US'.
gb_df.sort_values("zip_code")

gb_df

Unnamed: 0,zip_code,num_complaints
0,,30
1,60601,7
2,60602,4
3,60603,4
4,60604,1
5,60605,54
6,60606,5
7,60607,39
8,60608,33
9,60609,31


### Remove 2 rows found to have errors.

In [41]:
# Filter out any ZIP CODE entries that don't have 5 digits to remove the blank and the 'US'.
final_df = gb_df[gb_df['zip_code'].str.len() == 5]

# View data to make sure all typos and missing data are cleared. After removing blanks and 'US', 58 zip codes remain.
final_df

Unnamed: 0,zip_code,num_complaints
1,60601,7
2,60602,4
3,60603,4
4,60604,1
5,60605,54
6,60606,5
7,60607,39
8,60608,33
9,60609,31
10,60610,59


## Load

### Export transformed Pandas DataFrame to csv file.

In [50]:
final_df.to_csv("Noise_complaint_counts_by_zip.csv", index = False)