## 311 Data API Call

In [37]:
# Import Modules
import requests as rq
import json
from pprint import pprint
from sodapy import Socrata
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

In [38]:
load_dotenv('./.env')
pkey = os.getenv('pkey')
engine = create_engine(f'postgresql://postgres:{pkey}@localhost:5432/etl')

In [82]:
zipcode_df = pd.read_csv('../ryan/Resources/zipcodes.csv').drop('Unnamed: 0', axis=1)
zipcode_df['zipcode'] = zipcode_df['zipcode'].astype('str')

In [89]:
def matchZipID(df, colname):
    ids = []
    for zipcode in df[colname]:
        for i, code in enumerate(zipcode_df.zipcode):
            if zipcode == code:
                ids.append(zipcode_df.iloc[i, 0].round())
#                 print(zipcode_df.iloc[i, 0].round())
    return pd.DataFrame(ids)

In [41]:
# Build the endpoint URL
url = "https://data.cityofnewyork.us/resource/erm2-nwe9.json?"

In [42]:
response = Socrata("data.cityofnewyork.us", None)



In [44]:
# Run a request to endpoint and convert result to pretty-printed json
results = response.get("erm2-nwe9", limit=10)

In [45]:
# Pretty-print Json Results
# pprint(results)

In [46]:
# Filter to only records created between January 1st 2020 and February 12th 2021 and use limit to override default 1000 records
complaints = response.get("erm2-nwe9", select="*", where = "created_date between '2020-01-01T20:00:00' and '2021-02-13T23:59:59'", limit=200000)

In [131]:
# Convert Json to dataframe
complaints_df = pd.DataFrame.from_records(complaints)

In [132]:
# Show length of dataframe to display 1k default was bypassed
len(complaints_df)

200000

## Clean the 311 Dataset

In [133]:
# Display first five records

# Droping 311 calls without a zip code listed
complaints_df = complaints_df[complaints_df['incident_zip'].notna()]
complaints_df['incident_zip'] = matchZipID(complaints_df, 'incident_zip')
complaints_df = complaints_df[complaints_df['incident_zip'].notna()]
complaints_df['incident_zip'] = complaints_df['incident_zip'].apply(round)
complaints_df = complaints_df.rename(columns={'incident_zip': 'zipcodeID'})
complaints_df

Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,zipcodeID,intersection_street_1,intersection_street_2,...,landmark,facility_type,bridge_highway_name,bridge_highway_direction,road_ramp,bridge_highway_segment,taxi_pick_up_location,due_date,vehicle_type,taxi_company_borough
0,45287028,2020-01-01T20:00:00.000,2020-01-02T21:00:00.000,DOT,Department of Transportation,Street Light Condition,Street Light Out,56,SHAKESPEARE AVENUE,CROSS BRONX EXPRESSWAY,...,,,,,,,,,,
1,45286187,2020-01-01T20:00:14.000,2020-01-03T14:18:19.000,HPD,Department of Housing Preservation and Develop...,HEAT/HOT WATER,APARTMENT ONLY,112,,,...,,,,,,,,,,
2,45286499,2020-01-01T20:00:17.000,2020-01-02T06:50:41.000,NYPD,New York City Police Department,Noise - Residential,Loud Music/Party,36,MACOMBS DAM BRDG PEDESTRIAN PATH,HARLEM RIVER DRIVE,...,8 AVENUE,,,,,,,,,
3,45288727,2020-01-01T20:00:37.000,2020-01-04T02:12:51.000,HPD,Department of Housing Preservation and Develop...,HEAT/HOT WATER,APARTMENT ONLY,97,,,...,,,,,,,,,,
4,45286892,2020-01-01T20:01:43.000,2020-01-02T05:38:31.000,NYPD,New York City Police Department,Illegal Parking,Blocked Hydrant,147,71 STREET,72 STREET,...,CALDWELL AVENUE,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191568,45556318,2020-02-06T09:36:44.000,2020-02-06T09:36:44.000,DOHMH,Department of Health and Mental Hygiene,Rodent,Rat Sighting,60,CENTRAL PARK WEST,COLUMBUS AVENUE,...,WEST 71 STREET,,,,,,,,,
191569,45557763,2020-02-06T09:36:58.000,2020-02-07T05:09:26.000,NYPD,New York City Police Department,Blocked Driveway,Partial Access,70,139 AVENUE,141 AVENUE,...,229 STREET,,,,,,,,,
191570,45556667,2020-02-06T09:37:00.000,2020-02-06T09:51:00.000,DEP,Department of Environmental Protection,Water System,No Water (WNW),57,,,...,,,,,,,,,,
191571,45558955,2020-02-06T09:37:00.000,2020-02-06T09:53:00.000,DEP,Department of Environmental Protection,Water System,No Water (WNW),88,,,...,,,,,,,,,,


In [134]:
# Display columns of dataframe
column_length_1 = len(complaints_df.columns)
print(complaints_df.columns)
print(f'-----------------------------')
print(f'The 311 dataset has {column_length_1} columns')

Index(['unique_key', 'created_date', 'closed_date', 'agency', 'agency_name',
       'complaint_type', 'descriptor', 'zipcodeID', 'intersection_street_1',
       'intersection_street_2', 'address_type', 'city', 'status',
       'resolution_description', 'resolution_action_updated_date',
       'community_board', 'borough', 'x_coordinate_state_plane',
       'y_coordinate_state_plane', 'open_data_channel_type',
       'park_facility_name', 'park_borough', 'latitude', 'longitude',
       'location', 'location_type', 'incident_address', 'street_name', 'bbl',
       'cross_street_1', 'cross_street_2', 'landmark', 'facility_type',
       'bridge_highway_name', 'bridge_highway_direction', 'road_ramp',
       'bridge_highway_segment', 'taxi_pick_up_location', 'due_date',
       'vehicle_type', 'taxi_company_borough'],
      dtype='object')
-----------------------------
The 311 dataset has 41 columns


#### Remove columns

* 'x_coordinate_state_plane'
* 'y_coordinate_state_plane'
* 'landmark'
* 'facility_type'
* 'bbl'
* 'due_date'
* 'park_borough'
* 'taxi_pick_up_location'
* 'bridge_highway_segment'
* 'vehicle_type'
* 'taxi_company_borough'
* 'road_ramp'
* 'bridge_highway_direction'
* 'bridge_highway_name'
* 'park_facility_name'


In [135]:
# Determine counts of NaN values in specific columns to determine which columns to drop
#complaints_df["column_name"].isna().sum()

drop_columns_01 = ['location', 'x_coordinate_state_plane', 'status','y_coordinate_state_plane', 'address_type', 'landmark', 'facility_type', 'bbl', 'due_date', 'park_borough', 'taxi_pick_up_location', 'bridge_highway_segment','vehicle_type', 'taxi_company_borough', 'road_ramp', 'bridge_highway_direction', 'bridge_highway_name', 'park_facility_name']
complaints_df = complaints_df.drop(labels=drop_columns_01, axis=1)

column_length_2 = len(complaints_df.columns)
print(f'After removing unnecessary columns the 311 dataset has {column_length_2} columns')

After removing unnecessary columns the 311 dataset has 23 columns


#### Merge columns


In [136]:
# Migrate all data into 'intersection_street_1' from 'cross_street_1 to account for missing values
complaints_df[['intersection_street_1','cross_street_1']] 
complaints_df['intersection_street_1'] = complaints_df['intersection_street_1'].fillna(complaints_df['cross_street_1'])

# Migrate all data into 'intersetction_street_2' from 'cross_street_2' to account for missing values
complaints_df[['intersection_street_2','cross_street_2']] 
complaints_df['intersection_street_2'] = complaints_df['intersection_street_2'].fillna(complaints_df['cross_street_2'])

# Drop duplicate cross-street columns
drop_columns_02 = ['cross_street_1', 'cross_street_2']
complaints_df = complaints_df.drop(labels=drop_columns_02, axis=1)

column_length_3 = len(complaints_df.columns)
print(f'After removing the duplicative columns the 311 dataset has {column_length_3} columns')

After removing the duplicative columns the 311 dataset has 21 columns


#### Rename columns


In [137]:
# Rename columns 'incident_zip' to 'zipcode' and 'open_data_channel_type' to 'channel'
complaints_df.rename(columns={'open_data_channel_type': 'channel', 'intersection_street_1': 'cross_street_1', 'intersection_street_2': 'cross_street_2', 'resolution_action_updated_date': 'updated_date'}, inplace=True)


In [138]:
# Display final column list
complaints_df.columns.tolist()

['unique_key',
 'created_date',
 'closed_date',
 'agency',
 'agency_name',
 'complaint_type',
 'descriptor',
 'zipcodeID',
 'cross_street_1',
 'cross_street_2',
 'city',
 'resolution_description',
 'updated_date',
 'community_board',
 'borough',
 'channel',
 'latitude',
 'longitude',
 'location_type',
 'incident_address',
 'street_name']

#### Account for data input method 

In [139]:
# Convert all string values to lowercase
complaints_df = complaints_df.applymap(lambda s:s.lower() if type(s) == str else s)

#### Drop all rows where zipcode is Nan

In [140]:
# Remove the 7k rows that do not have a zipcode specified
complaints_df = complaints_df.dropna(subset=['zipcodeID'])

rows_after_clean = len(complaints_df)
print(f'After removing all rows with a NaN value under zipcode column, there were {rows_after_clean} rows (complaints) remaining')

After removing all rows with a NaN value under zipcode column, there were 184989 rows (complaints) remaining


#### Save cleaned dataframe to csv

In [141]:
complaints_df.to_csv("clean_311.csv")

#### Uploading to Postgres

In [142]:
complaints_df.to_sql(name='complaints', con=engine, if_exists='replace', index=False)