In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime

In [85]:
# dask.dataframe provides the out-of-core, pandas-like dataframe used below.
import dask.dataframe as dd
# NOTE(review): dask.multiprocessing does not appear to be used in the visible
# cells, since the threaded scheduler is selected on the next line — confirm
# whether a later cell relies on it before removing.
import dask.multiprocessing
# Make the threaded scheduler the default for every subsequent .compute() call.
dask.config.set(scheduler='threads')

<dask.config.set at 0x4efd34f28>

In [86]:
# Load the NYC 311 service-request extract as a lazy dask dataframe.
#
# Data provenance: the CSV is the export available at
# https://data.cityofnewyork.us/api/views/erm2-nwe9/rows.csv?accessType=DOWNLOAD
# and is expected to sit next to this notebook under the file name used below.

# Explicit dtype for every column we load, listed in the order the columns
# appear in the file. Declaring all of them (instead of letting dask infer
# dtypes from a sample of the file) avoids dtype-mismatch errors on later
# partitions; low-cardinality text columns are stored as "category" to save memory.
dtypes = {
    "Unique Key": "str",
    "Created Date": "str",
    "Closed Date": "str",
    "Agency": "category",
    "Agency Name": "str",
    "Complaint Type": "category",
    "Location Type": "str",
    "Incident Zip": "str",
    "Incident Address": "str",
    "Street Name": "str",
    "Address Type": "category",
    "City": "category",
    "Landmark": "str",
    "Facility Type": "category",
    "Status": "category",
    "Due Date": "str",
    "Resolution Action Updated Date": "str",
    "BBL": "str",
    "Borough": "str",
}

# Load exactly the columns that have a declared dtype, so the column list and
# the dtype mapping can never drift apart.
usecols = list(dtypes)

# The date columns are intentionally kept as strings here. Their format is
# "%m/%d/%Y %I:%M:%S %p" (e.g. "01/24/2013 12:00:00 AM") should they need to be
# parsed later.

# No try/except around the read: every later cell needs `df`, so a failure
# (e.g. a missing file) must stop the notebook here with a full traceback
# instead of being printed and silently leaving `df` undefined or stale.
start = time.perf_counter()
df = dd.read_csv("311_Service_Requests_from_2010_to_Present.csv", dtype=dtypes,
                 usecols=usecols)
end = time.perf_counter()

# dd.read_csv is lazy: it only inspects the beginning of the file to build the
# task graph, so this timing reflects graph construction, not a full read of
# the data. The actual parsing cost is paid on each .compute().
print("data read in {:.2f} seconds".format(end-start))
    

data read in 0.23 seconds


In [87]:
# List the loaded column names; this is metadata only, so no .compute() is needed.
df.columns

Index(['Unique Key', 'Created Date', 'Closed Date', 'Agency', 'Agency Name',
       'Complaint Type', 'Location Type', 'Incident Zip', 'Incident Address',
       'Street Name', 'Address Type', 'City', 'Landmark', 'Facility Type',
       'Status', 'Due Date', 'Resolution Action Updated Date', 'BBL',
       'Borough'],
      dtype='object')

In [48]:
# Summarize every column (unique / count / top / freq).
# compute() is required because df is a lazy dask dataframe: describe() only
# builds the task graph and compute() actually runs it over the whole file.
# Omit compute() when working with a plain pandas dataframe.

df.describe().compute()

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Location Type,Incident Zip,Incident Address,Street Name,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Action Updated Date,BBL,Borough
unique,21827375,14349985,8971959,31,1933,434,170,2817,1436297,39964,5,2557,7692,4,12,8038965,8584002,782100,6
count,21827375,21827375,21202729,21827375,21827375,21827375,16385326,20536815,17664695,17663315,20504922,20492776,454992,5503452,21827375,8525510,21511429,16768919,21827375
top,44652957,01/24/2013 12:00:00 AM,11/15/2010 12:00:00 AM,HPD,Department of Housing Preservation and Develop...,Noise - Residential,RESIDENTIAL BUILDING,11226,34 ARDEN STREET,BROADWAY,ADDRESS,BROOKLYN,BROADWAY,Precinct,Closed,04/08/2015 10:00:58 AM,11/15/2010 12:00:00 AM,1021740175,BROOKLYN
freq,1,7650,9289,5902472,5902138,1778029,5842533,380180,14412,204614,16651601,6541439,5573,4757733,20762912,4552,11597,14714,6492836


In [None]:
# Shape of the dask dataframe as (rows, columns).
# The row count is lazy and has to be computed; the column count is already
# a plain value.

a = df.shape
lazy_row_count, column_count = a
(lazy_row_count.compute(), column_count)

### Get complaint types for the Department of Housing Preservation and Development

In [88]:
# Keep only the requests handled by the housing department. The selection is
# lazy: nothing is read from disk until a later .compute().
is_housing_department = (
    df['Agency Name'] == 'Department of Housing Preservation and Development'
)
df_housing_department = df[is_housing_department]

In [89]:
# Find the three most frequent complaint types for the housing department,
# timing the whole computation.

start = time.time()

# Build the lazy pipeline step by step: count the rows of each complaint type,
# then keep the three largest counts.
complaints_by_type = df_housing_department.groupby('Complaint Type')['Complaint Type']
top_three_counts = complaints_by_type.agg('count').nlargest(3)

# compute() triggers the actual pass over the CSV.
df_complaint_type = top_three_counts.compute()

end = time.time()

elapsed_seconds = end - start
print("processing time: {:.3f} seconds \n".format(elapsed_seconds))
print(df_complaint_type)

processing time: 141.751 seconds 

Complaint Type
HEAT/HOT WATER    1176337
HEATING            887850
PLUMBING           705868
Name: Complaint Type, dtype: int64


In [90]:
# Inspect the column dtypes. The columns declared as "category" when the
# dataset was read show up as category; the remaining columns are object.

df.dtypes

Unique Key                          object
Created Date                        object
Closed Date                         object
Agency                            category
Agency Name                         object
Complaint Type                    category
Location Type                       object
Incident Zip                        object
Incident Address                    object
Street Name                         object
Address Type                      category
City                              category
Landmark                            object
Facility Type                     category
Status                            category
Due Date                            object
Resolution Action Updated Date      object
BBL                                 object
Borough                             object
dtype: object