# Import Libraries

In [18]:
# !pip install humanfriendly
# !pip install datamart-geo
# !pip install datamart-profiler
# !pip install openclean
# !pip install openclean-geo

# General
import os
import gzip
import time
import datetime
import dateutil
import pandas as pd
import re
import humanfriendly

# Data Cleaning
import datamart_geo
import datamart_profiler
import openclean
from openclean.data.source.socrata import Socrata
from openclean import pipeline
from openclean.profiling.column import DefaultColumnProfiler
from openclean.function.eval.base import Eval, Col
from openclean.function.eval.logic import And, Or
from openclean.function.eval.null import IsEmpty
from openclean.function.value.null import is_empty
from openclean_geo.address.usstreet import StandardizeUSStreetName
from openclean.cluster.key import KeyCollision
from openclean.function.value.key.fingerprint import Fingerprint

# Drive
# from google.colab import drive
# drive.mount('/content/gdrive')
# os.chdir("/content/gdrive/MyDrive/Colab Notebooks/Big Data")

# Helper Functions

In [17]:
def is_date(hNo, fuzzy=False):
    """Report whether ``hNo`` can be parsed as a date/time by ``dateutil``.

    Parameters
    ----------
    hNo : str
        Text to test.
    fuzzy : bool, optional
        Forwarded unchanged to ``dateutil.parser.parse``.

    Returns
    -------
    bool
        True when parsing succeeds, False otherwise.
    """
    # Import the submodule explicitly: a bare ``import dateutil`` does not
    # guarantee that ``dateutil.parser`` has been loaded.
    from dateutil import parser as date_parser

    try:
        date_parser.parse(hNo, fuzzy=fuzzy)
    except (ValueError, OverflowError, TypeError):
        # ValueError: text is not a date. OverflowError: a numeric component is
        # too large. TypeError: the input is not a string or character stream.
        return False
    return True


def clean_busType(busType):
    """Return ``busType`` with every apostrophe (') removed."""
    # Splitting on the apostrophe and re-joining drops all of its occurrences.
    return "".join(busType.split("'"))


def clean_sfn_sln(sflName):
    """Normalize a combined "first & last name" value.

    Surrounding punctuation noise and whitespace are trimmed, a leading
    "MR." / "MR " honorific is removed, and the result is upper-cased.

    Parameters
    ----------
    sflName : str
        Raw name text.

    Returns
    -------
    str
        The cleaned, upper-cased name (may be empty).
    """
    # All noise characters are trimmed in one pass so the result does not
    # depend on the order in which individual characters are stripped.
    junk = "_`\\{.-,%*^ \t\r\n"
    cleaned = sflName.lstrip(junk)
    # ``str.strip("MR.")`` removes any of the characters M, R and "." from both
    # ends (turning "MARY" into "ARY"); remove the honorific as a real prefix.
    cleaned = re.sub(r"^MR(?:\.\s*|\s+)", "", cleaned, flags=re.IGNORECASE)
    return cleaned.strip(junk).upper()
    

def clean_bName(bName):
    """Normalize a business name.

    Surrounding punctuation noise, whitespace and replacement-character
    residue are trimmed, a leading "MR." / "MR " honorific is removed, and the
    result is upper-cased. The spelling "N.A" is mapped to the "N/A"
    placeholder.

    Parameters
    ----------
    bName : str
        Raw business name.

    Returns
    -------
    str
        The cleaned, upper-cased name (may be empty).
    """
    # Noise trimmed from both ends in one pass. The escapes at the end are the
    # three mojibake characters the original literal contained
    # (U+00EF U+00BF U+00BD) plus U+FFFD itself.
    junk = "_`\\{.-,%*^/ \t\r\n\u00ef\u00bf\u00bd\ufffd"
    cleaned = bName.lstrip(junk)
    # ``str.strip("MR.")`` removes any of the characters M, R and "." from both
    # ends (turning "MARRIOTT" into "IOTT"); remove the honorific as a real
    # prefix.
    cleaned = re.sub(r"^MR(?:\.\s*|\s+)", "", cleaned, flags=re.IGNORECASE)
    # Trim before upper-casing so the lower-case mojibake characters still match.
    cleaned = cleaned.strip(junk).upper()
    if cleaned == "N.A":
        cleaned = "N/A"
    return cleaned
    
    
def clean_fname(fname):
    """Extract and normalize a first name.

    Leading noise and a leading "MR." / "MR " honorific are removed, the text
    is cut at the first space, hyphen, backslash or underscore, remaining
    edge noise is trimmed and the result is upper-cased.

    Parameters
    ----------
    fname : str
        Raw first-name text.

    Returns
    -------
    str
        The cleaned, upper-cased first token (may be empty).
    """
    junk = "_`\\{.-, \t\r\n"
    # Trim leading noise *before* splitting; otherwise a value such as "_JOHN"
    # is cut at its first character and collapses to an empty string.
    name = fname.lstrip(junk)
    # ``str.strip("MR.")`` removes any of the characters M, R and "." from both
    # ends (turning "MARK" into "A"); remove the honorific as a real prefix.
    name = re.sub(r"^MR(?:\.\s*|\s+)", "", name, flags=re.IGNORECASE)
    name = name.lstrip(junk)
    # Keep only the text before the first space, hyphen, backslash or underscore.
    first_token = re.split(r"[ \-\\_]", name, maxsplit=1)[0]
    return first_token.strip(junk).upper()


def clean_lname(lname):
    """Return the upper-cased leading token of a last name.

    The value is cut, in turn, at the first space, hyphen, backslash and
    underscore; then a fixed sequence of noise characters is stripped from
    both ends, one character class at a time and in this exact order.
    """
    for delimiter in (" ", "-", "\\", "_"):
        lname = lname.partition(delimiter)[0]
    for noise in ("_", "`", "\\", "{", ".", "-", ","):
        lname = lname.strip(noise)
    return lname.upper()


def clean_hNo(hNo):
    """Normalize a house-number value.

    Leading ``.``, ``/``, ``\\``, ``,`` and backtick characters are removed.
    Values made only of digits are returned as they are. Anything that is
    purely alphabetic, contains a space, equals "P.O." or is recognised by
    ``is_date`` becomes "N/A".

    Parameters
    ----------
    hNo : str
        Raw house-number text.

    Returns
    -------
    str
        The cleaned value, or "N/A".
    """
    # One lstrip over the whole character set, so the result does not depend
    # on the order of the individual characters.
    hNo = hNo.lstrip(".\\/,`")
    # A plain number must not go through the date check: dateutil accepts bare
    # integers such as "12" or "123" as a day or a year, which would turn
    # ordinary house numbers into "N/A".
    if hNo.isdigit():
        return hNo
    if hNo.isalpha() or " " in hNo or hNo == "P.O.":
        return "N/A"
    # NOTE(review): is_date is permissive; hyphenated house numbers may still
    # be classified as dates — confirm against the data.
    if is_date(hNo):
        return "N/A"
    return hNo


def clean_hZip(hZip):
    """Normalize a US ZIP code.

    Parameters
    ----------
    hZip : str
        Raw ZIP text, or the "N/A" placeholder.

    Returns
    -------
    str
        * "N/A" when the input is the placeholder or is not a usable ZIP;
        * ``ddddd-dddd`` when the input already has that form or is 9 digits;
        * a 5-digit string when the input is 1-5 digits (left-padded with
          zeros).
    """
    # The original wrote ``hZip("N/A")``, which calls the string and raises
    # TypeError for every input.
    if hZip == "N/A":
        return hZip
    candidate = hZip.strip("'_`\\{.-, \t\r\n")
    # Already formatted ZIP+4: keep it.
    if re.fullmatch(r"[0-9]{5}-[0-9]{4}", candidate):
        return candidate
    # Anything else must be digits only (the original tested ``isalnum()``,
    # which rejected every all-digit ZIP).
    if not re.fullmatch(r"[0-9]+", candidate):
        return "N/A"
    if len(candidate) <= 5:
        # Restore leading zeros that were lost upstream.
        return candidate.zfill(5)
    if len(candidate) == 9:
        return candidate[:5] + "-" + candidate[5:]
    return "N/A"


def clean_phoneNo(pNo):
    """Validate a phone-number value.

    Parameters
    ----------
    pNo : str
        Raw phone text.

    Returns
    -------
    str
        ``pNo`` unchanged when it is exactly ten digits, or when it is a longer
        or punctuated value that passes the checks below; "N/A" when it has
        fewer than ten non-space characters, is purely alphanumeric without
        being exactly ten digits, or contains a backtick.
    """
    non_space_length = len(pNo) - pNo.count(" ")
    if non_space_length < 10:
        return "N/A"
    # Accept the ten-character shortcut only for ten *digits*; the previous
    # ``len(pNo) == 10`` test also let through values such as "ABCDEFGHIJ".
    if re.fullmatch(r"[0-9]{10}", pNo):
        return pNo
    # ``isalnum()`` already covers the purely alphabetic case.
    if pNo.isalnum() or "`" in pNo:
        return "N/A"
    return pNo


def clean_nta(nta):
    """Strip noise characters from both ends of an NTA value.

    The character classes are applied one after another, in this exact order:
    period, slash, backslash, comma, backtick.
    """
    for edge_chars in ('.', '//', '\\', ',', '`'):
        nta = nta.strip(edge_chars)
    return nta


def clean_number(num):
    """Drop leading zeros from ``num``; return "N/A" when nothing is left."""
    trimmed = num.lstrip('0')
    return trimmed if trimmed else "N/A"


def CleanOwner(busType, bName, fName, lName, hNo, hStreet, hCity, hState, hZip, pNo):
    """Fill in missing owner fields.

    Returns a 10-tuple in the same order as the parameters. When every field
    is empty (per ``is_empty``) the tuple holds ten ``None`` values; otherwise
    each empty field is replaced by "N/A" and the others are passed through
    unchanged. The per-field cleaners (clean_busType, clean_bName, ...) are
    not applied here.
    """
    fields = (busType, bName, fName, lName, hNo, hStreet, hCity, hState, hZip, pNo)
    missing = [is_empty(value) for value in fields]
    if all(missing):
        return (None,) * len(fields)
    return tuple("N/A" if blank else value for value, blank in zip(fields, missing))


def CleanPermitteeSuperindentent(fName, lName, bName, pNo, lType, lNo, actAsSup, sflName, sbName):
    """Fill in missing permittee and superintendent fields.

    The six permittee fields and the two superintendent fields are treated as
    two independent groups: a group whose fields are all empty (per
    ``is_empty``) becomes all ``None``; otherwise each empty field in it
    becomes "N/A". ``actAsSup`` is forced to "N" unless it is exactly "Y".
    The per-field cleaners are not applied here.

    Returns a 9-tuple in the same order as the parameters.
    """
    def _fill_group(values):
        # All-empty group -> Nones; otherwise only the gaps are filled.
        blanks = [is_empty(value) for value in values]
        if all(blanks):
            return [None] * len(values)
        return ["N/A" if blank else value for value, blank in zip(values, blanks)]

    fName, lName, bName, pNo, lType, lNo = _fill_group((fName, lName, bName, pNo, lType, lNo))
    sflName, sbName = _fill_group((sflName, sbName))
    actAsSup = "Y" if actAsSup == "Y" else "N"
    return fName, lName, bName, pNo, lType, lNo, actAsSup, sflName, sbName


def CleanGeo(dob, latitude, longitude, council_dist, census_tract, bbl, nta):
    """Replace empty geographic fields with the "N/A" placeholder.

    Every argument that is empty (per ``is_empty``) becomes "N/A"; the others
    are passed through unchanged. ``bbl`` and ``nta`` previously received "NA",
    which did not match the "N/A" placeholder used for every other field.
    The per-field cleaners (clean_number, clean_nta) are not applied here.

    Returns a 7-tuple in the same order as the parameters.
    """
    fields = (dob, latitude, longitude, council_dist, census_tract, bbl, nta)
    return tuple("N/A" if is_empty(value) else value for value in fields)


def cleanTime(dt):
    """Reduce an ISO-style timestamp to its date part.

    Parameters
    ----------
    dt : str
        Raw date/time text.

    Returns
    -------
    str
        * "N/A" when ``dt`` is empty (per ``is_empty``);
        * ``YYYY-MM-DD`` when ``dt`` matches ``YYYY-MM-DDTHH:MM:SS`` with or
          without fractional seconds;
        * ``dt`` unchanged in every other case.
    """
    if is_empty(dt):
        return "N/A"
    if 'T' not in dt:
        return dt
    # A "T" alone does not prove the value is an ISO timestamp. Try the known
    # layouts and pass anything else through untouched instead of raising and
    # aborting the whole stream.
    for layout in ('%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S.%f'):
        try:
            return str(datetime.datetime.strptime(dt, layout).date())
        except ValueError:
            continue
    return dt


def cleanSiteFill(sf):
    """Map an empty (per ``is_empty``) or "NONE" site-fill value to "N/A"."""
    return "N/A" if (is_empty(sf) or sf == "NONE") else sf

# Download Dataset

In [None]:
dataSet = Socrata().dataset("bty7-2jhb")

# Local gzip-compressed file the dataset is written to.
dataFile = "./bty7-2jhb.tsv.gz"


# Download file only if it does not exist already.
if not os.path.isfile(dataFile):
    # Write to a temporary name and rename on success, so an interrupted
    # download never leaves a truncated file that later runs would mistake
    # for a complete one.
    partialFile = dataFile + ".part"
    try:
        with gzip.open(partialFile, "wb") as f:
            print("Downloading... ", end="")
            dataSet.write(f)
        os.replace(partialFile, dataFile)
    finally:
        # Only present here if the download failed part-way.
        if os.path.isfile(partialFile):
            os.remove(partialFile)
print("Done!")


fSize = humanfriendly.format_size(os.stat(dataFile).st_size)
print("Using '{}' in file {} of size {}".format(dataSet.name, dataFile, fSize))

Done!
Using 'Historical DOB Permit Issuance' in file ./bty7-2jhb.tsv.gz of size 321.34 MB


# Set the data to stream

In [None]:
# Open the downloaded file through openclean's `pipeline.stream`. `ds` is
# rebound by the select/delete/update steps in the cells further down.
ds = pipeline.stream(dataFile)
# Uncomment the following line to see a few rows of the data
# ds.head()

# Profile the columns

In [None]:
# Profile the stream with DefaultColumnProfiler. `dp` is indexed per column
# further down to read each column's "column" name and "stats" counters.
dp = ds.profile(default_profiler=DefaultColumnProfiler)
# Last expression of the cell, so the notebook renders the statistics table.
dp.stats()



Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,2428526,0,5,2.058862e-06,2.045161
BIN,2428526,0,300024,0.1235416,16.764061
Number,2428526,4,28639,0.01179277,11.933581
Street,2428526,4,20201,0.008318228,11.223448
Job #,2428526,0,1110544,0.4572914,19.723021
Job doc. #,2428526,0,12,4.941269e-06,0.496506
Job Type,2428526,0,6,2.470634e-06,1.855338
Self_Cert,2428526,1527841,1,1.110266e-06,0.0
Block,2428526,498,13625,0.00561155,12.54555
Lot,2428526,507,1718,0.0007075727,6.659702


# Find columns that are largely empty

In [None]:
# Checking percentage of required columns that are empty.
# `cols` collects every profiled column name, in order, for the later cell
# that drops the largely empty columns.
cols = []

# Iterate over however many columns were profiled instead of a hard-coded 60.
for i in range(len(dp)):
    stats = dp[i]["stats"]
    print(dp[i]["column"], " is ", (stats["emptyValueCount"]/stats["totalValueCount"])*100, "%", " empty", sep="")
    cols.append(dp[i]["column"])

BOROUGH is 0.0% empty
BIN is 0.0% empty
Number is 0.0001647089633794326% empty
Street is 0.0001647089633794326% empty
Job # is 0.0% empty
Job doc. # is 0.0% empty
Job Type is 0.0% empty
Self_Cert is 62.91227682964893% empty
Block is 0.02050626594073936% empty
Lot is 0.020876861108343088% empty
Community Board is 0.11743749088953545% empty
Postcode is 0.043236102887101065% empty
Bldg Type is 2.235059455818056% empty
Residential is 68.17938123783728% empty
Special District 1 is 92.31665627627623% empty
Special District 2 is 99.89133326141042% empty
Work Type is 18.693190849099413% empty
Permit Status is 0.3666833297234619% empty
Filing Status is 0.0% empty
Permit Type is 4.117724084485815e-05% empty
Permit Sequence # is 0.0% empty
Permit Subtype is 41.60107818487428% empty
Oil Gas is 98.72210550762067% empty
Site Fill is 6.905546821405248% empty
Filing Date is 0.0% empty
Issuance Date is 0.0% empty
Expiration Date is 0.0001647089633794326% empty
Job Start Date is 0.00012353172253457446% 

# Drop the columns that are largely empty
We observe that the columns Self_Cert, Residential, Special District 1, Special District 2, Oil Gas, Permittee's Other Title, HIC License, Site Safety Mgr's First Name, Site Safety Mgr's Last Name, Site Safety Mgr Business Name and Non-Profit are largely empty, so we drop them.

In [None]:
# Columns judged too sparse to keep (see the emptiness percentages printed by
# the profiling cell).
garbageCols = ["Self_Cert",
               "Residential",
               "Special District 1",
               "Special District 2",
               "Oil Gas",
               "Permittee's Other Title",
               "HIC License",
               "Site Safety Mgr's First Name",
               "Site Safety Mgr's Last Name",
               "Site Safety Mgr Business Name",
               "Non-Profit"]

# Keep the remaining column names in their original order, then narrow the
# stream to just those columns. Both `cols` and `ds` are rebound here.
cols = [col for col in cols if col not in garbageCols]
ds = ds.select(cols)

# Cleaning the columns that have data related to Owner, Permittee and Superintendent

In [14]:
# Drop every row in which at least one of these permittee / permit-status
# columns is empty (the `Or(IsEmpty(...), ...)` predicate).
ds = ds.delete(Or(IsEmpty("Permittee's First Name"), IsEmpty("Permittee's Last Name"), IsEmpty("Permittee's Business Name"), IsEmpty("Permittee's Phone #"), IsEmpty("Permit Status")))


# Owner columns: the ten values of each row are passed to CleanOwner in this order.
# NOTE(review): the City/State/Zip names use a typographic apostrophe (’) while
# the others use a straight one ('). Presumably this mirrors the dataset's
# headers; verify before normalising.
ds = ds.update(
    ["Owner's Business Type",
     "Owner's Business Name",
     "Owner's First Name",
     "Owner's Last Name",
     "Owner's House #",
     "Owner's House Street Name",
     "Owner’s House City",
     "Owner’s House State",
     "Owner’s House Zip Code",
     "Owner's Phone #"],
     lambda a, b, c, d, e, f, g, h, i, j: CleanOwner(a, b, c, d, e, f, g, h, i, j))


# Permittee and superintendent columns: passed to CleanPermitteeSuperindentent
# in this order.
ds = ds.update(
    ["Permittee's First Name",
     "Permittee's Last Name",
     "Permittee's Business Name",
     "Permittee's Phone #",
     "Permittee's License Type",
     "Permittee's License #",
     "Act as Superintendent",
     "Superintendent First & Last Name",
     "Superintendent Business Name"],
     lambda a, b, c, d, e, f, g, h, i: CleanPermitteeSuperindentent(a, b, c, d, e, f, g, h, i))


# Run-date and geographic columns: passed to CleanGeo in this order.
ds = ds.update(
    ["DOBRunDate",
     "Latitude",
     "Longitude",
     "Council District",
     "Census Tract",
     "BBL",
     "NTA"],
     lambda a, b, c, d, e, f, g: CleanGeo(a, b, c, d, e, f, g))


# Each date column is normalised on its own with cleanTime.
for dt in ["Filing Date", "Issuance Date", "Expiration Date", "Job Start Date"]: ds = ds.update([str(dt)], lambda a: cleanTime(a))


# Empty or "NONE" site-fill values become "N/A" (see cleanSiteFill).
ds = ds.update("Site Fill", lambda a: cleanSiteFill(a))

# Converting the Stream to DataFrame for further Cleaning

In [None]:
# Converting to dataframe object: turn the stream pipeline built above into
# `df_full` for the remaining DataFrame-based cleaning steps.
# NOTE(review): presumably this loads every remaining row into memory — confirm
# it fits before running on the full file.
df_full = ds.to_df()

In [16]:
# Removing outliers and empty values
cols = df_full.columns

print(df_full.shape)

# Rows are dropped outright when one of these columns has no value.
requiredCols = ['Street', 'Number', 'Block', 'Lot']

# Only the first 11 columns are scanned for blank strings (same window as before).
for col in cols[:11]:
    null_count = (df_full[col] == '').sum()
    if null_count != 0:
        # Assign the result back instead of calling
        # `df_full[col].replace(..., inplace=True)`: an inplace method on a
        # column selection acts on a temporary object and is not guaranteed to
        # update `df_full` (it never does under pandas Copy-on-Write).
        df_full[col] = df_full[col].replace("", "N/A")
        # Remove empty row entries corresponding to columns Street, Number, Block, Lot
        if col in requiredCols:
            # `.copy()` keeps later column assignments from targeting a slice.
            df_full = df_full.loc[df_full[col] != "N/A"].copy()

print(df_full.shape)

# Removing outlier for postcode
postcode_df = df_full['Postcode'].sort_values(ascending=True)
print(postcode_df)
# The printed Series is object-dtype and sorts together with "N/A" strings, so
# the zero postcodes may be stored as the text "0" rather than a number.
# Match both spellings (numeric 0 also covers 0.0).
df_full['Postcode'] = df_full['Postcode'].replace([0, "0"], "N/A")

(2428017, 49)
(2428017, 49)
1793647      0
2268795      0
2224262      0
2139745      0
2137997      0
          ... 
2388366    N/A
1939902    N/A
40335      N/A
1926429    N/A
1158903    N/A
Name: Postcode, Length: 2428017, dtype: object


# Applying Key Collision Clustering on Street Data

In [None]:
def print_k_clusters(clusters, k=5):
    """Print a summary line followed by the first ``k`` clusters.

    ``clusters`` is an indexable collection of mappings from value to count.
    The summary reports the number of clusters and the total number of
    distinct values across them. Empty-string keys are shown as ``''``.
    """
    total_values = sum(len(cluster) for cluster in clusters)
    print('Total number of clusters is {} with {} values'.format(len(clusters), total_values))
    shown = min(k, len(clusters))
    for number in range(1, shown + 1):
        print('\nCluster {}'.format(number))
        for key, cnt in clusters[number - 1].items():
            key = "''" if key == '' else key
            print(f'  {key} (x {cnt})')

#Checking Street data for different representations of similar streets.
# Note: the street values are read from the stream `ds`, not from `df_full`.

# Time how long it takes to collect the distinct street values.
start_parse = time.perf_counter()

streets = ds.select('Street').distinct_values('Street')

end_parse = time.perf_counter()

print('Parse time {:0.4f} sec. ({} streets)'.format(end_parse - start_parse, len(streets)))

f = StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False)

# Benchmark: standardize the same street list with 1 to 4 threads. Every pass
# overwrites `streets_std`, so the value kept is the one from the last pass.
for threads in range(1,5):
    start_std = time.perf_counter()
    streets_std = f.apply(streets, threads=threads)
    count = len(streets_std)
    end_std = time.perf_counter()
    exec_time = end_std - start_std
    print('Standardization time (using {} threads) {:0.4f} sec. ({} streets)'.format(threads, exec_time, count))

# Benchmark: key-collision clustering (Fingerprint key) with 1 to 4 threads.
# `f` is rebound from the standardizer to the clusterer here, and `clusters`
# keeps the result of the last pass.
for threads in range(1,5):
    f = KeyCollision(func=Fingerprint(), threads=threads)
    start_clstr = time.perf_counter()

    clusters = f.clusters(streets_std)
    count = len(clusters)
    end_clstr = time.perf_counter()
    exec_time = end_clstr - start_clstr
    print('Cluster time (using {} threads) {:0.4f} sec. ({} clusters)'.format(threads, exec_time, count))

# streets = ds\
#     .select('Street')\
#     .update('Street', StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False))

# clusters = streets.cluster(clusterer=KeyCollision(func=Fingerprint()))

In [None]:
# Show at most the first five clusters produced by the key-collision step.
print_k_clusters(clusters, k=5)

Total number of clusters is 4 with 8 values

Cluster 1
  BEDFORD PARK BLVD EAST (x 2)
  EAST BEDFORD PARK BLVD (x 1)

Cluster 2
  EAST CLARKE PLACE (x 1)
  CLARKE PLACE EAST (x 1)

Cluster 3
  ST LAWRENCE (x 1)
  LAWRENCE ST (x 3)

Cluster 4
  ST NICHOLAS (x 1)
  NICHOLAS ST (x 1)


In [None]:
# Replacing similar street addresses
# NOTE(review): "LAWRENCE ST" / "ST LAWRENCE" and "NICHOLAS ST" / "ST NICHOLAS"
# share a fingerprint key but may be different streets — confirm before merging.
streetReplacements = {
    "EAST BEDFORD PARK BLVD": "BEDFORD PARK BLVD EAST",
    "CLARKE PLACE EAST": "EAST CLARKE PLACE",
    "LAWRENCE ST": "ST LAWRENCE",
    "NICHOLAS ST": "ST NICHOLAS",
}
# Assign the result back instead of calling `.replace(..., inplace=True)` on the
# column selection: the inplace form acts on a temporary object and is not
# guaranteed to update `df_full` (it never does under pandas Copy-on-Write).
df_full['Street'] = df_full['Street'].replace(streetReplacements)

# Saving the Clean Data

In [None]:
# Write the cleaned frame to ./resultCols.csv without the index column.
df_full.to_csv(r'./resultCols.csv', index = False)

In [None]:
# Compare row counts with and without exact duplicate rows.
# NOTE(review): this check runs after the CSV has been written, so any
# duplicates it reveals are still present in the saved file.
x = df_full.drop_duplicates()
print(x.shape)
print(df_full.shape)

(2428524, 49)
(2428526, 49)
