<a href="https://colab.research.google.com/github/sid2305/BigDataProject/blob/main/DOB_Housing_Permit_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing all the packages


In [None]:
# !pip install openclean
# !pip install humanfriendly
# !pip install openclean-geo

Collecting openclean
  Downloading openclean-0.2.1-py3-none-any.whl (5.2 kB)
Collecting openclean-core==0.4.1
  Downloading openclean_core-0.4.1-py3-none-any.whl (267 kB)
[K     |████████████████████████████████| 267 kB 5.4 MB/s 
[?25hCollecting histore>=0.4.0
  Downloading histore-0.4.1-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 53.5 MB/s 
Collecting flowserv-core>=0.8.0
  Downloading flowserv_core-0.9.2-py3-none-any.whl (260 kB)
[K     |████████████████████████████████| 260 kB 43.6 MB/s 
[?25hCollecting refdata>=0.2.0
  Downloading refdata-0.2.0-py3-none-any.whl (37 kB)
Collecting jsonschema>=3.2.0
  Downloading jsonschema-4.2.1-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 6.8 MB/s 
Collecting jellyfish
  Downloading jellyfish-0.8.9.tar.gz (137 kB)
[K     |████████████████████████████████| 137 kB 55.5 MB/s 
[?25hCollecting paramiko
  Downloading paramiko-2.8.1-py2.py3-none-any.whl (206 kB)
[K     |██████████████████

Importing packages

In [None]:
from openclean.data.source.socrata import Socrata
from openclean.pipeline import stream
from openclean.function.eval.base import Col
from openclean.function.eval.logic import And
from openclean.function.eval.null import IsNotEmpty
from openclean.operator.map.violations import fd_violations
from openclean.function.eval.mapping import Lookup
from openclean.operator.transform.update import Update
from openclean.operator.transform.update import update
from openclean.data.mapping import Mapping
from openclean.data.mapping import StringMatch
from openclean.cluster.key import key_collision
from openclean.data.refdata import RefStore
from openclean.function.value.null import is_empty
from pprint import pprint
import datetime
import gzip
import humanfriendly
import os
import time
import re

Loading Datasets

In [None]:
dataset = Socrata().dataset('bty7-2jhb')

datafile = './bty7-2jhb.tsv.gz'


# Download file only if it does not exist already.
if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        print('Downloading ...\n')
        dataset.write(f)


fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))
ds_full = stream(datafile)

Using 'Historical DOB Permit Issuance' in file ./bty7-2jhb.tsv.gz of size 321.34 MB


In [83]:
# We can choose whether you want to run the code for small sample or the full dataset
#Uncomment this to following result to run on a small subset
ds_sub = ds_full.sample(100, 0)
# ds_sub = ds_full
#make a copy of the subset of the dataset before cleaning
if not os.path.exists(dataset.name): 
  os.makedirs(dataset.name)
initalDF = ds_sub.to_df()
initalDF.to_csv("./"+dataset.name+"/initialDataset.csv")

Functions

In [71]:
#Violations of Functional Dependencies
#Latitude, Longitude -> Borough
#In this data when we examine this property, there are multiple borough's for same latitude and longitude.
def fd_borough(borough,latitude,longitude,ds_sub):
  data = ds_sub.select([borough,latitude,longitude]).where(And(IsNotEmpty(latitude), IsNotEmpty(longitude), IsNotEmpty(borough))).update(borough, str.upper)
  df = data\
      .select([borough,latitude,longitude])\
      .to_df()

  groups = fd_violations(df, lhs=[latitude,longitude], rhs=borough)
  group_mapping = dict()
  fixes = Mapping()
  for key in groups.keys():
      values = groups.values(key=key, columns=borough)
      max=0
      maxBorogh=""
      for v in values:
        if max<values[v]:
          max=values[v]
          maxBorogh=v
      group_mapping[key] = maxBorogh
  ds_sub=ds_sub.update([borough, latitude,longitude], lambda x,y,z: [group_mapping[(y,z)],y,z] if (y,z) in group_mapping else [x,y,z])
  return ds_sub

#The main idea of key collision methods is to create an alternative representation for each value (i.e., a key), and then group values based on their keys. So, here we group street names that are similar
def cleanstreet(street,ds_sub): 
  streets = ds_sub.select(street).distinct()
  clusters = key_collision(values=streets, minsize=2, threads=4)
  group_mapping = dict()
  def print_cluster(cnumber, cluster):
      for val, count in cluster.items():
          group_mapping[val]=cluster.suggestion()
  clusters.sort(key=lambda c: len(c), reverse=True)
  for i in range(len(clusters)):
      print_cluster(i + 1, clusters[i])
  ds_sub=ds_sub.update(street,lambda x: group_mapping[x] if x in group_mapping else x)
  return ds_sub

#Owner’s House State Column has 57 distinct values. But in the United States there are only 50 states. So, we find those which are not in the reference dataset and remove those
def cleanstate(state,ds_sub): 
  states = ds_sub.select(state).distinct()
  group_mapping = dict()
  refdata = RefStore()
  refdata\
      .load('nyc.gov:dof:state_codes', auto_download=True)\
      .df()\
      .head()
  states_ref = refdata.load('nyc.gov:dof:state_codes', auto_download=True).distinct('code')
  for rank, val in enumerate(states.most_common()):
      st, freq = val
      if st not in states_ref and st!="":
          print(f'{rank + 1:<3} {st}  {freq:>10,}')
          group_mapping[st]=""
  ds_sub=ds_sub.update(state,lambda x: group_mapping[x] if x in group_mapping else x)
  return ds_sub

# removing columns which contains more than 70% null values
def removecols(ds_sub):
  col_list = []
  reqColsInd = [ds_sub.columns.index(reqCol) for reqCol in ds_sub.columns]
  for i in reqColsInd:
    if (profiles[i]["stats"]["emptyValueCount"]/profiles[i]["stats"]["totalValueCount"])*100>70:
      col_list.append(profiles[i]["column"]) 
  cols = [col for col in ds_sub.columns if col not in col_list]
  ds_sub = ds_sub.select(cols)
  return ds_sub

#clean number content and checking for regular expression
def cleannumber(number):
    number = number.upper()
    if number in null_values: 
      return "N/A"
    number = re.sub("\D", "", number)
    return "N/A" if is_empty(number) else number

#Our date column contains time which is not necessary. So, we remove those
def remove_time(dt):
  if is_empty(dt):
    return 'N/A'
  final = dt
  if 'T' in final:
    final = str(datetime.datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S').date())
  return final
def clean_date(date,ds1):
  ds1 = ds1.update(date, lambda dt1: remove_time(dt1))
  return ds1

#verify given jobtypes from data dictionary and filling empty values wiht N/A
def clean_job_type(jobType):
  jobType = str(jobType)
  if jobType in ["A1","A2","A3","NB","DM","SG"]:
    return jobType
  else:
    return "N/A"

null_values = ["NA", "N.A", "N.A."]
# check to keep only digits and the length is ten digits
def cleanPhone(phone):
  phone = re.sub("\D", "", phone)
  if is_empty(phone) or len(phone) != 10:
      return "N/A"
  else:
      return phone

# Remove all characters except alphabets
def cleanName(name):
    name = name.upper()
    name = re.sub(r".*\.", "", name)
    name = re.sub("[^A-Z ]", "", name)
    name = re.sub(" +", " ", name)
    if name in null_values or is_empty(name): 
      return "N/A"
    return name

#replacing null values with N/A

def cleanEmptyValues(value):
    return "N/A" if value in null_values or is_empty(value) else value

#check the size of the data if it is equal to 3 else put N/A
def community_board_data(data):
    is_valid = True
    if not is_empty(data) and data.isnumeric() and len(data) == 3:
        return data
    else:
        return 'N/A'

#replacing '?' values with N/A
def block_and_lot(block, lot):
  block = 'N/A' if block == '?' else block
  lot = 'N/A' if lot == '?' else lot
  return block,lot

#check for only two characters for permit type, permit subtype and work type
def cleanType(workType):
  workType = str(workType)
  if len(workType)==2:
    return workType
  else:
    return "N/A"

#Predefined values of Permit status
def cleanPermitStatus(pStatus):
  pStatus = pStatus.upper()
  if pStatus in ["IN PROCESS", "ISSUED", "RE-ISSSUED","REVOKED"]:
    return pStatus
  else:
    return "N/A"

#Predefined values of Filing status
def cleanFilingStatus(fStatus):
  fStatus = fStatus.upper()
  if fStatus in ["INITIAL","RENEWAL"]:
    return fStatus
  else:
    return "N/A"


Profile it

In [57]:
# Profile the resulting dataset view using the default data profiler.

from openclean.profiling.column import DefaultColumnProfiler
profiles = ds_sub.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()


Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,100,0,5,0.05,1.89136
BIN,100,0,99,0.99,6.623856
Number,100,0,95,0.95,6.543856
Street,100,0,88,0.88,6.360211
Job #,100,0,100,1.0,6.643856
Job doc. #,100,0,3,0.03,0.688276
Job Type,100,0,6,0.06,1.858274
Self_Cert,100,62,1,0.026316,0.0
Block,100,0,95,0.95,6.543856
Lot,100,0,55,0.55,5.337564


##Data Cleaning

In [60]:
ds_sub.head()

Unnamed: 0,BOROUGH,BIN,Number,Street,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,Community Board,Postcode,Bldg Type,Residential,Special District 1,Special District 2,Work Type,Permit Status,Filing Status,Permit Type,Permit Sequence #,Permit Subtype,Oil Gas,Site Fill,Filing Date,Issuance Date,Expiration Date,Job Start Date,Permittee's First Name,Permittee's Last Name,Permittee's Business Name,Permittee's Phone #,Permittee's License Type,Permittee's License #,Act as Superintendent,Permittee's Other Title,HIC License,Site Safety Mgr's First Name,Site Safety Mgr's Last Name,Site Safety Mgr Business Name,Superintendent First & Last Name,Superintendent Business Name,Owner's Business Type,Non-Profit,Owner's Business Name,Owner's First Name,Owner's Last Name,Owner's House #,Owner's House Street Name,Owner’s House City,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,Latitude,Longitude,Council District,Census Tract,BBL,NTA
1000922,MANHATTAN,1034202,1120,6 AVENUE,100766761,2,A2,,1259,1,105,10036,2.0,,,,MH,ISSUED,INITIAL,EW,1,MH,,NONE,1994-02-22T00:00:00,1994-02-22T00:00:00,1994-10-31T00:00:00,1994-02-22T00:00:00,JOHN,WHITE,STRUCTURE TONE,2124816100,,0.0,Y,GC,,,,,STRUCTURE TONE,STRUCTURE TONE,,,,,,,,,,,,2016-01-03T00:00:00,40.755651,-73.983516,4,96,1012590001,Midtown-Midtown South ...
792850,MANHATTAN,1048787,203,EAST 86TH STREET,120816439,2,A2,Y,1532,1,108,10028,2.0,,,,MH,ISSUED,INITIAL,EW,1,MH,,,2011-09-16T00:00:00,2011-09-16T00:00:00,2012-09-01T00:00:00,2011-09-16T00:00:00,JEFFREY,WEINHAUS,ECLIPSE DEVELOPMENT INC,2126770180,GENERAL CONTRACTOR,19786.0,,,,,,,,,CORPORATION,,TIAA-CREF C/O THE COLORADO,JAMES,HAYDEN,201,EAST 86TH STREET,NEW YORK,NY,10028.0,2129870723.0,2016-01-03T00:00:00,40.778674,-73.953613,5,14602,1015320001,Yorkville ...
29737,BRONX,2022552,845,WHITE PLAINS ROAD,200829103,1,A2,Y,3645,1,209,10473,2.0,,,,OT,ISSUED,INITIAL,EW,1,OT,,NONE,2004-02-19T00:00:00,2004-02-23T00:00:00,2004-04-01T00:00:00,2004-02-23T00:00:00,MYTUNG-YEE,,BEST CONTRACTOR,7184061255,GENERAL CONTRACTOR,15194.0,Y,,,,,,BEST CONTRACTOR,BEST CONTRACTOR,INDIVIDUAL,,,BENJANIM,MAXIMOV,845,WHITE PLAINS RD.,BRONX,NY,10473.0,7182759555.0,2016-01-03T00:00:00,40.823992,-73.85875,18,42,2036450001,Soundview-Castle Hill-Clason Point-Harding Par...
408400,BROOKLYN,3117777,2513,TILDEN AVENUE,320206040,1,NB,,5128,34,317,11226,2.0,,,,,ISSUED,INITIAL,NB,1,,,USE UNDER 300 CU.YD,2012-03-02T00:00:00,2012-03-02T00:00:00,2013-01-01T00:00:00,2012-03-02T00:00:00,CHRIS,TSETSEKAS,ARTEC CONST & DEVEL CORP,7187212400,GENERAL CONTRACTOR,23325.0,,,,THOMAS,BRENNAN,,,,CORPORATION,Y,"THE BRIDGE, INC.",DR. PETER,BEITCHMAN,248,WEST 108TH STREET,NEW YORK,NY,100253181.0,2126633000.0,2016-01-03T00:00:00,40.646709,-73.953788,40,794,3051280034,Erasmus ...
152072,BRONX,2075643,1463,OHM AVENUE,200664502,1,DM,,5410,274,210,10465,,,,,,ISSUED,INITIAL,DM,1,,,OFF-SITE,2001-08-08T00:00:00,2001-08-08T00:00:00,2001-08-23T00:00:00,2001-08-08T00:00:00,CLIFF,FERRANDI,C.J.B. EQUIPMENT RENTAL CORP.,7188484250,DEMOLITION CONTRACTOR,,Y,,,,,,C.J.B. EQUIPMENT RENTAL CORP.,C.J.B. EQUIPMENT RENTAL CORP.,INDIVIDUAL,,,MARIO,MARCIANO,1463,OHM AVENUE,BRONX,NY,10462.0,7185979544.0,2016-01-03T00:00:00,40.843298,-73.821389,13,27401,2054100274,Pelham Bay-Country Club-City Island ...
638036,BROOKLYN,3028291,667,CLASSON AVENUE,300457583,2,A1,,1149,11,308,11238,2.0,,,,EQ,ISSUED,INITIAL,EQ,1,FN,,NONE,1996-03-27T00:00:00,1996-03-27T00:00:00,1996-11-01T00:00:00,1996-03-27T00:00:00,JEFF,LEVINE,J.E.LEVINE BUILDER,7182247147,,,Y,G/C,,,,,J.E.LEVINE BUILDER,J.E.LEVINE BUILDER,,,,,,,,,,,,2016-01-03T00:00:00,40.677381,-73.95916,35,305,3011490011,Crown Heights North ...
2033870,QUEENS,4539393,141-05,CHERRY AVENUE,402207768,1,NB,,5186,71,407,11355,,YES,,,,ISSUED,RENEWAL,NB,2,,,NONE,2006-09-19T00:00:00,2006-09-19T00:00:00,2006-10-14T00:00:00,2006-09-08T00:00:00,JOHNNY,CHAN,UNIVERSAL CONSTR CONS INCNSULTAN,2129667828,GENERAL CONTRACTOR,34126.0,Y,,,,,,UNIVERSAL CONSTRUCTION CONSULTAN,UNIVERSAL CONSTRUCTION CONSULTAN,CORPORATION,,"ASIAN PACIFIC, LLC",LIN,LAI,141-05,CHERRY AVE,FLUSHING,NC,11355.0,7182793618.0,2016-01-03T00:00:00,40.754707,-73.82155,20,859,4051867505,Flushing ...
1361098,MANHATTAN,1080824,1133,AVENUE OF AMERICAS,104282562,1,A3,Y,996,29,105,10036,2.0,,,,EQ,ISSUED,RENEWAL,EQ,2,SH,,NONE,2006-04-06T00:00:00,2006-04-06T00:00:00,2007-04-01T00:00:00,2005-11-15T00:00:00,JERIMIAH,HARRINGTON,ROCKLEDGE SCAFFOLD,9144230400,GENERAL CONTRACTOR,1982.0,Y,,,,,,ROCKLEDGE SCAFFOLD,ROCKLEDGE SCAFFOLD,PARTNERSHIP,,Dolp1133 Properties LLC,Louis,Esposito,1155,6th Ave,New York,NY,10036.0,2127891155.0,2016-01-03T00:00:00,40.755942,-73.983324,4,119,1009960029,Midtown-Midtown South ...
564802,BROOKLYN,3142630,1429,67 STREET,301655000,1,NB,,5762,64,311,11219,1.0,YES,,,,ISSUED,INITIAL,NB,1,,,ON-SITE,2003-12-24T00:00:00,2004-01-13T00:00:00,2004-04-06T00:00:00,2004-01-13T00:00:00,AGOSTINO,OLIVIERI,CATERINA LLC,7182322610,GENERAL CONTRACTOR,21907.0,Y,,,,,,CATERINA LLC,CATERINA LLC,PARTNERSHIP,,Caterina LLC,Agostino,Olivieri,6602,19th Avenue,Brooklyn,NY,11204.0,7182322610.0,2016-01-03T00:00:00,40.62303,-73.999755,38,190,3057627503,Bensonhurst West ...
954186,MANHATTAN,1047627,140,EAST 83 STREET,104828000,1,A2,,1511,53,108,10028,2.0,,,,PL,ISSUED,RENEWAL,PL,2,,,NONE,2008-09-04T00:00:00,2008-09-04T00:00:00,2009-09-04T00:00:00,2007-08-20T00:00:00,BILLY,STAVRIANPOULOS,GO PRO PLG AND HTG CO INC,7182204400,MASTER PLUMBER,1608.0,,,,,,,,,CORPORATION,,140 E 83RD ST TENANT CORP,STEVEN,SHAPIRO,666,LEXINGTON AVE,MT KISCO,NY,10549.0,9146666860.0,2016-01-03T00:00:00,40.777421,-73.956734,5,140,1015110053,Upper East Side-Carnegie Hill ...


###Remove Columns with more than 70% of null values

In [61]:
ds_sub = removecols(ds_sub)

###Applying cleaning techniques according to the columns

In [69]:
for col in ds_sub.columns:
  if col in ["Owner's House Street Name","Street"]:
    ds_sub=cleanstreet(col,ds_sub)
  elif col in ["Owner’s House State"]:
    ds_sub=cleanstate(col,ds_sub)
  elif col in ['Filing Date', 'Issuance Date', 'Expiration Date','Job Start Date','DOBRunDate']:
    ds_sub=clean_date(col,ds_sub)
  elif col in ['Number']:
    ds_sub = ds_sub.update(num_col, lambda x: cleannumber(x))
  elif col in ["Owner's Phone #", "Permittee's Phone #"]:
    ds_sub = ds_sub.update(col, lambda a: cleanPhone(a))
  elif col in ["Permittee's First Name","Permittee's Last Name","Owner's First Name","Owner's Last Name"]:
    ds_sub = ds_sub.update(col, lambda a: cleanPhone(a))
  elif col in ["Permit Type", "Permit Subtype" and "Work Type"]:
    ds_sub = ds_sub.update(col, lambda a: cleanType(a))
  elif col in ["Permit Status"]:
    ds_sub = ds_sub.update(col, lambda a: cleanPermitStatus(a))
  elif col in ["Filing Status"]:
    ds_sub = ds_sub.update(col, lambda a: cleanFilingStatus(a))
  else:
    ds_sub = ds_sub.update(col, lambda a: cleanEmptyValues(a))

# cleaning as functional dependency violation between borough, latitude and longitude 
ds_sub=fd_borough('BOROUGH','Latitude','Longitude',ds_sub)
print("Table updated")

Table updated


In [63]:
ds_sub.head()

Unnamed: 0,BOROUGH,BIN,Number,Street,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,Community Board,Postcode,Bldg Type,Work Type,Permit Status,Filing Status,Permit Type,Permit Sequence #,Permit Subtype,Site Fill,Filing Date,Issuance Date,Expiration Date,Job Start Date,Permittee's First Name,Permittee's Last Name,Permittee's Business Name,Permittee's Phone #,Permittee's License Type,Permittee's License #,Act as Superintendent,Superintendent First & Last Name,Superintendent Business Name,Owner's Business Type,Owner's Business Name,Owner's First Name,Owner's Last Name,Owner's House #,Owner's House Street Name,Owner’s House City,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,Latitude,Longitude,Council District,Census Tract,BBL,NTA
1000922,MANHATTAN,1034202,1120,6 AVENUE,100766761,2,A2,,1259,1,105,10036,2.0,MH,ISSUED,INITIAL,EW,1,MH,,1994-02-22,1994-02-22,1994-10-31,1994-02-22,,,STRUCTURE TONE,2124816100,,0.0,Y,STRUCTURE TONE,STRUCTURE TONE,,,,,,,,,,,2016-01-03,40.755651,-73.983516,4,96,1012590001,MIDTOWN-MIDTOWN SOUTH ...
792850,MANHATTAN,1048787,203,EAST 86TH STREET,120816439,2,A2,Y,1532,1,108,10028,2.0,MH,ISSUED,INITIAL,EW,1,MH,,2011-09-16,2011-09-16,2012-09-01,2011-09-16,,,ECLIPSE DEVELOPMENT INC,2126770180,GENERAL CONTRACTOR,19786.0,,,,CORPORATION,TIAA-CREF C/O THE COLORADO,,,201,EAST 86TH STREET,NEW YORK,NY,10028.0,2129870723.0,2016-01-03,40.778674,-73.953613,5,14602,1015320001,YORKVILLE ...
29737,BRONX,2022552,845,WHITE PLAINS ROAD,200829103,1,A2,Y,3645,1,209,10473,2.0,OT,ISSUED,INITIAL,EW,1,OT,,2004-02-19,2004-02-23,2004-04-01,2004-02-23,,,BEST CONTRACTOR,7184061255,GENERAL CONTRACTOR,15194.0,Y,BEST CONTRACTOR,BEST CONTRACTOR,INDIVIDUAL,,,,845,WHITE PLAINS RD.,BRONX,NY,10473.0,7182759555.0,2016-01-03,40.823992,-73.85875,18,42,2036450001,SOUNDVIEW-CASTLE HILL-CLASON POINT-HARDING PAR...
408400,BROOKLYN,3117777,2513,TILDEN AVENUE,320206040,1,NB,,5128,34,317,11226,2.0,,ISSUED,INITIAL,NB,1,,USE UNDER 300 CU.YD,2012-03-02,2012-03-02,2013-01-01,2012-03-02,,,ARTEC CONST & DEVEL CORP,7187212400,GENERAL CONTRACTOR,23325.0,,,,CORPORATION,"THE BRIDGE, INC.",,,248,WEST 108TH STREET,NEW YORK,NY,100253181.0,2126633000.0,2016-01-03,40.646709,-73.953788,40,794,3051280034,ERASMUS ...
152072,BRONX,2075643,1463,OHM AVENUE,200664502,1,DM,,5410,274,210,10465,,,ISSUED,INITIAL,DM,1,,OFF-SITE,2001-08-08,2001-08-08,2001-08-23,2001-08-08,,,C.J.B. EQUIPMENT RENTAL CORP.,7188484250,DEMOLITION CONTRACTOR,,Y,C.J.B. EQUIPMENT RENTAL CORP.,C.J.B. EQUIPMENT RENTAL CORP.,INDIVIDUAL,,,,1463,OHM AVENUE,BRONX,NY,10462.0,7185979544.0,2016-01-03,40.843298,-73.821389,13,27401,2054100274,PELHAM BAY-COUNTRY CLUB-CITY ISLAND ...
638036,BROOKLYN,3028291,667,CLASSON AVENUE,300457583,2,A1,,1149,11,308,11238,2.0,EQ,ISSUED,INITIAL,EQ,1,FN,,1996-03-27,1996-03-27,1996-11-01,1996-03-27,,,J.E.LEVINE BUILDER,7182247147,,,Y,J.E.LEVINE BUILDER,J.E.LEVINE BUILDER,,,,,,,,,,,2016-01-03,40.677381,-73.95916,35,305,3011490011,CROWN HEIGHTS NORTH ...
2033870,QUEENS,4539393,14105,CHERRY AVENUE,402207768,1,NB,,5186,71,407,11355,,,ISSUED,RENEWAL,NB,2,,,2006-09-19,2006-09-19,2006-10-14,2006-09-08,,,UNIVERSAL CONSTR CONS INCNSULTAN,2129667828,GENERAL CONTRACTOR,34126.0,Y,UNIVERSAL CONSTRUCTION CONSULTAN,UNIVERSAL CONSTRUCTION CONSULTAN,CORPORATION,"ASIAN PACIFIC, LLC",,,141-05,CHERRY AVE,FLUSHING,NC,11355.0,7182793618.0,2016-01-03,40.754707,-73.82155,20,859,4051867505,FLUSHING ...
1361098,MANHATTAN,1080824,1133,AVENUE OF AMERICAS,104282562,1,A3,Y,996,29,105,10036,2.0,EQ,ISSUED,RENEWAL,EQ,2,SH,,2006-04-06,2006-04-06,2007-04-01,2005-11-15,,,ROCKLEDGE SCAFFOLD,9144230400,GENERAL CONTRACTOR,1982.0,Y,ROCKLEDGE SCAFFOLD,ROCKLEDGE SCAFFOLD,PARTNERSHIP,DOLP1133 PROPERTIES LLC,,,1155,6th Ave,NEW YORK,NY,10036.0,2127891155.0,2016-01-03,40.755942,-73.983324,4,119,1009960029,MIDTOWN-MIDTOWN SOUTH ...
564802,BROOKLYN,3142630,1429,67 STREET,301655000,1,NB,,5762,64,311,11219,1.0,,ISSUED,INITIAL,NB,1,,ON-SITE,2003-12-24,2004-01-13,2004-04-06,2004-01-13,,,CATERINA LLC,7182322610,GENERAL CONTRACTOR,21907.0,Y,CATERINA LLC,CATERINA LLC,PARTNERSHIP,CATERINA LLC,,,6602,19th Avenue,BROOKLYN,NY,11204.0,7182322610.0,2016-01-03,40.62303,-73.999755,38,190,3057627503,BENSONHURST WEST ...
954186,MANHATTAN,1047627,140,EAST 83 STREET,104828000,1,A2,,1511,53,108,10028,2.0,PL,ISSUED,RENEWAL,PL,2,,,2008-09-04,2008-09-04,2009-09-04,2007-08-20,,,GO PRO PLG AND HTG CO INC,7182204400,MASTER PLUMBER,1608.0,,,,CORPORATION,140 E 83RD ST TENANT CORP,,,666,LEXINGTON AVE,MT KISCO,NY,10549.0,9146666860.0,2016-01-03,40.777421,-73.956734,5,140,1015110053,UPPER EAST SIDE-CARNEGIE HILL ...


In [64]:
from openclean.profiling.column import DefaultColumnProfiler

profiles = ds_sub.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,100,0,5,0.05,1.89136
BIN,100,0,99,0.99,6.623856
Number,100,0,95,0.95,6.543856
Street,100,0,87,0.87,6.340211
Job #,100,0,100,1.0,6.643856
Job doc. #,100,0,3,0.03,0.688276
Job Type,100,0,6,0.06,1.858274
Self_Cert,100,0,2,0.02,0.958042
Block,100,0,95,0.95,6.543856
Lot,100,0,55,0.55,5.337564


In [84]:
if not os.path.exists(dataset.name): 
  os.makedirs(dataset.name)
finalDF = ds_sub.to_df()
finalDF.to_csv("./"+dataset.name+"/final.csv")