<a href="https://colab.research.google.com/github/sid2305/BigDataProject/blob/main/DOB_Housing_Permit_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing all the packages


In [1]:
!pip install openclean
!pip install humanfriendly
!pip install openclean-geo

Collecting openclean
  Downloading openclean-0.2.1-py3-none-any.whl (5.2 kB)
Collecting openclean-core==0.4.1
  Downloading openclean_core-0.4.1-py3-none-any.whl (267 kB)
[K     |████████████████████████████████| 267 kB 10.8 MB/s 
Collecting flowserv-core>=0.8.0
  Downloading flowserv_core-0.9.2-py3-none-any.whl (260 kB)
[K     |████████████████████████████████| 260 kB 58.5 MB/s 
[?25hCollecting refdata>=0.2.0
  Downloading refdata-0.2.0-py3-none-any.whl (37 kB)
Collecting jellyfish
  Downloading jellyfish-0.8.9.tar.gz (137 kB)
[K     |████████████████████████████████| 137 kB 51.5 MB/s 
[?25hCollecting jsonschema>=3.2.0
  Downloading jsonschema-4.2.1-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 6.3 MB/s 
Collecting histore>=0.4.0
  Downloading histore-0.4.1-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 85.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_1

Importing packages

In [2]:
from openclean.data.source.socrata import Socrata
from openclean.pipeline import stream
from openclean.function.eval.base import Col
from openclean.function.eval.logic import And
from openclean.function.eval.null import IsNotEmpty, IsEmpty
from openclean.operator.map.violations import fd_violations
from openclean.function.eval.mapping import Lookup
from openclean.operator.transform.update import Update
from openclean.operator.transform.update import update
from openclean.data.mapping import Mapping
from openclean.data.mapping import StringMatch
from openclean.cluster.key import key_collision
from openclean.data.refdata import RefStore
from openclean.function.value.null import is_empty
from openclean.function.matching.base import DefaultStringMatcher
from openclean.function.matching.fuzzy import FuzzySimilarity
from openclean_geo.address.usstreet import StandardizeUSStreetName
from openclean.data.load import dataset
from pprint import pprint
import datetime
import gzip
import humanfriendly
import os
import time
import re

Loading Datasets

In [3]:
_dataset = Socrata().dataset('bty7-2jhb')

datafile = './bty7-2jhb.tsv.gz'


# Download file only if it does not exist already.
if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        print('Downloading ...\n')
        _dataset.write(f)


fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(_dataset.name, datafile, fsize))
ds_full = stream(datafile)

Downloading ...

Using 'Historical DOB Permit Issuance' in file ./bty7-2jhb.tsv.gz of size 321.34 MB


In [8]:
# We can choose whether you want to run the code for small sample or the full dataset
#Uncomment this to following result to run on a small subset
ds_sub = ds_full.sample(100, 0)
#ds_sub = ds_full
#make a copy of the subset of the dataset before cleaning
if not os.path.exists(_dataset.name): 
  os.makedirs(_dataset.name)
initalDF = ds_sub.to_df()
initalDF.to_csv("./"+_dataset.name+"/initialDataset.csv")

Functions

In [4]:
#Violations of Functional Dependencies
#Latitude, Longitude -> Borough
#In this data when we examine this property, there are multiple borough's for same latitude and longitude.
def fd_borough(borough,latitude,longitude,ds_sub):
  data = ds_sub.select([borough,latitude,longitude]).where(And(IsNotEmpty(latitude), IsNotEmpty(longitude), IsNotEmpty(borough))).update(borough, str.upper)
  df = data\
      .select([borough,latitude,longitude])\
      .to_df()

  groups = fd_violations(df, lhs=[latitude,longitude], rhs=borough)
  group_mapping = dict()
  fixes = Mapping()
  for key in groups.keys():
      values = groups.values(key=key, columns=borough)
      max=0
      maxBorogh=""
      for v in values:
        if max<values[v]:
          max=values[v]
          maxBorogh=v
      group_mapping[key] = maxBorogh
  ds_sub=ds_sub.update([borough, latitude,longitude], lambda x,y,z: [group_mapping[(y,z)],y,z] if (y,z) in group_mapping else [x,y,z])
  return ds_sub

def fd_borough_house_street_postcode(borough,house,street,postcode,ds_sub):
  data = ds_sub.select([borough, house, street, postcode]).where(And(IsNotEmpty(borough), IsNotEmpty(house), IsNotEmpty(street))).update(street, str.upper).update(borough, str.upper)
  df = data\
    .select([borough, house, street, postcode])\
    .to_df()

  groups = fd_violations(df, lhs=[borough, house, street], rhs=postcode)
  group_mapping = dict()
  for key in groups.keys():
    values = groups.values(key=key, columns='Postcode')
    max=0
    maxValue=""
    for v in values:
      if v is None or len(v) == 0:
        continue
      if max<values[v]:
        max=values[v]
        maxValue=v
    group_mapping[key] = maxValue
  return ds_sub.update(postcode, Lookup(columns=[borough, house, street], mapping=group_mapping, default=Col(postcode)))

#The main idea of key collision methods is to create an alternative representation for each value (i.e., a key), and then group values based on their keys. So, here we group street names that are similar
# def cleanstreet(street,ds_sub): 
#   streets = ds_sub.select(street).distinct()
#   clusters = key_collision(values=streets, minsize=2, threads=4)
#   group_mapping = dict()
#   def print_cluster(cnumber, cluster):
#       for val, count in cluster.items():
#           group_mapping[val]=cluster.suggestion()
#   clusters.sort(key=lambda c: len(c), reverse=True)
#   for i in range(len(clusters)):
#       print_cluster(i + 1, clusters[i])
#   ds_sub=ds_sub.update(street,lambda x: group_mapping[x] if x in group_mapping else x)
#   return ds_sub

def cleanStreet(street):
    if is_empty(street): 
      return "N/A"
    # Standardize the street names
    streetStandFunction = StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False)
    street = ''.join(streetStandFunction.apply([street], threads=None))
    return street

#Owner’s House State Column has 57 distinct values. But in the United States there are only 50 states. So, we find those which are not in the reference dataset and remove those
def cleanstate(state,ds_sub): 
  states = ds_sub.select(state).distinct()
  group_mapping = dict()
  refdata = RefStore()
  refdata\
      .load('nyc.gov:dof:state_codes', auto_download=True)\
      .df()\
      .head()
  states_ref = refdata.load('nyc.gov:dof:state_codes', auto_download=True).distinct('code')
  for rank, val in enumerate(states.most_common()):
      st, freq = val
      if st not in states_ref and st!="":
          print(f'{rank + 1:<3} {st}  {freq:>10,}')
          group_mapping[st]=""
  ds_sub=ds_sub.update(state,lambda x: group_mapping[x] if x in group_mapping else x)
  return ds_sub

# removing columns which contains more than 70% null values
def removecols(ds_sub):
  col_list = []
  reqColsInd = [ds_sub.columns.index(reqCol) for reqCol in ds_sub.columns]
  for i in reqColsInd:
    if (profiles[i]["stats"]["emptyValueCount"]/profiles[i]["stats"]["totalValueCount"])*100>70:
      col_list.append(profiles[i]["column"]) 
  cols = [col for col in ds_sub.columns if col not in col_list]
  ds_sub = ds_sub.select(cols)
  return ds_sub

#clean number content and checking for regular expression
def cleannumber(number):
    number = number.upper()
    if number in null_values: 
      return "N/A"
    number = re.sub("\D", "", number)
    return "N/A" if is_empty(number) else number

#Our date column contains time which is not necessary. So, we remove those
def remove_time(dt):
  if is_empty(dt):
    return 'N/A'
  final = dt
  if 'T' in final:
    final = str(datetime.datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S').date())
  return final
def clean_date(date,ds1):
  ds1 = ds1.update(date, lambda dt1: remove_time(dt1))
  return ds1

#verify given jobtypes from data dictionary and filling empty values wiht N/A
def clean_job_type(jobType):
  jobType = str(jobType)
  if jobType in ["A1","A2","A3","NB","DM","SG"]:
    return jobType
  else:
    return "N/A"

null_values = ["NA", "N.A", "N.A."]
# check to keep only digits and the length is ten digits
def cleanPhone(phone):
  phone = re.sub("\D", "", phone)
  if is_empty(phone) or len(phone) != 10:
      return "N/A"
  else:
      return phone

# Remove all characters except alphabets
def cleanName(name):
    name = name.upper()
    name = re.sub(r".*\.", "", name)
    name = re.sub("[^A-Z ]", "", name)
    name = re.sub(" +", " ", name)
    if name in null_values or is_empty(name): 
      return "N/A"
    return name

#replacing null values with N/A

def cleanEmptyValues(value):
    return "N/A" if value in null_values or is_empty(value) else value

#check the size of the data if it is equal to 3 else put N/A
def community_board_data(data):
    is_valid = True
    if not is_empty(data) and data.isnumeric() and len(data) == 3:
        return data
    else:
        return 'N/A'

#replacing '?' values with N/A
def block_and_lot(block, lot):
  block = 'N/A' if block == '?' else block
  lot = 'N/A' if lot == '?' else lot
  return block,lot

#check for only two characters for permit type, permit subtype and work type
def cleanType(workType):
  workType = str(workType)
  if len(workType)==2:
    return workType
  else:
    return "N/A"

#Predefined values of Permit status
def cleanPermitStatus(pStatus):
  pStatus = pStatus.upper()
  if pStatus in ["IN PROCESS", "ISSUED", "RE-ISSSUED","REVOKED"]:
    return pStatus
  else:
    return "N/A"

#Predefined values of Filing status
 
def cleanFilingStatus(fStatus):
  fStatus = fStatus.upper()
  if fStatus in ["INITIAL","RENEWAL","APPROVED", "CANCELLED", "COMPLETE", "ONHOLD-NOGOODCHECK", "PENDING PAYMENT", "PERMIT ISSUED", "WITHDRAWN"]:
    return fStatus
  else:
    return "N/A"


Profile it

In [None]:
# Profile the resulting dataset view using the default data profiler.

from openclean.profiling.column import DefaultColumnProfiler
profiles = ds_sub.profile(default_profiler=DefaultColumnProfiler)
profiles.stats()




Unnamed: 0,total,empty,distinct,uniqueness,entropy
BOROUGH,2428526,0,5,2.058862e-06,2.045161
BIN,2428526,0,300024,0.1235416,16.764061
Number,2428526,4,28639,0.01179277,11.933581
Street,2428526,4,20201,0.008318228,11.223448
Job #,2428526,0,1110544,0.4572914,19.723021
Job doc. #,2428526,0,12,4.941269e-06,0.496506
Job Type,2428526,0,6,2.470634e-06,1.855338
Self_Cert,2428526,1527841,1,1.110266e-06,0.0
Block,2428526,498,13625,0.00561155,12.54555
Lot,2428526,507,1718,0.0007075727,6.659702


##Data Cleaning

In [None]:
ds_sub.head()

Unnamed: 0,BOROUGH,BIN,Number,Street,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,Community Board,Postcode,Bldg Type,Residential,Special District 1,Special District 2,Work Type,Permit Status,Filing Status,Permit Type,Permit Sequence #,Permit Subtype,Oil Gas,Site Fill,Filing Date,Issuance Date,Expiration Date,Job Start Date,Permittee's First Name,Permittee's Last Name,Permittee's Business Name,Permittee's Phone #,Permittee's License Type,Permittee's License #,Act as Superintendent,Permittee's Other Title,HIC License,Site Safety Mgr's First Name,Site Safety Mgr's Last Name,Site Safety Mgr Business Name,Superintendent First & Last Name,Superintendent Business Name,Owner's Business Type,Non-Profit,Owner's Business Name,Owner's First Name,Owner's Last Name,Owner's House #,Owner's House Street Name,Owner’s House City,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,Latitude,Longitude,Council District,Census Tract,BBL,NTA
0,BRONX,2118801,2960,WEBSTER AVENUE,201088492,4,NB,,3274,4,207,10458,2,,,,PL,ISSUED,INITIAL,PL,1,,,,2010-11-05T00:00:00,2010-11-05T00:00:00,2011-11-05T00:00:00,2010-11-05T00:00:00,LAWRENCE,LEVINE,"PAR PLUMBING CO., INC",2129261088,MASTER PLUMBER,161,,,,,,,,,,,,,,,,,,,,2016-01-03T00:00:00,40.86749,-73.883225,11,425,2032740001,Norwood ...
1,BRONX,2096812,100,DEKRUIF PLACE,200716298,2,A2,,5141,120,209,10475,2,,,,EQ,ISSUED,RENEWAL,EQ,12,FN,,NONE,2012-01-30T00:00:00,2012-01-30T00:00:00,2013-01-29T00:00:00,2002-08-08T00:00:00,ANTHONY,RASULO,RIVERBAY CORP,7183203300,GENERAL CONTRACTOR,1962,,,,,,,,,,,,,,,,,,,,2016-01-03T00:00:00,40.875769,-73.828899,12,46201,2051410120,Co-op City ...
2,BRONX,2008604,1898,HARRISON AVENUE,200974650,2,A2,,2869,87,205,10453,2,,,,PL,ISSUED,RENEWAL,PL,3,,,NONE,2008-02-04T00:00:00,2008-02-04T00:00:00,2009-02-03T00:00:00,2005-08-29T00:00:00,OSCAR,JACKSON,PERFECT PLUMBING & HETING CORP,7185157055,MASTER PLUMBER,594,,,,,,,,,,,,,,,,,,,,2016-01-03T00:00:00,40.852603,-73.911461,14,243,2028690087,University Heights-Morris Heights ...
3,BRONX,2007652,1998,MORRIS AVENUE,200278118,2,A1,,2807,15,205,10453,1,,,,PL,ISSUED,INITIAL,PL,1,,,NONE,1998-08-31T00:00:00,1998-08-31T00:00:00,1999-08-31T00:00:00,1998-08-31T00:00:00,GERI,KAUUMBA,GOWIE PLUMBING,7188821281,MASTER PLUMBER,1137,Y,,,,,,GOWIE PLUMBING,GOWIE PLUMBING,,,,,,,,,,,,2016-01-03T00:00:00,40.851661,-73.906937,14,241,2028070015,Mount Hope ...
4,BRONX,2084155,565,WEST 235 STREET,201119173,2,A2,Y,5794,484,208,10463,2,,,,MH,ISSUED,INITIAL,EW,1,MH,,NONE,2007-04-30T00:00:00,2007-04-30T00:00:00,2008-01-08T00:00:00,2007-04-30T00:00:00,GARY,ZYSMAN,THE DU-RITE INC,2013877000,GENERAL CONTRACTOR,9872,Y,,,,,,THE DU-RITE INC,THE DU-RITE INC,,,,,,,,,,,,2016-01-03T00:00:00,40.88572,-73.91027,11,297,2057940484,Spuyten Duyvil-Kingsbridge ...
5,BRONX,2012264,606,EAST FORDHAM ROAD,200089251,2,A1,,3078,16,206,10458,2,,,,EQ,ISSUED,INITIAL,EQ,1,FN,,NONE,1994-04-22T00:00:00,1994-04-22T00:00:00,1995-04-22T00:00:00,1994-04-22T00:00:00,JOSE,VARGAS,VARGAS ASSOCIATES,9149692853,PROFESSIONAL ENGINEER,56795,Y,,,,,,VARGAS ASSOCIATES,VARGAS ASSOCIATES,,,,,,,,,,,,2016-01-03T00:00:00,40.858324,-73.884836,15,389,2030780016,Belmont ...
6,BRONX,2103486,730,CONCOURSE VILLAGE WEST,200896762,2,NB,Y,2443,78,204,10451,2,,,,PL,ISSUED,RENEWAL,PL,2,,,NONE,2008-06-05T00:00:00,2008-06-05T00:00:00,2009-06-05T00:00:00,2007-06-12T00:00:00,ROBERT,GOLDIN,WDF INCORPORATED,2126961124,MASTER PLUMBER,926,,,,,,,,,,,,,,,,,,,,2016-01-03T00:00:00,40.82231,-73.923829,17,61,2024430078,East Concourse-Concourse Village ...
7,BRONX,2000391,345,BROOK AVENUE,201015613,2,A2,Y,2286,36,201,10454,2,,,,PL,ISSUED,RENEWAL,PL,3,,,,2012-05-22T00:00:00,2012-05-22T00:00:00,2013-05-22T00:00:00,2008-10-31T00:00:00,VINCENT,GAMBA,OLYMPIC PLBG & HTG SVC IN,7185284001,MASTER PLUMBER,1580,,,,,,,,,,,,,,,,,,,,2016-01-03T00:00:00,40.81004,-73.917792,8,41,2022860036,Mott Haven-Port Morris ...
8,BRONX,2011594,4487,THIRD AVENUE,200348524,3,NB,Y,3051,45,206,10457,2,,,,PL,ISSUED,RENEWAL,PL,2,,,NONE,2008-04-02T00:00:00,2008-05-30T00:00:00,2009-05-30T00:00:00,2000-10-27T00:00:00,VICTOR,SMITH,PROGRAM UNLIMITED PLUMBING,7182397630,MASTER PLUMBER,1056,,,,,,,,,,,,,,,,,,,,2016-01-03T00:00:00,40.853237,-73.891742,15,385,2030510045,Claremont-Bathgate ...
9,BRONX,2001106,575,WALTON AVENUE,200920049,2,A1,,2352,43,204,10451,2,,,,PL,ISSUED,RENEWAL,PL,3,,,NONE,2007-10-10T00:00:00,2007-10-10T00:00:00,2008-10-09T00:00:00,2005-07-01T00:00:00,MARVIN,WASOFF,"WASOFF PLUMBING UTILIITY CO,.INC",7183871400,MASTER PLUMBER,868,,,,,,,,,,,,,,,,,,,,2016-01-03T00:00:00,40.819464,-73.92811,8,63,2023520043,West Concourse ...


###Remove Columns with more than 70% of null values

In [None]:
ds_sub = removecols(ds_sub)

###Applying cleaning techniques according to the columns

In [7]:
for col in ds_sub.columns:
  if col in ["Owner's House Street Name","Street"]:
    ds_sub=ds_sub.update(col, lambda a: cleanStreet(a))
  elif col in ["Owner’s House State"]:
    ds_sub=cleanstate(col,ds_sub)
  elif col in ['Filing Date', 'Issuance Date', 'Expiration Date','Job Start Date','DOBRunDate']:
    ds_sub=clean_date(col,ds_sub)
  elif col in ['Number']:
    ds_sub = ds_sub.update(col, lambda x: cleannumber(x))
  elif col in ["Owner's Phone #", "Permittee's Phone #"]:
    ds_sub = ds_sub.update(col, lambda a: cleanPhone(a))
  elif col in ["Permittee's First Name","Permittee's Last Name","Owner's First Name","Owner's Last Name"]:
    ds_sub = ds_sub.update(col, lambda a: cleanPhone(a))
  elif col in ["Permit Type", "Permit Subtype" and "Work Type"]:
    ds_sub = ds_sub.update(col, lambda a: cleanType(a))
  elif col in ["Permit Status"]:
    ds_sub = ds_sub.update(col, lambda a: cleanPermitStatus(a))
  elif col in ["Filing Status"]:
    ds_sub = ds_sub.update(col, lambda a: cleanFilingStatus(a))
  else:
    ds_sub = ds_sub.update(col, lambda a: cleanEmptyValues(a))

# cleaning as functional dependency violation between borough, latitude and longitude 
ds_sub=fd_borough('BOROUGH','Latitude','Longitude',ds_sub)
print("Table updated")

46  PR          12
52  CN           5
55  sw           2
56  FQ           1
57  ï¿½ï¿½           1
Table updated


In [None]:
ds_sub = fd_borough_house_street_postcode('BOROUGH', 'Number', 'Street', 'Postcode', ds_sub)
print("Table updated")

In [8]:
ds_sub.head()

Unnamed: 0,BOROUGH,BIN,Number,Street,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,Community Board,Postcode,Bldg Type,Residential,Special District 1,Special District 2,Work Type,Permit Status,Filing Status,Permit Type,Permit Sequence #,Permit Subtype,Oil Gas,Site Fill,Filing Date,Issuance Date,Expiration Date,Job Start Date,Permittee's First Name,Permittee's Last Name,Permittee's Business Name,Permittee's Phone #,Permittee's License Type,Permittee's License #,Act as Superintendent,Permittee's Other Title,HIC License,Site Safety Mgr's First Name,Site Safety Mgr's Last Name,Site Safety Mgr Business Name,Superintendent First & Last Name,Superintendent Business Name,Owner's Business Type,Non-Profit,Owner's Business Name,Owner's First Name,Owner's Last Name,Owner's House #,Owner's House Street Name,Owner’s House City,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,Latitude,Longitude,Council District,Census Tract,BBL,NTA
0,BRONX,2118801,2960,WEBSTER AVE,201088492,4,NB,,3274,4,207,10458,2,,,,PL,ISSUED,INITIAL,PL,1,,,,2010-11-05,2010-11-05,2011-11-05,2010-11-05,,,"PAR PLUMBING CO., INC",2129261088,MASTER PLUMBER,161,,,,,,,,,,,,,,,,,,,,2016-01-03,40.86749,-73.883225,11,425,2032740001,Norwood ...
1,BRONX,2096812,100,DEKRUIF PLACE,200716298,2,A2,,5141,120,209,10475,2,,,,EQ,ISSUED,RENEWAL,EQ,12,FN,,NONE,2012-01-30,2012-01-30,2013-01-29,2002-08-08,,,RIVERBAY CORP,7183203300,GENERAL CONTRACTOR,1962,,,,,,,,,,,,,,,,,,,,2016-01-03,40.875769,-73.828899,12,46201,2051410120,Co-op City ...
2,BRONX,2008604,1898,HARRISON AVE,200974650,2,A2,,2869,87,205,10453,2,,,,PL,ISSUED,RENEWAL,PL,3,,,NONE,2008-02-04,2008-02-04,2009-02-03,2005-08-29,,,PERFECT PLUMBING & HETING CORP,7185157055,MASTER PLUMBER,594,,,,,,,,,,,,,,,,,,,,2016-01-03,40.852603,-73.911461,14,243,2028690087,University Heights-Morris Heights ...
3,BRONX,2007652,1998,MORRIS AVE,200278118,2,A1,,2807,15,205,10453,1,,,,PL,ISSUED,INITIAL,PL,1,,,NONE,1998-08-31,1998-08-31,1999-08-31,1998-08-31,,,GOWIE PLUMBING,7188821281,MASTER PLUMBER,1137,Y,,,,,,GOWIE PLUMBING,GOWIE PLUMBING,,,,,,,,,,,,2016-01-03,40.851661,-73.906937,14,241,2028070015,Mount Hope ...
4,BRONX,2084155,565,WEST 235 ST,201119173,2,A2,Y,5794,484,208,10463,2,,,,MH,ISSUED,INITIAL,EW,1,MH,,NONE,2007-04-30,2007-04-30,2008-01-08,2007-04-30,,,THE DU-RITE INC,2013877000,GENERAL CONTRACTOR,9872,Y,,,,,,THE DU-RITE INC,THE DU-RITE INC,,,,,,,,,,,,2016-01-03,40.88572,-73.91027,11,297,2057940484,Spuyten Duyvil-Kingsbridge ...
5,BRONX,2012264,606,EAST FORDHAM RD,200089251,2,A1,,3078,16,206,10458,2,,,,EQ,ISSUED,INITIAL,EQ,1,FN,,NONE,1994-04-22,1994-04-22,1995-04-22,1994-04-22,,,VARGAS ASSOCIATES,9149692853,PROFESSIONAL ENGINEER,56795,Y,,,,,,VARGAS ASSOCIATES,VARGAS ASSOCIATES,,,,,,,,,,,,2016-01-03,40.858324,-73.884836,15,389,2030780016,Belmont ...
6,BRONX,2103486,730,CONCOURSE VILLAGE WEST,200896762,2,NB,Y,2443,78,204,10451,2,,,,PL,ISSUED,RENEWAL,PL,2,,,NONE,2008-06-05,2008-06-05,2009-06-05,2007-06-12,,,WDF INCORPORATED,2126961124,MASTER PLUMBER,926,,,,,,,,,,,,,,,,,,,,2016-01-03,40.82231,-73.923829,17,61,2024430078,East Concourse-Concourse Village ...
7,BRONX,2000391,345,BROOK AVE,201015613,2,A2,Y,2286,36,201,10454,2,,,,PL,ISSUED,RENEWAL,PL,3,,,,2012-05-22,2012-05-22,2013-05-22,2008-10-31,,,OLYMPIC PLBG & HTG SVC IN,7185284001,MASTER PLUMBER,1580,,,,,,,,,,,,,,,,,,,,2016-01-03,40.81004,-73.917792,8,41,2022860036,Mott Haven-Port Morris ...
8,BRONX,2011594,4487,3 AVE,200348524,3,NB,Y,3051,45,206,10457,2,,,,PL,ISSUED,RENEWAL,PL,2,,,NONE,2008-04-02,2008-05-30,2009-05-30,2000-10-27,,,PROGRAM UNLIMITED PLUMBING,7182397630,MASTER PLUMBER,1056,,,,,,,,,,,,,,,,,,,,2016-01-03,40.853237,-73.891742,15,385,2030510045,Claremont-Bathgate ...
9,BRONX,2001106,575,WALTON AVE,200920049,2,A1,,2352,43,204,10451,2,,,,PL,ISSUED,RENEWAL,PL,3,,,NONE,2007-10-10,2007-10-10,2008-10-09,2005-07-01,,,"WASOFF PLUMBING UTILIITY CO,.INC",7183871400,MASTER PLUMBER,868,,,,,,,,,,,,,,,,,,,,2016-01-03,40.819464,-73.92811,8,63,2023520043,West Concourse ...


In [None]:
if not os.path.exists(_dataset.name): 
  os.makedirs(_dataset.name)
finalDF = ds_sub.to_df()
finalDF.to_csv("./"+_dataset.name+"/final.csv")

In [6]:
ds_sub=ds_full