In [65]:
import json
import math
import re

#import ckanapi
import geopandas as gpd
import nltk
import numpy as np
import pandas as pd
import requests

from nltk.corpus import wordnet
from shapely.geometry import shape
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from datetime import datetime as dt

# Make sure the WordNet corpus used through `nltk.corpus.wordnet` is available locally.
# Per the captured cell output this only reports "already up-to-date" when it is present.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [72]:
def score_usability(columns, data):
    '''
        How easy is it to use the data given how it is organized/structured?

        The score is the plain mean of the following metrics:
            * col_names: share of columns whose name is made of recognisable
              English words (more than 80% of its tokens have a WordNet synset).
            * col_constant: starts at 1 and loses 1/len(columns) for every
              non-geometry column that holds at most one distinct value.
            * geo_validity: only for GeoDataFrames; 1 minus the number of
              invalid geometries divided by 5% of the row count.

        Args:
            columns: sized iterable of column labels present in ``data``.
                Labels are converted with ``str()`` before name parsing, so
                non-string labels (e.g. integers) are accepted.
            data: pandas DataFrame or geopandas GeoDataFrame to score.

        Returns:
            The mean of the metric values (a numpy float).

        TODO's:
            * level of nested fields?
            * long vs. wide?
            * if ID columns given, are these ID's common across datasets?
    '''

    def parse_col_name(s):
        # Return (already_snake, tokens): whether the lower snake_case form of
        # the label equals the label itself, plus its non-empty word tokens.
        s = str(s)
        camel_to_snake = re.sub(
            r'([a-z0-9])([A-Z])',
            r'\1_\2',
            re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', s)
        ).lower()

        # Raw pattern: '\s' in a normal string literal is an invalid escape.
        tokens = [x for x in re.split(r'-|_|\s', camel_to_snake) if x]
        return camel_to_snake == s, tokens

    metrics = {
        'col_names': 0,  # Are the column names easy to understand?
        'col_constant': 1  # Are there columns where all values are constant?
    }

    n_columns = len(columns)

    for f in columns:
        already_snake, words = parse_col_name(f)
        eng_words = [w for w in words if wordnet.synsets(w)]

        # A label with no word tokens at all (e.g. "_" or "") earns nothing
        # instead of raising ZeroDivisionError.
        if words and len(eng_words) / len(words) > 0.8:
            # NOTE(review): labels that are already lower snake_case earn 0.5
            # while labels altered by the conversion earn 1 -- this weighting
            # is preserved from the original; confirm it is the intended way round.
            metrics['col_names'] += (0.5 if already_snake else 1) / n_columns

        if not f == 'geometry' and data[f].nunique() <= 1:
            metrics['col_constant'] -= 1 / n_columns

    if isinstance(data, gpd.GeoDataFrame):
        counts = data['geometry'].is_valid.value_counts()

        # Reaches 0 when 5% of the rows are invalid.
        # NOTE(review): goes negative beyond 5% -- confirm whether the metric
        # should be clamped to [0, 1].
        metrics['geo_validity'] = 1 - (counts[False] / (len(data) * 0.05)) if False in counts else 1

    return np.mean(list(metrics.values()))

In [73]:
# Load the DOB NOW Certificate of Occupancy CSV and preview its first rows.
# NOTE(review): the path is absolute and machine-specific. The "Unnamed: 0" column in the
# preview suggests the file was written with its index -- consider index_col=0 (confirm first,
# since dropping that column changes the usability score computed later in the notebook).
data = pd.read_csv("/mnt/data/DOB_NOW__Certificate_of_Occupancy.csv")
#data.index = data["Index Title"]
data.head()

Unnamed: 0,JOB FILING NAME,JOB TYPE,BIN,BOROUGH,HOUSE NO,STREET NAME,BLOCK,LOT,ZIP CODE,SUBMITTED DATE,...,longitude,communityDistrict,communityDistrictBoroughCode,communityDistrictNumber,cityCouncilDistrict,censusTract2010,buildingIdentificationNumber,bbl,nta,ntaName
0,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/25/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
1,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/27/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
2,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,05/03/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
3,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,08/13/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
4,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,11/16/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan


In [74]:
# Keep the column labels for the score_usability call later in the notebook, then display
# the frame restricted to those columns. A bare `cols` expression in the middle of the cell
# was removed: only the last expression of a cell is rendered, so it had no effect.
cols = data.columns
data[cols]

Unnamed: 0,JOB FILING NAME,JOB TYPE,BIN,BOROUGH,HOUSE NO,STREET NAME,BLOCK,LOT,ZIP CODE,SUBMITTED DATE,...,longitude,communityDistrict,communityDistrictBoroughCode,communityDistrictNumber,cityCouncilDistrict,censusTract2010,buildingIdentificationNumber,bbl,nta,ntaName
0,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/25/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
1,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/27/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
2,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,05/03/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
3,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,08/13/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
4,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,11/16/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14261,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,02/11/2022 12:00:00 AM,...,,,,,,,,,,
14262,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,02/25/2022 12:00:00 AM,...,,,,,,,,,,
14263,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,08/06/2021 12:00:00 AM,...,,,,,,,,,,
14264,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,12/10/2021 12:00:00 AM,...,,,,,,,,,,


In [75]:
#data['index_col'] = data.index
# Preview the first rows of the frame again; `data` is unchanged since it was loaded.
data.head()

Unnamed: 0,JOB FILING NAME,JOB TYPE,BIN,BOROUGH,HOUSE NO,STREET NAME,BLOCK,LOT,ZIP CODE,SUBMITTED DATE,...,longitude,communityDistrict,communityDistrictBoroughCode,communityDistrictNumber,cityCouncilDistrict,censusTract2010,buildingIdentificationNumber,bbl,nta,ntaName
0,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/25/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
1,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/27/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
2,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,05/03/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
3,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,08/13/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
4,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,11/16/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan


In [76]:
# Usability score for the loaded dataset: the mean of the per-metric values computed by
# score_usability over every column in `cols`.
score_usability(cols, data)

0.8362068965517242