In [2]:
import json
import math
import re

#import ckanapi
import geopandas as gpd
import nltk
import numpy as np
import pandas as pd
import requests

from nltk.corpus import wordnet
from shapely.geometry import shape
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from datetime import datetime as dt

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def score_usability(columns, data):
    '''
        How easy is it to use the data given how it is organized/structured?
        
        TODO's: 
            * level of nested fields?
            * long vs. wide?
            * if ID columns given, are these ID's common across datasets?
    '''
    
    def parse_col_name(s):
        camel_to_snake = re.sub(
            '([a-z0-9])([A-Z])', 
            r'\1_\2', 
            re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s)
        ).lower()

        return camel_to_snake == s, [x for x in re.split('-|_|\s', camel_to_snake) if len(x)]

    metrics = {
        'col_names': 0, # Are the column names easy to understand?
        'col_constant': 1 # Are there columns where all values are constant?
    }
    
    for f in columns:
        is_camel, words = parse_col_name(f)
        eng_words = [ w for w in words if len(wordnet.synsets(w)) ]

        if len(eng_words) / len(words) > 0.8:
            metrics['col_names'] += (1 if not is_camel else 0.5) / len(columns)
        
        if not f == 'geometry' and data[f].nunique() <= 1:
            metrics['col_constant'] -= 1 / len(columns)
    
    if isinstance(data, gpd.GeoDataFrame):
        counts = data['geometry'].is_valid.value_counts()
        
        metrics['geo_validity'] = 1 - (counts[False] / (len(data) * 0.05)) if False in counts else 1
    
    return np.mean(list(metrics.values()))

In [85]:
dob = pd.read_csv("/mnt/data/DOB_NOW__Certificate_of_Occupancy.csv")
#data.index = data["Index Title"]
dob.head()

Unnamed: 0,JOB FILING NAME,JOB TYPE,BIN,BOROUGH,HOUSE NO,STREET NAME,BLOCK,LOT,ZIP CODE,SUBMITTED DATE,...,longitude,communityDistrict,communityDistrictBoroughCode,communityDistrictNumber,cityCouncilDistrict,censusTract2010,buildingIdentificationNumber,bbl,nta,ntaName
0,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/25/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
1,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/27/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
2,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,05/03/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
3,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,08/13/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
4,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,11/16/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan


In [86]:
cols = dob.columns
cols
dob[cols]

Unnamed: 0,JOB FILING NAME,JOB TYPE,BIN,BOROUGH,HOUSE NO,STREET NAME,BLOCK,LOT,ZIP CODE,SUBMITTED DATE,...,longitude,communityDistrict,communityDistrictBoroughCode,communityDistrictNumber,cityCouncilDistrict,censusTract2010,buildingIdentificationNumber,bbl,nta,ntaName
0,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/25/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
1,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/27/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
2,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,05/03/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
3,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,08/13/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
4,01,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,11/16/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1.000020e+09,MN25,Battery Park City-Lower Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14261,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,02/11/2022 12:00:00 AM,...,,,,,,,,,,
14262,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,02/25/2022 12:00:00 AM,...,,,,,,,,,,
14263,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,08/06/2021 12:00:00 AM,...,,,,,,,,,,
14264,I1,New Building,5863165,STATEN ISLAND,1,EVENTS PLAZA,9999.0,1.0,10301.0,12/10/2021 12:00:00 AM,...,,,,,,,,,,


In [87]:
#data['index_col'] = data.index
dob.head()

Unnamed: 0,JOB FILING NAME,JOB TYPE,BIN,BOROUGH,HOUSE NO,STREET NAME,BLOCK,LOT,ZIP CODE,SUBMITTED DATE,...,longitude,communityDistrict,communityDistrictBoroughCode,communityDistrictNumber,cityCouncilDistrict,censusTract2010,buildingIdentificationNumber,bbl,nta,ntaName
0,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/25/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
1,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,01/27/2022 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
2,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,05/03/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
3,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,08/13/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan
4,1,ALTERATION TYPE 1,1000003,MANHATTAN,10,SOUTH STREET,2.0,2.0,10004.0,11/16/2021 12:00:00 AM,...,-74.011631,101.0,1.0,1.0,1.0,9.0,1000003.0,1000020000.0,MN25,Battery Park City-Lower Manhattan


In [88]:
score_usability(cols, dob)

0.8362068965517242

In [8]:
## NYPD MOS data
mos = pd.read_csv("/mnt/data/NYPD_OIP_Officer_MOS.csv")
#data.index = data["Index Title"]
mos.head()

Unnamed: 0,PROFILE_ID,RANK,COMMAND,APPOINTMENT_DATE,ARRESTS_TOTAL,DEPARTMENT_RECOGNITIONS,EXPORT_DATE
0,EFA48BB6-F2F7-4717-A8A5-5B0C37EA9F5E,POLICE OFFICER,107 PRECINCT,01/06/2016 12:00:00 AM,55,2,06/12/2022 12:00:00 AM
1,2DE128A9-EF16-4D06-AAA3-CCC5DF5E301E,DETECTIVE 3RD GRADE,062 DET SQUAD,04/15/1997 12:00:00 AM,758,3,06/12/2022 12:00:00 AM
2,26C5E7DC-A575-402D-B232-D9FADC267E6A,POLICE OFFICER,049 PRECINCT,10/07/2019 12:00:00 AM,25,0,06/12/2022 12:00:00 AM
3,EABFF759-433D-458C-92FB-CCD1AE994296,POLICE OFFICER,123 PRECINCT,07/11/2005 12:00:00 AM,111,0,06/12/2022 12:00:00 AM
4,973DB185-30D4-498D-BB57-9213A7976621,DETECTIVE SPECIALIST,HOUSING PSA 6,01/09/2012 12:00:00 AM,318,4,06/12/2022 12:00:00 AM


In [9]:
mos_cols = mos.columns
mos_cols

Index(['PROFILE_ID', 'RANK', 'COMMAND', 'APPOINTMENT_DATE', 'ARRESTS_TOTAL',
       'DEPARTMENT_RECOGNITIONS', 'EXPORT_DATE'],
      dtype='object')

In [10]:
score_usability(mos_cols, mos)

0.9285714285714285

In [93]:
## NYPD shield data
shield = pd.read_csv("/mnt/data/NYPD_OIP_Title_Shield_History.csv")
#data.index = data["Index Title"]
shield.head()

Unnamed: 0,PROFILE_ID,EFFECTIVE_DATE,TITLE,SHIELD_NO,EXPORT_DATE
0,D2CA5F5E-BDC5-44B2-9D32-30BC27556B87,03/28/2013 12:00:00 AM,SERGEANT,3808.0,06/12/2022 12:00:00 AM
1,75F6539C-B763-4BAE-81E6-402F4278B21C,12/29/2021 12:00:00 AM,POLICE OFFICER,30680.0,06/12/2022 12:00:00 AM
2,7C8688B2-74E9-4FF2-8420-15F6C9C8B8E8,07/06/2010 12:00:00 AM,POLICE OFFICER,15262.0,06/12/2022 12:00:00 AM
3,DED25197-09D4-46AB-BB01-66E1394FDF39,12/23/2010 12:00:00 AM,SERGEANT,4883.0,06/12/2022 12:00:00 AM
4,87BD795E-F584-4214-8720-64CA794D63AE,07/11/2012 12:00:00 AM,POLICE OFFICER,30819.0,06/12/2022 12:00:00 AM


In [94]:
shield_cols = shield.columns

In [101]:
score_usability(shield_cols, shield)

0.9

In [95]:
## NYPD Recognition data
rec = pd.read_csv("/mnt/data/NYPD_OIP_Officer_Recognitions.csv")
#data.index = data["Index Title"]
rec.head()

Unnamed: 0,PROFILE_ID,DATE,AWARD,EXPORT_DATE
0,4139A1A3-2FF1-4751-86B1-E38E4F21F32E,2018-10-21,MERITORIOUS POLICE DUTY,2022-06-12
1,91A0F1C8-2C71-428A-A817-475FD49E08BA,2012-05-22,EXCELLENT POLICE DUTY,2022-06-12
2,1886F93C-E7E6-454E-9922-72E485D8B618,2015-03-16,EXCELLENT POLICE DUTY,2022-06-12
3,77456568-A0B2-4F0F-8B83-E49BCB145F97,2020-03-13,EXCELLENT POLICE DUTY,2022-06-12
4,DCEE6447-4E44-4EFB-B20A-B24C425B4DD8,2009-02-12,EXCELLENT POLICE DUTY,2022-06-12


In [96]:
rec_cols = rec.columns

In [102]:
score_usability(rec_cols, rec)

0.875

In [97]:
## NYPD charges data
charges = pd.read_csv("/mnt/data/NYPD_OIP_Disciplinary_History_Charges.csv")
#data.index = data["Index Title"]
charges.head()

Unnamed: 0,PROFILE_ID,DATE,CASE_NUMBER,CHARGE_DESCRIPTION,DISPOSITION,PENALTY_AND_QUANTITY,EXPORT_DATE
0,754926F2-F556-430B-80B7-609CD708E5E2,11/16/2015,2015-13874,FAILED TO PROVIDE HIS NAME AND SHIELD NUMBER U...,NOLO CONTENDRE,VACATION DAYS (5 day(s) ),06/12/2022
1,9E0A980F-B1F9-48EA-A69F-1A422F325D8A,10/01/2010,2010-157,WRONGFULLY OPERATED A MOTOR VEHICLE WHILE INTO...,GUILTY,"COUNSELING, DISMISSAL PROBATION (12 month(s) )...",06/12/2022
2,A3EBA3DB-126C-4B73-B95B-F98E5CB0C670,02/01/2016,2014-11707,WHILE SCHEDULED TO PERFORM A TOUR OF 0645 X 15...,GUILTY,"DISMISSAL PROBATION (12 month(s) ), FORFEITURE...",06/12/2022
3,6EAF68DB-44B3-48C8-ABDF-59B7731B3AE7,01/31/2015,2014-12780,WRONGFULLY OPERATED A MOTOR VEHICLE WHILE UNDE...,GUILTY,"COUNSELING, DISMISSAL PROBATION (12 month(s) )...",06/12/2022
4,091F9CAF-6ACF-40B0-99EC-A129B5040F7C,12/13/2016,2016-16768,FAILED AND NEGLECTED TO PROPERLY SAFEGUARD HIS...,GUILTY,VACATION DAYS (20 day(s) ),06/12/2022


In [98]:
charges_cols = charges.columns

In [103]:
score_usability(charges_cols, charges)

0.8571428571428571

In [99]:
## NYPD Charges Summary data
cha_sum = pd.read_csv("/mnt/data/NYPD_OIP_Disciplinary_History_Summary.csv")
#data.index = data["Index Title"]
cha_sum.head()

Unnamed: 0,PROFILE_ID,DATE,TOTAL_CHARGES,EXPORT_DATE
0,F10F798C-BB9F-416B-B1EE-B9BB098B04F4,05/21/2018,1,06/12/2022
1,A22FA880-81FF-4F56-9C40-20C7AEF6BD0E,04/03/2017,5,06/12/2022
2,69FEDC7B-DF29-4CF2-AE10-DCDA828E8832,03/10/2016,3,06/12/2022
3,CC5B5EF5-9F88-40C9-9FEB-5E984B7B5940,03/17/2021,2,06/12/2022
4,EA3DFABC-D46C-46F6-AD3E-48A841E9CCB0,05/10/2022,0,06/12/2022


In [100]:
cha_sum_cols = cha_sum.columns

In [104]:
score_usability(cha_sum_cols, cha_sum)

0.875

In [11]:
## dob facades compliane - BAD dataset test
bad = pd.read_csv("/mnt/data/DOB_NOW__Safety___Facades_Compliance_Filings.csv")
bad.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,TR6_NO,CONTROL_NO,FILING_TYPE,CYCLE,BIN,HOUSE_NO,STREET_NAME,BOROUGH,BLOCK,LOT,...,FILING_DATE,FILING_STATUS,PRIOR_CYCLE_FILING_DATE,PRIOR_STATUS,FIELD_INSPECTION_COMPLETED_DATE,QEWI_SIGNED_DATE,LATE_FILING_AMT,FAILURE_TO_FILE_AMT,FAILURE_TO_COLLECT_AMT,COMMENTS
0,TR6-913448-9A-N1,913448,Auto-Generated,9,4114712.0,143-45,SANFORD AVENUE,QUEENS,5049,38,...,,No Report Filed,,,,,11750.0,1000.0,0.0,
1,TR6-913451-9A-N1,913451,Auto-Generated,9,3393807.0,15,OLIVER STREET,BROOKLYN,6099,1,...,,No Report Filed,,,,,0.0,0.0,63400.0,
2,TR6-913456-9A-N1,913456,Auto-Generated,9,1077623.0,180,ELDRIDGE STREET,MANHATTAN,415,12,...,,No Report Filed,,,,,4250.0,0.0,0.0,
3,TR6-913458-9A-N1,913458,Auto-Generated,9,4001141.0,41-46,50 STREET,QUEENS,134,1,...,,No Report Filed,,,,,13250.0,2000.0,1000.0,
4,TR6-913460-9A-N1,913460,Auto-Generated,9,1088779.0,220,EAST 19 STREET,MANHATTAN,899,46,...,,No Report Filed,,,,,500.0,0.0,0.0,PHILIP DEANS - OWNER - PHN# 212-673-6262EMAIL:...


In [12]:
bad_cols = bad.columns
bad_cols

Index(['TR6_NO', 'CONTROL_NO', 'FILING_TYPE', 'CYCLE', 'BIN', 'HOUSE_NO',
       'STREET_NAME', 'BOROUGH', 'BLOCK', 'LOT', 'SEQUENCE_NO', 'SUBMITTED_ON',
       'CURRENT_STATUS', 'QEWI_NAME', 'QEWI_BUS_NAME', 'QEWI_BUS_STREET_NAME',
       'QEWI_CITY', 'QEWI_STATE', 'QEWI_ZIP', 'QEWI_NYS_LIC_NO', 'OWNER_NAME',
       'OWNER_BUS_NAME', 'OWNER_BUS_STREET_NAME', 'OWNER_CITY', 'OWNER_ZIP',
       'OWNER_STATE', 'FILING_DATE', 'FILING_STATUS',
       'PRIOR_CYCLE_FILING_DATE', 'PRIOR_STATUS',
       'FIELD_INSPECTION_COMPLETED_DATE', 'QEWI_SIGNED_DATE',
       'LATE_FILING_AMT', 'FAILURE_TO_FILE_AMT', 'FAILURE_TO_COLLECT_AMT',
       'COMMENTS'],
      dtype='object')

In [13]:
score_usability(bad_cols, bad)

0.7777777777777779

In [15]:
## COMPLETENESS TEST 
def score_completeness(data):
    '''
        How much of the data is missing?
    '''
    return 1 - (np.sum(len(data) - data.count()) / np.prod(data.shape))

In [89]:
score_completeness(dob)

0.9934616667552947

In [19]:
score_completeness(mos)

1.0

In [20]:
score_completeness(bad)

0.7313523021590278

In [None]:
datasets

In [34]:
### CALCULATING THE WEIGHT DIMENSIONS

def calculate_weights(dimensions, method='sr'):
    N = len(dimensions)
    
    if method == 'sr':
        denom = np.array([ ((1 / (i + 1)) + ((N + 1 - (i + 1)) / N)) for i, x in enumerate(dimensions) ]).sum()
        weights = [ ((1 / (i + 1)) + ((N + 1 - (i + 1)) / N)) / denom for i, x in enumerate(dimensions) ]
    elif method == 'rs':
        denom = np.array([ (N + 1 - (i + 1)) for i, x in enumerate(dimensions)]).sum()
        weights = [ (N + 1 - (i + 1)) / denom for i, x in enumerate(dimensions) ]
    elif method == 'rr':
        denom = np.array([ 1 / (i + 1) for i, x in enumerate(dimensions) ]).sum()
        weights = [ (1 / (i + 1)) / denom for i, x in enumerate(dimensions) ]
    elif method == 're':
        exp = 0.2
        denom = np.array([ (N + 1 - (i + 1)) ** exp for i, x in enumerate(dimensions) ]).sum()
        weights = [ (N + 1 - (i + 1)) ** exp / denom for i, x in enumerate(dimensions) ]
    else:
        raise Exception('Invalid weighting method provided')
    
    return weights

In [35]:
dimensions = ["usability", "metadata", "freshness", "completeness", "accessibility"]
calculate_weights(dimensions)

[0.37854889589905355,
 0.24605678233438483,
 0.17665615141955834,
 0.12302839116719241,
 0.07570977917981071]

In [40]:
RESOURCE_MODEL = 'scoring-models'
MODEL_VERSION = 'v0.1.0'

RESOURCE_SCORES = 'catalogue-scorecard'

DIMENSIONS = ['usability', 'metadata', 'freshness', 'completeness', 'accessibility'] # Ranked in order

BINS = {
    'Bronze': 0.6,
    'Silver': 0.8,
    'Gold': 1,
}

In [64]:
weights = calculate_weights(DIMENSIONS)
fw = {
        'aggregation_methods': {
            'metrics_to_dimension': 'avg',
            'dimensions_to_score': 'sum_and_reciprocal'
        },
        'dimensions': [
            {
                'name': dim,
                'rank': i + 1,
                'weights': wgt,
            } for i, (dim, wgt) in enumerate(zip(DIMENSIONS, weights))
        ],
        'bins': BINS
    }
fw

{'aggregation_methods': {'metrics_to_dimension': 'avg',
  'dimensions_to_score': 'sum_and_reciprocal'},
 'dimensions': [{'name': 'usability',
   'rank': 1,
   'weights': 0.37854889589905355},
  {'name': 'metadata', 'rank': 2, 'weights': 0.24605678233438483},
  {'name': 'freshness', 'rank': 3, 'weights': 0.17665615141955834},
  {'name': 'completeness', 'rank': 4, 'weights': 0.12302839116719241},
  {'name': 'accessibility', 'rank': 5, 'weights': 0.07570977917981071}],
 'bins': {'Bronze': 0.6, 'Silver': 0.8, 'Gold': 1}}

In [90]:
# initialize list of lists
dataset = [['dob', "extract_job", score_usability(dob), score_completeness(dob)], ['mos', "extract_job", score_usability(mos), score_completeness(mos)], ['bad', "not", score_usability(bad), score_completeness(bad)]]
 
# Create the pandas DataFrame
new_df = pd.DataFrame(dataset, columns=['package', 'resource', "usability", "completeness"])
 
# print dataframe.
new_df

TypeError: score_usability() missing 1 required positional argument: 'data'

In [81]:
def score_catalogue(event={}, context={}):
   
    weights = calculate_weights(DIMENSIONS)
    fw = {
        'aggregation_methods': {
            'metrics_to_dimension': 'avg',
            'dimensions_to_score': 'sum_and_reciprocal'
        },
        'dimensions': [
            {
                'name': dim,
                'rank': i + 1,
                'weights': wgt,
            } for i, (dim, wgt) in enumerate(zip(DIMENSIONS, weights))
        ],
        'bins': BINS
    }  
    
    df = pd.DataFrame(new_df).set_index(['package', 'resource'])

    scores = pd.DataFrame([weights] * len(df.index))
    scores.index = df.index
    scores.columns = DIMENSIONS

    #scores = df.multiply(scores)

    df['score'] = scores.sum(axis=1)
    df['score_norm'] = MinMaxScaler().fit_transform(df[['score']])

    df = df.groupby('package').mean()

    
    labels = list(BINS.keys())
    
    bins = [-1]
    bins.extend(BINS.values())
    
    df['grade'] = pd.cut(df['score_norm'], bins=bins, labels=labels)
    df['grade_norm'] = pd.cut(df['score_norm'], bins=bins, labels=labels)

    df['recorded_at'] = dt.now().strftime('%Y-%m-%dT%H:%M:%SZ')
    df['version'] = MODEL_VERSION

    df = df.reset_index()
    df = df.round(2)

    return df

In [83]:
scores = score_catalogue()
scores.head()

Unnamed: 0,package,score,score_norm,grade,grade_norm,recorded_at,version
0,bad,1.0,0.0,Bronze,Bronze,2022-06-27T19:49:14Z,v0.1.0
1,data,1.0,0.0,Bronze,Bronze,2022-06-27T19:49:14Z,v0.1.0
2,mos,1.0,0.0,Bronze,Bronze,2022-06-27T19:49:14Z,v0.1.0


In [77]:
df = pd.DataFrame(new_df)
scores = pd.DataFrame([weights] * len(df.index))
scores.index = df.index
scores.columns = DIMENSIONS
scores = df.multiply(scores)

df['score'] = scores.sum(axis=1)
df['score_norm'] = MinMaxScaler().fit_transform(df[['score']])

labels = list(BINS.keys())
    
bins = [-1]
bins.extend(BINS.values())
    
df['grade'] = pd.cut(df['score_norm'], bins=bins, labels=labels)
df['grade_norm'] = pd.cut(df['score_norm'], bins=bins, labels=labels)

df['recorded_at'] = dt.now().strftime('%Y-%m-%dT%H:%M:%SZ')
df['version'] = MODEL_VERSION

df

Unnamed: 0,package,resource,score,score_norm,grade,grade_norm,recorded_at,version
0,data,extract_job,0.0,0.0,Bronze,Bronze,2022-06-27T19:46:32Z,v0.1.0
1,mos,extract_job,0.0,0.0,Bronze,Bronze,2022-06-27T19:46:32Z,v0.1.0
2,bad,not,0.0,0.0,Bronze,Bronze,2022-06-27T19:46:32Z,v0.1.0
