In [None]:
import pandas as pd
import polars as pl
import numpy as np
import networkx as nx
import json
import time
import os
from rtsvg import *
# Shared RACETrack instance used by the cells below (rt.ontologyFrameworkInstance, rt.histogram, rt.tile).
# RACETrack is not imported by name -- presumably it is provided by the star import of rtsvg above.
rt = RACETrack()

In [None]:
_xform_map_ = '''
__id__              = '$[*]._id'              | MovieID      | uniq
__director__        = '$[*].director.name_id' | DirectorID   | uniq
__castmember__      = '$[*].cast.[*].name_id' | CastMemberID | uniq
__director__        --- "directedMovie"  --- __id__                                              ^^^ "imdb_600k_international_movies"
__id__              --- "hasLabel"       --- '$[*].name'                   | xsd:string          ^^^ "imdb_600k_international_movies"
__id__              --- "yearReleased"   --- fixYear('$[*].year')          | xsd:date     | yyyy ^^^ "imdb_600k_international_movies"
__id__              --- "runTime"        --- '$[*].runtime'                | xsd:duration | dura ^^^ "imdb_600k_international_movies"
__id__              --- "hasGenre"       --- stripString('$[*].genre[*]')  | xsd.string   | cata ^^^ "imdb_600k_international_movies"
__id__              --- "ratingValue"    --- '$[*].ratingValue'            | xsd:float    | valu ^^^ "imdb_600k_international_movies"
__id__              --- "summary"        --- '$[*].summary_text'           | xsd:string   | cont ^^^ "imdb_600k_international_movies"
__director__        --- "hasLabel"       --- '$[*].director.name'          | xsd:string   | ambi ^^^ "imdb_600k_international_movies"
__castmember__      --- "castMemberOf"   --- __id__                                              ^^^ "imdb_600k_international_movies"
__castmember__      --- "hasLabel"       --- '$[*].cast.[*].name'          | xsd:string   | ambi ^^^ "imdb_600k_international_movies"
'''

def fixYear(s):
    """Extract a four-digit release year from a raw year string.

    Recognised shapes:
      * a plain year                 -- '2019'
      * a leading qualifier          -- 'I) (2019'       (everything up to the last ') (' is dropped)
      * a trailing TV qualifier      -- '2019 TV Movie'  ('TV Special', 'TV Movie' or 'TV Short')

    Parameters
    ----------
    s : str or None
        Raw year text.

    Returns
    -------
    str or None
        The year as exactly four ASCII digits, or None when no such year can be
        extracted (this includes non-string input such as None).
    """
    # A missing / non-text value cannot contain a year; avoid raising on `in`.
    if not isinstance(s, str):
        return None
    # Keep only the text after the last ') (' separator.
    if ') (' in s:
        s = s[s.rindex(') (') + 3:]
    # Remove at most one trailing TV qualifier, then fall through to the same
    # validation as a plain year.  Previously these branches returned the
    # remainder unchecked, so 'TV Movie' produced '' and '20x9 TV Movie' produced '20x9'.
    for _suffix_ in ('TV Special', 'TV Movie', 'TV Short'):
        if s.endswith(_suffix_):
            s = s[:-len(_suffix_)].strip()
            break
    # Explicit ASCII digit check (str.isdigit() would also accept non-ASCII digits).
    if len(s) == 4 and all(_ch_ in '0123456789' for _ch_ in s):
        return s
    return None

def stripString(s):
    """Return ``s`` with leading and trailing whitespace removed."""
    trimmed = s.strip()
    return trimmed
# Name -> callable lookup for the helper functions; passed to the framework as its `funcs` argument.
fns = dict(stripString=stripString, fixYear=fixYear)

#
# Load every movie JSON file, apply the transform template, and write the result to disk.
# (Author's timing note: 32.8s end-to-end now...)
#
_base_ = '../../../data/kaggle_imdb_600k/international-movies-json/'
# os.listdir() returns entries in arbitrary order; sort so the documents are
# always parsed in the same order from run to run.
_files_ = sorted(os.listdir(_base_))
print(f'Loading {len(_files_)} files...')
t0 = time.time()
_jsons_ = []
for _file_ in _files_:
    # The context manager closes each handle as soon as it has been read (the
    # previous open(...).read() left closing to the garbage collector).
    # JSON text is UTF-8, so do not depend on the platform's default encoding.
    with open(os.path.join(_base_, _file_), encoding='utf-8') as _fh_:
        _jsons_.append(json.load(_fh_))

print('Applying template to json...')
ofv = rt.ontologyFrameworkInstance(xform_spec=_xform_map_, labeling_verbs={'hasLabel'}, funcs=fns)
ofv.parse(_jsons_)
t1 = time.time()
print(f'Loaded and parsed in {t1 - t0:.1f}s')

# Write to disk
print('Writing to disk...')
ofv.to_files('../../../data/kaggle_imdb_600k/20240519_ontology')

# Number of triples per verb
for k, k_df in ofv.df_triples.group_by(['vrb']):
    print(k, len(k_df))

In [None]:
# Recorded per-verb triple counts from an example run; kept for reference only
# (not used by any of the visible cells).
_example_run_ = '''
('ratingValue',)    633719
('runTime',)        633719
('hasGenre',)       824509
('hasLabel',)      3202774
('castMemberOf',)  1977961
('yearReleased',)   633719
('directedMovie',)  591094
('summary',)        633719
'''
# Tile two histograms of the triples, binned by 'vrb' and by 'stype', both colored by 'stype'.
rt.tile([rt.histogram(ofv.df_triples, bin_by=_field_, color_by='stype', h=180, w=128)
         for _field_ in ('vrb', 'stype')])

In [None]:
# Total number of triples (row count of the triples frame).
ofv.df_triples.shape[0]

In [None]:
# Print how many validation errors the framework holds, then display the
# errors themselves as the cell's output.
print(len(ofv.validation_errors))
ofv.validation_errors