In [1]:
# ---------------------------------------------------------------------------
# Notebook parameters
# ---------------------------------------------------------------------------

# --- Triple generation ------------------------------------------------------
# Directory that contains the KGTK subgraph files.
kgtk_path = '/Users/rijulvohra/Documents/work/Novartis-ISI/kgtk_development/data/Q28885102'
# Properties file handed to `kgtk generate_wikidata_triples -pf`.
properties_file_path = './properties.tsv'
# Names of the files written into kgtk_path: concatenated input, generated
# triples, and the triple-generation log.
output_filename = 'pharma_product_concat.tsv.gz'
triple_filename = 'pharma_triple.ttl'
triple_generation_log = 'pharma_log.txt'

# --- Blazegraph loading -----------------------------------------------------
# Ports, kept as strings because they are exported as environment variables.
wikibase_ui_port = '10001'
wikibase_sparql_port = '10002'
wikibase_proxy_port = '10003'
wikibase_qs_port = '10005'
wikibase_volume = '.'
blazegraph_image = 'wikibase/wdqs:0.3.10'
# docker-compose project name.
docker_name = 'blazegraphpipeline'
# create_new: bring the docker-compose stack up before loading.
create_new = False
# stop_docker: 'Yes'/'yes' tears the stack down instead of loading.
stop_docker = 'No'
# Path of the gzipped TTL dump to load (used by the load-only mode).
ttl_path = ''

# --- Mode switches ----------------------------------------------------------
# Select which part of the pipeline runs: triple generation only, loading
# into Blazegraph only, or both in sequence.
only_gen_triples = False
only_load_triples = False
both_gen_and_load_triples = False

In [2]:
import os
import re
import subprocess
import gzip
import subprocess
import socket
import sys
import shutil
import time
# Anchor the configured locations at the directory the notebook runs from.
dirname = os.path.abspath('')
kgtk_path = os.path.join(dirname, kgtk_path)
# NOTE(review): this replaces whatever `wikibase_volume` was set to in the
# parameters cell with <dirname>/volume -- confirm that is intended.
wikibase_volume = os.path.join(dirname, 'volume')
print(kgtk_path, wikibase_volume, sep='\n')

/Users/rijulvohra/Documents/work/Novartis-ISI/kgtk_development/data/Q28885102
/Users/rijulvohra/Documents/work/Novartis-ISI/blazegraph-load-Noartis-ISI-pipeline/blazegraph-load-pipeline/blazegraph_load_pipeline_2/volume


In [None]:
!python -m spacy download en_core_web_sm

### Generate Wikidata triples

In [None]:
def get_uncompressed_size(file):
    """Return the uncompressed size in bytes that ``gzip -l`` reports for ``file``.

    Args:
        file: path of a gzip-compressed file.

    Returns:
        The value of the "uncompressed" column of ``gzip -l`` as an int.

    Raises:
        subprocess.CalledProcessError: if ``gzip -l`` exits with a non-zero status.
    """
    # An argument list (no shell) passes the path through verbatim, so names
    # containing spaces or shell metacharacters are handled correctly.
    listing = subprocess.check_output(['gzip', '-l', '--', file], universal_newlines=True)
    # The output is a header line followed by one row:
    #   compressed  uncompressed  ratio  uncompressed_name
    rows = listing.splitlines()
    # Index instead of unpacking four fields: the trailing name column may
    # itself contain whitespace.
    fields = rows[1].split()
    return int(fields[1])

In [None]:
%%time
def find_files(path):
    """Return the non-empty KGTK ``Q*.gz`` files found directly inside ``path``.

    Files whose name matches the statistics or P279star patterns are skipped,
    as are files whose uncompressed size is reported as 0.

    Args:
        path: directory to scan (not recursive).

    Returns:
        List of full file paths, in sorted file-name order.
    """
    # NOTE(review): the dots in these patterns are regex wildcards, not
    # literal dots; kept as-is so the set of skipped files does not change.
    skip_patterns = ('.statistics.tsv.gz', '.P279star.tsv.gz')
    kgtk_files = []
    # os.listdir() order is arbitrary; sort so repeated runs see the same order.
    for file_name in sorted(os.listdir(path)):
        if not (file_name.startswith('Q') and file_name.endswith('.gz')):
            continue
        if any(re.search(pattern, file_name) for pattern in skip_patterns):
            continue
        # Build the path from the `path` argument rather than a notebook global.
        full_path = os.path.join(path, file_name)
        if get_uncompressed_size(full_path):
            kgtk_files.append(full_path)
    return kgtk_files
# Collect the KGTK files of the subgraph directory that will be concatenated.
kgtk_files = find_files(kgtk_path)

In [None]:
%%time
def generate_concat_files(kgtk_files, path, output_filename):
    """Concatenate ``kgtk_files`` into ``path/output_filename`` with ``kgtk cat``.

    Args:
        kgtk_files: paths of the KGTK files to concatenate.
        path: directory the concatenated file is written to.
        output_filename: name of the concatenated file inside ``path``.

    Raises:
        ValueError: if ``kgtk_files`` is empty.
        subprocess.CalledProcessError: if ``kgtk cat`` exits with a non-zero status.
    """
    if not kgtk_files:
        raise ValueError('no KGTK files to concatenate')
    # Write next to the `path` argument rather than a notebook global.
    output_concat = os.path.join(path, output_filename)
    # Argument list (no shell): file names are passed verbatim, whatever
    # characters they contain.
    command = ['kgtk', 'cat', '-o', output_concat, '-i'] + list(kgtk_files)
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                               universal_newlines=True)
    # Echo the tool's output into the cell before surfacing any failure.
    if completed.stdout:
        print(completed.stdout, end='')
    completed.check_returncode()


In [None]:
# Generate-only mode: concatenate the KGTK files, convert them to Wikidata
# triples and gzip the result. Nothing is loaded into Blazegraph here.
if only_gen_triples:
    generate_concat_files(kgtk_files, kgtk_path, output_filename)

    # All inputs and outputs of this step live inside kgtk_path.
    gen_triple_input = os.path.join(kgtk_path,output_filename)
    triple_output_save_path = os.path.join(kgtk_path,triple_filename)
    log_save_path = os.path.join(kgtk_path,triple_generation_log)
    # Stream the concatenated file into kgtk and redirect the generated
    # triples to triple_output_save_path.
    !time cat $gen_triple_input | kgtk generate_wikidata_triples -ap aliases,alias -lp label -dp description \
                                                                 -pf $properties_file_path \
                                                                 -n 100000 \
                                                                 --debug \
                                                                 -gt yes -gz yes -w yes \
                                                                 -log $log_save_path > $triple_output_save_path
    # Compress in place (-f overwrites an existing .gz from an earlier run).
    !gzip -f $triple_output_save_path

### Load Triples

In [None]:
# Exception Functions
class PortInUseError(Exception):
    """Raised when a port needed by the Wikibase/Blazegraph stack is already in use.

    Derives from ``Exception`` (not ``BaseException``) so ordinary
    ``except Exception`` handlers see it.
    """
    def __init__(self,value):
        super().__init__(value)
        # Human-readable description of which port is taken.
        self.value = value


class DockerNameInUse(Exception):
    """Raised when a running docker container already matches the requested name.

    Derives from ``Exception`` (not ``BaseException``) so ordinary
    ``except Exception`` handlers see it.
    """

    def __init__(self,value):
        super().__init__(value)
        # Human-readable hint for the user.
        self.value = value


In [None]:
class BlazegraphLoad():
    """Bring the Wikibase/Blazegraph docker-compose stack up or down and load a TTL dump.

    The constructor exports the port, volume and image settings as environment
    variables (``WIKIBASE_*`` and ``BLAZEGRAPH_IMAGE``); child processes such as
    ``docker-compose`` inherit them. ``driver_fn`` runs the actual pipeline.
    """

    # docker-compose file used for both `up` and `down`.
    _COMPOSE_FILE = 'docker-compose.pipeline.yml'
    # Name the dump is given inside <volume>/mungeOut before loading.
    _DUMP_NAME = 'wikidump-000000001.ttl.gz'
    # Pause between copying the dump and calling loadData.sh.
    # NOTE(review): presumably gives the containers time to start -- confirm.
    _STARTUP_WAIT_SECONDS = 40

    def __init__(self,ttl_path,wikibase_ui_port,wikibase_sparql,wikibase_proxy,wikibase_qs,wikibase_volume,
                 create_new,docker_name,stop_docker,blazegraph_image):
        """Store the settings and export them to the process environment.

        Args:
            ttl_path: gzipped TTL dump to load; relative paths are resolved
                against the current working directory.
            wikibase_ui_port, wikibase_sparql, wikibase_proxy, wikibase_qs:
                port numbers (int or str).
            wikibase_volume: directory that holds the ``mungeOut`` folder.
            create_new: when true, ``driver_fn`` starts the stack first.
            docker_name: docker-compose project name.
            stop_docker: ``'Yes'``/``'yes'`` makes ``driver_fn`` tear the stack
                down instead of loading.
            blazegraph_image: image name exported as ``BLAZEGRAPH_IMAGE``.
        """
        # Resolved here so the class does not depend on a notebook-level
        # `dirname` variable being defined.
        self.ttl_path = os.path.join(os.path.abspath(''), ttl_path)
        self.wikibase_ui_port = str(wikibase_ui_port)
        self.wikibase_sparql = str(wikibase_sparql)
        self.wikibase_proxy = str(wikibase_proxy)
        self.wikibase_qs = str(wikibase_qs)
        self.wikibase_volume = wikibase_volume
        self.create_new = create_new
        self.docker_name = docker_name
        self.stop_docker = stop_docker
        self.blazegraph_image = blazegraph_image
        os.environ['WIKIBASE_UI'] = self.wikibase_ui_port
        os.environ['WIKIBASE_SPARQL'] = self.wikibase_sparql
        os.environ['WIKIBASE_PROXY'] = self.wikibase_proxy
        os.environ['WIKIBASE_QS'] = self.wikibase_qs
        os.environ['WIKIBASE_VOLUME'] = self.wikibase_volume
        os.environ['BLAZEGRAPH_IMAGE'] = self.blazegraph_image

    @staticmethod
    def _port_in_use(port):
        """Return True when something accepts TCP connections on localhost:``port``."""
        # A socket cannot be reused once connect_ex() has been called on it,
        # so every probe gets a fresh one.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            return probe.connect_ex(('localhost', int(port))) == 0

    @staticmethod
    def check_availability(container_name=None, require_free_ports=None):
        """Verify that the configured ports and the container name are free.

        Ports are read from the ``WIKIBASE_*`` environment variables set by
        the constructor.

        Args:
            container_name: name to look for among running containers;
                defaults to the notebook-level ``docker_name``.
            require_free_ports: when true, the four ports must be free;
                defaults to the notebook-level ``create_new``.

        Returns:
            True when every check passes.

        Raises:
            PortInUseError: a required port already accepts connections.
            DockerNameInUse: a running container matches ``container_name``.
        """
        # Fall back to the notebook globals so argument-less calls keep working.
        if container_name is None:
            container_name = docker_name
        if require_free_ports is None:
            require_free_ports = create_new

        if require_free_ports:
            port_settings = (
                ('WIKIBASE_UI', 'Wikibase UI Port is in use'),
                ('WIKIBASE_SPARQL', 'Wikibase Sparql Port is in use'),
                ('WIKIBASE_PROXY', 'Wikibase Proxy Port is in use'),
                ('WIKIBASE_QS', 'Wikibase QS Port is in use'),
            )
            for env_var, message in port_settings:
                if BlazegraphLoad._port_in_use(os.getenv(env_var)):
                    raise PortInUseError(message)

        # --quiet prints one container id per match and no header, so any
        # output at all means the name is taken (no byte-count heuristics).
        listing = subprocess.run(
            ['docker', 'ps', '--quiet', '--filter', 'name={}'.format(container_name)],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        if listing.stdout.strip():
            raise DockerNameInUse('Try changing docker container name')
        return True

    @staticmethod
    def load_data(container_name=None):
        """Run ``loadData.sh`` inside the ``<name>_wdqs_1`` container and print its output.

        Args:
            container_name: docker-compose project name; defaults to the
                notebook-level ``docker_name``.
        """
        if container_name is None:
            container_name = docker_name
        # NOTE(review): Compose v2 names containers `<project>-wdqs-1` by
        # default; confirm which naming scheme the deployment uses.
        l_data = subprocess.Popen(
            ['docker', 'exec', '{}_wdqs_1'.format(container_name), '/wdqs/loadData.sh', '-n', 'wdq', '-d',
             '/instancestore/wikibase/mungeOut'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        output = l_data.communicate()[0]
        # Decode so the log is readable instead of a b'...' repr.
        print(output.decode('utf-8', errors='replace'))

    def driver_fn(self):
        """Run the pipeline: optionally start the stack, optionally stop it, else load.

        If ``stop_docker`` is ``'Yes'``/``'yes'`` the stack is torn down and the
        method returns without loading. Otherwise the dump is copied into
        ``<volume>/mungeOut``, loaded, and the copy is removed again.

        Raises:
            PortInUseError, DockerNameInUse: from ``check_availability`` when
                ``create_new`` is set.
            subprocess.CalledProcessError: if ``docker-compose up`` fails.
        """
        compose_cmd = ['docker-compose', '-f', self._COMPOSE_FILE, '-p', self.docker_name]

        if self.create_new:
            if self.check_availability(self.docker_name, self.create_new):
                # check=True: continuing after a failed start would only
                # produce a confusing loader error later.
                subprocess.run(compose_cmd + ['up', '-d'],
                               stdin=subprocess.PIPE, stdout=subprocess.PIPE, check=True)

        if self.stop_docker in ('Yes', 'yes'):
            subprocess.run(compose_cmd + ['down', '-v'],
                           stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            # A successful teardown is not an error: return instead of
            # raising SystemExit(1).
            return

        munge_dir = os.path.join(self.wikibase_volume, 'mungeOut')
        os.makedirs(munge_dir, exist_ok=True)
        dump_path = os.path.join(munge_dir, self._DUMP_NAME)
        shutil.copy(self.ttl_path, dump_path)

        try:
            time.sleep(self._STARTUP_WAIT_SECONDS)
            self.load_data(self.docker_name)
        finally:
            # Remove the copy even when loading fails, so a stale dump is
            # never left behind for the next run.
            os.remove(dump_path)

In [None]:
# Load-only mode: push the dump at `ttl_path` into Blazegraph without
# regenerating any triples.
if only_load_triples:
    loader_obj = BlazegraphLoad(
        ttl_path=ttl_path,
        wikibase_ui_port=wikibase_ui_port,
        wikibase_sparql=wikibase_sparql_port,
        wikibase_proxy=wikibase_proxy_port,
        wikibase_qs=wikibase_qs_port,
        wikibase_volume=wikibase_volume,
        create_new=create_new,
        docker_name=docker_name,
        stop_docker=stop_docker,
        blazegraph_image=blazegraph_image,
    )
    loader_obj.driver_fn()

In [None]:
# Run both the pipeline
if both_gen_and_load_triples:
    generate_concat_files(kgtk_files, kgtk_path, output_filename)

    gen_triple_input = os.path.join(kgtk_path,output_filename)
    triple_output_save_path = os.path.join(kgtk_path,triple_filename)
    log_save_path = os.path.join(kgtk_path,triple_generation_log)
    !time cat $gen_triple_input | kgtk generate_wikidata_triples -ap aliases,alias -lp label -dp description \
                                                                 -pf $properties_file_path \
                                                                 -n 100000 \
                                                                 --debug \
                                                                 -gt yes -gz yes -w yes \
                                                                 -log $log_save_path > $triple_output_save_path
    ttl_path = triple_output_save_path + '.gz'
    
    loader_obj = BlazegraphLoad(ttl_path,wikibase_ui_port,wikibase_sparql_port,wikibase_proxy_port,wikibase_qs_port,
                                    wikibase_volume,create_new,docker_name,stop_docker,blazegraph_image)
    loader_obj.driver_fn()