# Create data for fullchain tests

See: finnet-pipeline/docker-tests/fullchain/create_data.py

To add packages, append them to `dags/requirements_py3.txt`, then run `!pip3 install -r /usr/local/dags/requirements_py3.txt` (the commented-out command in the next cell).

In [1]:
# !pip3 install -r /usr/local/dags/requirements_py3.txt

## Init Spark

In [2]:
# Point findspark at the Spark installation under /usr/local/spark so that the
# `pyspark` imports in the following cells resolve.
import findspark
findspark.init("/usr/local/spark")

In [3]:
from pyspark import SparkContext

# Stop the current SparkContext: the test assumes no context exists when it
# starts. getOrCreate() returns the running context (or creates one), which is
# then shut down immediately.
sc = SparkContext.getOrCreate()
sc.stop()

## Imports and Env

In [4]:
import sys
sys.path.insert(0, "/usr/local/dags")

import json
import os
import re

from pyspark import SparkConf
from pyspark.sql import SQLContext, SparkSession

from fncore_py3.utils.graph_specification import GraphSpec
from fncore_py3.utils.spark_tools import get_spark_context

In [5]:
# HDFS destination folder and storage format for the generated test tables.
DATA_PATH = "/datasets/finnet"
DATA_FORMAT = "parquet"

# Graph specs (*.json) and the sample CSV files are read from ./data, relative
# to the notebook's working directory.
LOCAL_DATA_PATH = os.path.join(os.getcwd(), "data")

# Spark settings handed to get_spark_context() when the data is written.
config = {
    'SparkConfiguration': SparkConf().set(
        "spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation",
        "true",
    ),
}

# Naming pieces for Hive tables; in this notebook they are referenced only by
# the commented-out saveAsTable variant of the write step.
hiveDBName = "default"
hiveFolderSep = "__"

# Bolt URI of the graph database, exported through the environment
# (presumably read by the fncore code -- TODO confirm).
os.environ["GRAPH_DB"] = "bolt://neo4j:test@neo4j:7687"

## Get graph specs

In [6]:
# Purge the HDFS output folder (DATA_PATH) so the data is regenerated from
# scratch; `-rm -r` removes the folder recursively.
!hdfs dfs -rm -r $DATA_PATH

Deleted /datasets/finnet


In [7]:
# os.listdir() returns names in arbitrary order, so sort them: the specs are
# then processed in a reproducible order, and the spec that remains in scope
# after the load loop (inspected by the later cells) is always the same one.
data_list = sorted(os.listdir(LOCAL_DATA_PATH))
# Graph specs are the JSON files; a plain suffix test replaces the regex.
json_list = [name for name in data_list if name.endswith('.json')]

In [8]:
with get_spark_context(config['SparkConfiguration']) as spark_ctx:
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

    # One JSON graph spec per iteration: parse it, then copy every table it
    # references from the local CSV samples into HDFS.
    for graph_spec in json_list:
        spec_path = os.path.join(LOCAL_DATA_PATH, graph_spec)
        with open(spec_path, 'r') as f:
            spec = json.load(f)
        spec_model = GraphSpec.from_dict(spec)

        tables = spec_model.table_details
        graph_name = spec_model.name

        # Keys of tables['tables'] are (source name, safe name) pairs.
        for (source_table, safe_table), columns in tables['tables'].items():
            # Local sample file: file://<LOCAL_DATA_PATH>/<source_table>.csv
            filepath = 'file://{}.csv'.format(
                os.path.join(LOCAL_DATA_PATH, str(source_table))
            )

            # Read the CSV with a header row and schema inference disabled.
            reader = spark.read.format('com.databricks.spark.csv')
            reader = reader.option('header', 'true')
            reader = reader.option('inferschema', 'false')
            data = reader.load(filepath)

            # Destination: <DATA_PATH>/<graph name>/tables/<safe table name>
            outdatapath = os.path.join(
                DATA_PATH, graph_name, 'tables', safe_table
            )
            writer = data.write.format(DATA_FORMAT)
            writer = writer.mode(saveMode='overwrite')
            writer.save(outdatapath)

            # Disabled alternative: also register each table in Hive.
#             hivetablename = ".".join([hiveDBName,hiveFolderSep.join([graph_name, 'tables', safe_table])])
#             print("Writing {}".format(hivetablename))
#             data.write.format(DATA_FORMAT).mode("overwrite").saveAsTable(hivetablename)

#     spark.sql("SHOW TABLES").show(truncate=False)

In [9]:
# Display the edge-list specs of the graph spec left in scope by the load loop.
spec_model.edge_lists

[<fncore_py3.utils.graph_specification.EdgeListSpec at 0x7ff25d629c88>,
 <fncore_py3.utils.graph_specification.EdgeListSpec at 0x7ff25d6340f0>,
 <fncore_py3.utils.graph_specification.EdgeListSpec at 0x7ff25d634518>]

In [10]:
# Select the third edge list for closer inspection in the following cells.
edge_kind = spec_model.edge_lists[2]

In [11]:
# Source side of the selected edge list: column name, labels, metadata columns.
(edge_kind.source_column.name, edge_kind.source_labels, edge_kind.source_metadata_columns)

('toffee_s',
 [],
 [<fncore_py3.utils.graph_specification.ColumnSpec at 0x7ff25d634b38>])

In [12]:
# Safe name of the first source metadata column of the selected edge list.
edge_kind.source_metadata_columns[0].safe_name

'fn_src_meta'

In [13]:
# Target side of the selected edge list: column name, labels, metadata columns.
(edge_kind.target_column.name, edge_kind.target_labels, edge_kind.target_metadata_columns)

('toffee_t', ['is_target'], [])

In [14]:
# Table details derived from the spec; the load loop above iterates over its
# 'tables' entry, whose keys are (source table, safe table) pairs.
spec_model.table_details

{'connection': 'data_uri_value',
 'poll_frequency': '0 2 * * *',
 'tables': {('test_data_chocolate_node_list',
   'fn_test_data_chocolate_node_list'): {('id', 'fn_id')},
  ('test_data_sweets_node_list',
   'fn_test_data_sweets_node_list'): {('id', 'fn_id'), ('prop', 'fn_prop')},
  ('test_data_toffee_node_list',
   'fn_test_data_toffee_node_list'): {('hide', 'fn_hide'),
   ('id', 'fn_id'),
   ('prop', 'fn_prop')},
  ('test_data_chocolate_edge_list',
   'fn_test_data_chocolate_edge_list'): {('chocolate_s',
    'fn_chocolate_s'), ('chocolate_t', 'fn_chocolate_t')},
  ('test_data_sweets_edge_list',
   'fn_test_data_sweets_edge_list'): {('sweets_s', 'fn_sweets_s'), ('sweets_t',
    'fn_sweets_t')},
  ('test_data_toffee_edge_list',
   'fn_test_data_toffee_edge_list'): {('edge_label',
    'fn_edge_label'), ('edge_prop', 'fn_edge_prop'), ('src_meta',
    'fn_src_meta'), ('toffee_s', 'fn_toffee_s'), ('toffee_t', 'fn_toffee_t')}}}

In [15]:
# "hidden" field in json is ignored as designed
# Assert the comparison itself. Wrapping it in braces builds a one-element set
# literal, which is always truthy, so such a check could never fail.
assert spec_model.node_lists[0].index_column.hidden == False  # noqa: E712
# Show the raw JSON entry for comparison with the parsed model.
spec["node_lists"][0]

{'table_name': 'test_data_chocolate_node_list',
 'name': 'chocolate nodes',
 'labels': ['chocolate'],
 'index_column': {'resolution_alias': 'chocolate',
  'variable_definition': 'String',
  'name': 'id',
  'unrecognized-key': 'value',
  'hidden': 'True'}}

In [16]:
# Display the merge_same flag of the first edge list.
spec_model.edge_lists[0].merge_same

True