In [1]:
# One-time environment setup (disabled): uncomment to install the Graphviz Python bindings.
#%sh
#conda install python-graphviz

In [2]:
import inspect
import json

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from graphviz import Digraph, Graph

%matplotlib inline

In [3]:
# Serialize result objects to JSON
class ObjectEncoder(json.JSONEncoder):
    """JSON encoder with fallbacks for objects the standard encoder rejects.

    Conversion rules applied by ``default``:
      * an object exposing ``to_json`` is replaced by the result of that call;
      * any other object with a ``__dict__`` is replaced by a dict of its
        non-dunder, non-callable members as reported by ``inspect.getmembers``;
      * everything else is rejected with ``TypeError`` by the base class.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: The value the encoder could not serialize natively.

        Returns:
            The converted value (see the class docstring for the rules).

        Raises:
            TypeError: if ``obj`` matches none of the conversion rules.
        """
        if hasattr(obj, "to_json"):
            return self.default(obj.to_json())
        if hasattr(obj, "__dict__"):
            # Keep data members only; methods, builtins, generators and other
            # routines are behaviour and are left out of the output.
            return dict(
                (key, value)
                for key, value in inspect.getmembers(obj)
                if not key.startswith("__")
                and not inspect.isabstract(value)
                and not inspect.isbuiltin(value)
                and not inspect.isfunction(value)
                and not inspect.isgenerator(value)
                and not inspect.isgeneratorfunction(value)
                and not inspect.ismethod(value)
                and not inspect.ismethoddescriptor(value)
                and not inspect.isroutine(value)
            )
        if obj is None or isinstance(obj, (str, int, float, bool, list, tuple, dict)):
            # Natively serializable values arrive here only through the
            # recursive call on a to_json() result; pass them through as-is.
            return obj
        # Handing an unserializable object back to the encoder would make it
        # fail with a misleading "Circular reference detected" ValueError, so
        # let the base class raise its descriptive TypeError instead.
        return super().default(obj)

In [4]:
# Read the schema_discovery JSON object located at the given path
def get_schema_discovery(schema_discovery_file_location):
    """Copy the schema discovery JSON into DBFS and return its parsed content.

    Args:
        schema_discovery_file_location: Source location of the JSON file, in
            any form accepted by ``dbutils.fs.cp``.

    Returns:
        The object produced by ``json.load`` for the copied file.
    """
    # dbutils.fs resolves paths against the DBFS root, whereas Python's open()
    # reaches DBFS through the local /dbfs mount, so the same file needs two
    # spellings. Using "/dbfs/..." as the copy target would land the file at
    # dbfs:/dbfs/..., which open("/dbfs/...") cannot see.
    dbfs_copy_path = "dbfs:/tmp/schema_discovery.json"
    local_copy_path = "/dbfs/tmp/schema_discovery.json"

    # The storage account key is pulled from the secret scope, never hardcoded.
    spark.conf.set(
        "fs.azure.account.key.homecredittest01.blob.core.windows.net",
        dbutils.secrets.get(scope="blobs", key="hcaccesskey"),
    )
    dbutils.fs.cp(schema_discovery_file_location, dbfs_copy_path)

    # Read back exactly the file that was just copied so the result always
    # reflects schema_discovery_file_location.
    with open(local_copy_path, 'r', encoding='utf-8') as f:
        schema_discovery = json.load(f)

    return schema_discovery

# Load the schema discovery result from the wasbs:// location below into the
# global `schema_discovery`, which the following cells read.
schema_discovery_file_location = "wasbs://hc-test-data-01@homecredittest01.blob.core.windows.net/hc-test-01/schema_discovery.json"
schema_discovery = get_schema_discovery(schema_discovery_file_location)

In [5]:
# Print the name of every table in the discovery result, one per line.
for table_name in (entry['name'] for entry in schema_discovery['tables']):
    print(table_name)

In [6]:
table_to_show = 'credit_card_balance'

# One row per column of the selected table: name, raw type, type, and a
# textual primary-key / foreign-key marker including the key's source.
data = [
    [
        col['name'],
        col['rawType'],
        col['type'],
        'is PK - ' + col['PKsource'] if col['isPK'] else 'not PK',
        'is FK - ' + col['FKsource'] if col['isFK'] else 'not FK',
    ]
    for table in schema_discovery['tables']
    if table['name'] == table_to_show
    for col in table['columns']
]

column_headers = ['column_name', 'column_raw_type', 'column_type', 'is_PK', 'is_FK']
df = pd.DataFrame(data=data, columns=column_headers)
df

In [7]:
def add_details_to_dep_dict(dep_dict, table_name, column_name):
    """Register ``column_name`` under ``table_name`` in ``dep_dict`` (in place).

    A missing table gets a new empty column list; a column already listed for
    the table is not added a second time. Returns ``None``.
    """
    known_columns = dep_dict.setdefault(table_name, [])
    if column_name in known_columns:
        return
    known_columns.append(column_name)

def extract_nodes_and_edges(schema_discovery):
    """Turn the ``dependencies`` list of a schema discovery result into graph parts.

    Returns:
        A tuple ``(nodes, edges)`` where ``nodes`` maps each table name to the
        list of its columns that take part in a dependency, and ``edges`` is a
        list of ``['table:column', 'table:column', dependency_source,
        left_cardinality, right_cardinality]`` entries, one per dependency.
    """
    nodes_by_table = {}
    edges = []

    for dependency in schema_discovery['dependencies']:
        source = dependency['dependencySource']

        # Pull (table, column, cardinality) for the left side, then the right.
        endpoints = []
        for side in ('left', 'right'):
            side_info = dependency[side]
            endpoints.append(
                (side_info['tableName'], side_info['columnName'], side_info['cardinalityType'])
            )

        for table, column, _cardinality in endpoints:
            add_details_to_dep_dict(nodes_by_table, table, column)

        (left_table, left_column, left_cardi), (right_table, right_column, right_cardi) = endpoints
        edges.append([
            left_table + ':' + left_column,
            right_table + ':' + right_column,
            source,
            left_cardi,
            right_cardi,
        ])

    return nodes_by_table, edges
  
def create_dependency_graph(dep_nodes_dict, dep_edges):
    """Draw the table/column dependency graph with Graphviz and render it as PNG.

    Args:
        dep_nodes_dict: Mapping of table name to a list of column names. Each
            table becomes one record-shaped node with one port per column.
        dep_edges: Sequence of edges shaped like
            ``[left_port, right_port, source, left_cardinality, right_cardinality]``,
            where the ports are ``'table:column'`` strings, ``source`` is
            ``'Metadata'`` or ``'Discovered'`` and each cardinality is
            ``'Many'`` or ``'One'``.

    Raises:
        KeyError: if an edge carries a source or cardinality outside the
            values listed above.
    """
    s = Digraph('structs', node_attr={'shape': 'record'}, edge_attr={'tailclip': 'false'}, format='png')
    # Legend node explaining the edge colours.
    s.node('anoot', r'{{Discovered - Blue}|{Metadata - Red}}')

    for table_node, col_names in dep_nodes_dict.items():
        # Record label: table name on top, then one named port per column.
        ports = '|'.join('{<%s> %s}' % (col_name, col_name) for col_name in col_names)
        s.node(table_node, r'{%s |' % (table_node) + ports + '}')

    colors_source_dict = {'Metadata': 'red', 'Discovered': 'blue'}
    shape_cardi_dict = {'Many': 'crow', 'One': 'tee'}
    for edge in dep_edges:
        left_port, right_port, dep_source, cardi_left, cardi_right = edge[:5]
        # The edge runs left -> right, so the tail sits on the left column and
        # the head on the right column; each end shows its own cardinality.
        s.edge(
            left_port,
            right_port,
            color=colors_source_dict[dep_source],
            arrowhead=shape_cardi_dict[cardi_right],
            arrowtail=shape_cardi_dict[cardi_left],
            dir='both',
        )

    # Renders under the base filename 'p' (PNG format) and opens the result.
    # NOTE(review): on a headless cluster s.render('p', view=False) may be the
    # safer call — confirm against the target environment.
    s.view('p')

# Build the node/edge description from the discovery result and draw the graph.
dep_nodes_dict, dep_edges = extract_nodes_and_edges(schema_discovery)
create_dependency_graph(dep_nodes_dict, dep_edges)

# Show the rendered image inline. 'p.png' is the file expected from
# create_dependency_graph (base filename 'p', format 'png').
img = mpimg.imread('p.png')
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(img)
plt.show()

In [8]:
# Build a directed, table-level graph from the column relationships:
#   * Equal/Overlap on both sides -> edges in both directions
#   * Contains/Contained          -> one edge from table 1 to table 2
#   * Contained/Contains          -> one edge from table 2 to table 1
# NOTE(review): 'Equel' is the spelling the original comparison used; the
# correctly spelled 'Equal' is accepted as well — confirm which one the
# discovery output actually emits.
equal_relationship_types = ('Equel', 'Equal')

g = nx.DiGraph()
for dep in schema_discovery['dependencies']:
    table_name_1 = dep['left']['tableName']
    table_name_2 = dep['right']['tableName']
    relationship_col_1 = dep['left']['relationshipType']
    # Each side carries its own relationship type; reading the left side twice
    # would make the Contains/Contained branches below unreachable.
    relationship_col_2 = dep['right']['relationshipType']

    both_equal = (
        relationship_col_1 in equal_relationship_types
        and relationship_col_2 in equal_relationship_types
    )
    both_overlap = relationship_col_1 == 'Overlap' and relationship_col_2 == 'Overlap'

    if both_equal or both_overlap:
        g.add_edge(table_name_1, table_name_2)
        g.add_edge(table_name_2, table_name_1)
        print('EQUAL OR OVERLAP ' + table_name_1 + ' ' + table_name_2 + ' ' + relationship_col_1 + ' ' + relationship_col_2)
    elif relationship_col_1 == 'Contains' and relationship_col_2 == 'Contained':
        g.add_edge(table_name_1, table_name_2)
        print('Contains & Contained ' + table_name_1 + ' ' + table_name_2)
    elif relationship_col_1 == 'Contained' and relationship_col_2 == 'Contains':
        g.add_edge(table_name_2, table_name_1)
        print('Contains & Contained ' + table_name_2 + ' ' + table_name_1)
    else:
        print('NOR FOUND ' + table_name_1 + ' ' + table_name_2 + ' ' + relationship_col_1 + ' ' + relationship_col_2)


# Materialize the in-degree view as a plain {table_name: in_degree} dict so
# the variable really holds what its name says.
in_degree_dict = dict(g.in_degree())

In [9]:
# Display the in-degree recorded for each table node of graph `g`.
in_degree_dict