In [1]:
import itertools
import os
from urllib.parse import quote

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import metrics
import networkx as nx
from networkx.algorithms.community import girvan_newman
import community
import seaborn as sns
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import MetaData
import matplotlib.pyplot as plt

%matplotlib inline

# %sh
# pip install python-louvain
# pip install sqlalchemy

In [2]:
# Connect to DB
# Follows the django database settings format. Every value can be overridden through an
# environment variable; the password is ONLY read from the environment so that no secret
# is stored in the notebook.
DATABASES = {'production':{'NAME': os.environ.get('DB_NAME', 'sandbox01'),
                           'USER': os.environ.get('DB_USER', 'datomize_admin@datomize-test'),
                           'PASSWORD': os.environ.get('DB_PASSWORD'),
                           'HOST': os.environ.get('DB_HOST', 'datomize-test.postgres.database.azure.com'),
                           'PORT': os.environ.get('DB_PORT', '5432'),
                          },}

# choose the database to use
db = DATABASES['production']
if not db['PASSWORD']:
    raise RuntimeError('Set the DB_PASSWORD environment variable before running this notebook')

# construct an engine connection string
# user and password are percent-encoded so characters such as '@', ':' or '/' cannot break the URL
engine_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
    user=quote(db['USER'], safe=''),
    password=quote(db['PASSWORD'], safe=''),
    host=db['HOST'],
    port=db['PORT'],
    database=db['NAME'],
)

# create sqlalchemy engine
engine = create_engine(engine_string)

# Reflect the DB metadata (declared primary/foreign keys). This is best-effort: when it
# fails, metadata stays None and the later cells fall back to keys discovered from the data.
metadata = None
try:
    metadata = MetaData()
    metadata.reflect(bind=engine)
except Exception as exc:
    metadata = None
    print('cant read metadata from DB')
    print(repr(exc))

data_path = '/dbfs/'

In [3]:
# Read the CSV export of a table and, if to_sample == True, return a random sample of at most sample_size records
def get_data(path_to_data, table, sample_size, to_sample=True, random_state=None):
    """Load a table's CSV file, optionally down-sampled.

    Parameters
    ----------
    path_to_data : path or file-like object handed to ``pd.read_csv``.
    table : object whose ``size`` attribute (row count, may be None) caps the sample.
    sample_size : maximal number of records to return when sampling.
    to_sample : when False the whole file is returned and ``sample_size`` is ignored.
    random_state : optional seed forwarded to ``DataFrame.sample`` for reproducible samples.

    Returns
    -------
    pandas.DataFrame with the sampled (or all) records.
    """
    data = pd.read_csv(path_to_data)

    if not to_sample:
        return data

    # Never request more rows than were actually read: DataFrame.sample raises when
    # n exceeds the population, and table.size can be None or out of date.
    row_limits = [len(data), sample_size]
    table_size = getattr(table, 'size', None)
    if table_size is not None:
        row_limits.append(table_size)

    return data.sample(n=min(row_limits), random_state=random_state)

In [4]:
# SchemaDiscoveryDTO is the root result object: it holds the tables found in the input connection
# and the dependencies found between those tables
class SchemaDiscoveryDTO:
    """Container for the discovery result of one schema.

    Attributes:
        name: the name given to the constructor.
        tables: list that starts empty; table objects are appended to it later.
        dependencies: None until a list of dependencies is assigned.
    """

    def __init__(self, name):
        self.dependencies = None
        self.tables = list()
        self.name = name
        
# TableDTO describes one table: its name, its columns, its entities (each entity groups columns)
# and its number of rows
class TableDTO:
    """Description of a single table.

    Attributes:
        name: the table name given to the constructor.
        columns: list that starts empty.
        entities: None until entities are assigned.
        size: None until the row count is assigned.
    """

    def __init__(self, table_name):
        self.size = None
        self.entities = None
        self.columns = list()
        self.name = table_name
        
# ColumnDTO describes one column: name, raw_type (int, float, object), type (label / free text /
# numeric / timestamp / identifier code) and its primary-key / foreign-key flags with their sources
class ColumnDTO:
    """Description of a single column; every constructor argument is stored as given."""

    def __init__(self, col_name, col_raw_type, col_type, is_pk, pk_source, is_fk, fk_source):
        # identity and typing
        self.name, self.rawType, self.type = col_name, col_raw_type, col_type
        # primary-key flag and the source of that decision
        self.isPK, self.PKsource = is_pk, pk_source
        # foreign-key flag and the source of that decision
        self.isFK, self.FKsource = is_fk, fk_source
        
        
# An entity is a group of columns
class EntityDTO:
    """Holds the columns that belong to one entity; the list starts empty."""

    def __init__(self):
        self.columns = list()
        
# TableRefDTO points at one column of one table and carries the cardinality of that column
class TableRefDTO:
    """One side of a dependency: table name, column name and cardinality type."""

    def __init__(self, table_name, column_name, cardinality_type):
        self.tableName, self.columnName = table_name, column_name
        self.cardinalityType = cardinality_type

# DependencyDTO links two TableRefDTO objects (left and right) that represent a dependency between
# two columns, and records the source of the dependency (taken from metadata or found by us)
class DependencyDTO:
    """A dependency between two column references plus the source it came from."""

    def __init__(self, table_left, table_right, dependency_source):
        self.left, self.right = table_left, table_right
        self.dependencySource = dependency_source

In [5]:
def read_table_data(table_name, schema_discovery, engine, path_to_save_data):
    """Read one table from the DB, save it as CSV and register it in schema_discovery.

    Tables with at most one row are ignored: nothing is written and nothing is registered.
    The registered TableDTO carries the table name and its number of rows.
    """
    data = pd.read_sql_table(table_name, engine)
    row_count = data.shape[0]
    if row_count <= 1:
        return

    data.to_csv(path_to_save_data, index=False)

    table = TableDTO(table_name)
    table.size = row_count
    schema_discovery.tables.append(table)
  
  
def get_schema_discovery_from_DB(engine, schema_name, data_path):
    """Export every table reachable through ``engine`` and return the populated schema object.

    Parameters
    ----------
    engine : SQLAlchemy engine connected to the database.
    schema_name : name stored on the returned SchemaDiscoveryDTO.
    data_path : prefix of the CSV files; each table is written to
        ``data_path + table_name + '.csv'`` by read_table_data.

    Returns
    -------
    SchemaDiscoveryDTO whose ``tables`` were filled by read_table_data.
    """
    # Engine.table_names() is deprecated/removed in recent SQLAlchemy releases;
    # the inspector API is the supported way to list tables.
    tables_names = sqlalchemy.inspect(engine).get_table_names()
    schema_discovery = SchemaDiscoveryDTO(schema_name)
    for table_name in tables_names:
        path_to_save_data = data_path + table_name + '.csv'
        read_table_data(table_name, schema_discovery, engine, path_to_save_data)

    return schema_discovery

# Build the notebook-wide schema_discovery object from the connected DB; the tables are
# exported as CSV files under data_path as part of this call.
schema_discovery = get_schema_discovery_from_DB(engine, db['NAME'], data_path)

In [6]:
# An auxiliary function that accepts basic columns_data (name & raw data type per column) and a sample of records
# (based on these columns) and calculates the final column type (label / free text / numeric / timestamp /
# identifier code / bool) and the primary-key flag of every column
def calc_col_types(columns_data, sampled_records, table_metadata=None):
    """Classify the columns of a sampled table and flag its primary key.

    Parameters
    ----------
    columns_data : DataFrame with one row per column of ``sampled_records`` (same order)
        and the columns 'col_name' and 'raw_type'.
    sampled_records : DataFrame with the sampled rows of the table.
    table_metadata : optional object whose ``columns`` expose ``name`` and ``primary_key``.
        When given, primary keys are taken from it ('Metadata'); otherwise they are
        searched in the sample ('Discovered').

    Returns
    -------
    A copy of ``columns_data`` extended with the helper statistics and with the columns
    'col_type', 'is_pk' and 'pk_source'.
    """
    sample_size = sampled_records.shape[0] # the maximal number of unique values
    long_str = 200 # A column containing strings with more than this number of characters will be considered free text column
    label_threshold = 0.2 # labels are expected to contain unique values up to this percent of the number of non empty values
    text_threshold = 0.8 # Free texts are expected to contain unique values of at least this percent of the number of non empty values
    idenifier_threshold = 4 # Identifier codes are expected to contain at least this number of characters

    res = columns_data.copy()
    res['raw_type'] = res['raw_type'].astype(str)

    # Add number of unique values per column
    res['unique_vals'] = sampled_records.nunique().values

    # Add number of non-null values per column
    res['not_null_cnt'] = sampled_records.count(axis=0).values

    res['min_digits'] = 0
    res['max_digits'] = 0
    res['is_bool'] = False
    res['is_str'] = False
    for col in sampled_records.columns:
        col_msk = res['col_name'] == col
        col_vals_as_str = sampled_records[col].dropna().astype(str)
        str_lens = col_vals_as_str.str.len()

        # Add minimal and maximal number of characters
        res.loc[col_msk, 'min_digits'] = str_lens.min()
        res.loc[col_msk, 'max_digits'] = str_lens.max()

        if ('True' in col_vals_as_str.values) or ('False' in col_vals_as_str.values):
            res.loc[col_msk, 'is_bool'] = True

        # A column without a single digit in any of its values is treated as pure text
        if not any(any(char.isdigit() for char in value) for value in col_vals_as_str.values):
            res.loc[col_msk, 'is_str'] = True

    res['col_type'] = res['raw_type'].copy()

    # Checks if the column is numeric / int (based on the raw type name)
    res['is_numeric'] = res['raw_type'].str.contains('int|float')
    res['is_int'] = res['raw_type'].str.contains('int')

    # Recognizing identifing keys as unique numbers or codes
    msk_id_code = ((res['is_str']==False) & (res['is_bool']==False) & (res['min_digits'] == res['max_digits']) &\
                   (res['min_digits']>=idenifier_threshold) & ((res['is_int']==True) | (res['raw_type'] == 'object')))
    res.loc[msk_id_code,'col_type'] = 'identifier code'

    # Recognizing labels as repeating short texts
    msk_lable = ((res['is_bool']==False) & (res['col_type'] == res['raw_type']) & (res['unique_vals']<res['not_null_cnt']*label_threshold) &\
                 (res['raw_type'] == 'object'))
    res.loc[msk_lable,'col_type'] = 'label'

    # Recognizing numeric columns as numeric columns that are not identifiers
    msk_numeric = (res['is_bool']==False) & (res['col_type'] == res['raw_type']) & (res['is_numeric'])
    res.loc[msk_numeric,'col_type'] = 'numeric'

    # Recognizing free texts as unique or long texts
    msk_free_text = ((res['is_bool']==False) & (res['col_type'] == res['raw_type']) & \
                     ((res['is_str']==True) | (((res['unique_vals']>=res['not_null_cnt']*text_threshold) | (res['max_digits']>=long_str)) & \
                                               (res['raw_type'] == 'object'))))
    res.loc[msk_free_text,'col_type'] = 'free text'

    # Recognizing bool columns as bool columns
    msk_bool = (res['is_bool'])
    res.loc[msk_bool,'col_type'] = 'bool'

    # Recognizing dates
    msk_date = (res['raw_type'].str.contains('time'))
    res.loc[msk_date,'col_type'] = 'timestamp'

    # Recognizing Primary Key
    res['is_pk'] = False
    res['pk_source'] = 'None'
    if table_metadata is not None:
        known_cols = columns_data['col_name'].values
        for col in table_metadata.columns:
            if (col.name in known_cols) and col.primary_key:
                res.loc[res['col_name'] == col.name, 'is_pk'] = True
                res.loc[res['col_name'] == col.name, 'pk_source'] = 'Metadata'

    else:
        msk_pk = ((res['col_type']=='identifier code') & (res['unique_vals'] == res['not_null_cnt']) & (res['unique_vals'] == sample_size))
        res.loc[msk_pk, 'is_pk'] = True
        res.loc[msk_pk, 'pk_source'] = 'Discovered'

        num_pk = res.loc[res['is_pk'] == True].shape[0]

        # If no PK, search for the smallest combination of identifier code columns that creates x unique keys
        # where x is the size of the data
        if num_pk == 0:
            id_code_cols = res.loc[(res['col_type'] == 'identifier code') & (res['not_null_cnt'] == sample_size)]['col_name'].values
            pk_found = False
            for pk_len in range(2, len(id_code_cols)+1):
                for pk_group in itertools.combinations(id_code_cols, pk_len):
                    num_unqiue_vals = len(sampled_records.groupby(list(pk_group)).groups)
                    if num_unqiue_vals == sample_size:
                        res.loc[res['col_name'].isin(pk_group), 'is_pk'] = True
                        res.loc[res['col_name'].isin(pk_group), 'pk_source'] = 'Discovered'
                        pk_found = True
                        break
                # Stop at the first (smallest) key: every superset of a unique combination is
                # unique as well, so continuing would mark extra columns as part of the PK.
                if pk_found:
                    break

        # If more then one column acts as PK (by itself, not as combination of columns) we need to choose only one of
        # them as PK and the others will be regular identifier code columns (there is no additional information in keeping all as PK)
        elif num_pk > 1:
            redundant_pk_cols = res.loc[res['is_pk'] == True, 'col_name'].iloc[1:]
            res.loc[res['col_name'].isin(redundant_pk_cols), 'is_pk'] = False
            res.loc[res['col_name'].isin(redundant_pk_cols), 'pk_source'] = 'None'

    return res

# Fill table.columns with the typed columns (name, raw type, type, PK flag) computed from a sample of the table's data
def get_columns(table, path_to_data, sample_size, table_metadata=None):
    """Sample the table's CSV, classify its columns and store them on ``table.columns``.

    Columns classified as 'free text' are left out. Every stored ColumnDTO starts as a
    non foreign key (isFK False, FKsource 'None').
    """
    sample = get_data(path_to_data, table, sample_size)

    sample_dtypes = sample.dtypes
    raw_columns = pd.DataFrame({'col_name': sample_dtypes.index.values, 'raw_type': sample_dtypes.values})
    typed_columns = calc_col_types(raw_columns, sample, table_metadata)

    table.columns = [
        ColumnDTO(row['col_name'], str(row['raw_type']), str(row['col_type']), row['is_pk'], row['pk_source'], False, 'None')
        for _, row in typed_columns.iterrows()
        if str(row['col_type']) != 'free text'  # filter free text columns
    ]
        
def get_tables_cols(schema_discovery, data_path, sample_size, metadata=None):
    """Compute the typed column list of every table in ``schema_discovery``.

    Parameters
    ----------
    schema_discovery : object whose ``tables`` are processed one by one.
    data_path : prefix of the per-table CSV files (``data_path + table.name + '.csv'``).
    sample_size : maximal number of records sampled per table.
    metadata : optional reflected DB metadata; when a table is found in ``metadata.tables``
        its declared primary key is used, otherwise the key is discovered from the data.
    """
    for table in schema_discovery.tables:
        # A table missing from the reflected metadata falls back to discovery instead of raising KeyError
        table_metadata = metadata.tables.get(table.name) if metadata is not None else None
        path_to_data = data_path + table.name + '.csv'
        get_columns(table, path_to_data, sample_size, table_metadata)
        
# Classify the columns of every exported table, sampling at most 1000 records per table.
get_tables_cols(schema_discovery, data_path, 1000, metadata)

In [7]:
# Encode the values of each requested column as integer categories
# (e.g. from ['israel', 'usa', 'israel', 'spain'] to [0, 2, 0, 1])
def to_categorical(data, columns_names):
    """Return a dict mapping each column name to its label-encoded values.

    ``None`` values are replaced by the string 'None' before encoding.
    """
    encoder = preprocessing.LabelEncoder()

    def encode(col_name):
        # The values are handed over as a plain list, with None swapped for a sentinel string
        values = [val if val is not None else 'None' for val in data[col_name].values]
        return encoder.fit_transform(values)

    return {col_name: encode(col_name) for col_name in columns_names}

# Build the symmetric dependency matrix of a table: entry (col_1, col_2) holds the normalized mutual
# information between the two columns in data; the diagonal stays 0
def get_correlation_matrix(table, data):
    """Return a DataFrame indexed and labelled by the names of ``table.columns``.

    Every pair of distinct columns is scored with ``normalized_mutual_info_score`` on the
    label-encoded values and the score is written to both symmetric cells.
    """
    names = [column.name for column in table.columns]
    encoded = to_categorical(data, names)

    side = len(names)
    corr_matrix = pd.DataFrame(np.zeros((side, side)), index=names, columns=names)

    for first, second in itertools.combinations(names, 2):
        score = metrics.normalized_mutual_info_score(encoded[first], encoded[second])
        corr_matrix.loc[first, second] = score
        corr_matrix.loc[second, first] = score

#     # plot heatmap of correlation matrix
#     sns.heatmap(corr_matrix, xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns, annot=True)

    return corr_matrix
    
    
# Create network graph using networkx package based on the correlation matrix created by get_correlation_matrix
# when filter_col is True -> only column pairs whose correlation is above the mean positive correlation keep an edge
def get_network_graph(corr_matrix, filter_col=True):
    """Turn a correlation matrix into an undirected graph of columns.

    Parameters
    ----------
    corr_matrix : square DataFrame whose index and columns are column names.
    filter_col : when True, keep only the edges whose value is strictly greater than the
        mean of the positive off-diagonal values.

    Returns
    -------
    networkx graph with one node per row of ``corr_matrix`` and a 'value' attribute on
    every kept edge.
    """
    links = corr_matrix.stack().reset_index()
    links.columns = ['var1', 'var2','value']

    # Remove self correlation
    links_filtered = links.loc[links['var1'] != links['var2']]

    if filter_col:
        # Keep only correlation over a threshold (the mean correlation value)
        mean_corr = links_filtered.loc[links_filtered['value'] > 0, 'value'].mean()
        links_filtered = links_filtered.loc[links_filtered['value'] > mean_corr]

    # Build the graph
    G = nx.from_pandas_edgelist(links_filtered, 'var1', 'var2', edge_attr='value')

    # from_pandas_edgelist only creates nodes that own an edge; add the remaining columns
    # as isolated nodes so that no column silently disappears from the detected entities.
    G.add_nodes_from(corr_matrix.index)

#     # Plot the network:
#     nx.draw_circular(G, with_labels=True, node_size=200, font_size=10)

    return G

# Detects communities in the graph
# The communities detection method we use can be one of the next communities detection methods:
# 1. GN - girvan_newman
# 2. best- best_partition (based on Louvain algorithm)
def detect_communities(G, community_method='best'):
    """Split the nodes of ``G`` into communities.

    Parameters
    ----------
    G : networkx graph.
    community_method : 'best' (Louvain ``best_partition``) or 'GN' (Girvan-Newman; the
        partition reached after ``int(0.3 * number_of_nodes)`` levels, at least one).

    Returns
    -------
    list of lists of nodes, one inner list per community.

    Raises
    ------
    ValueError if ``community_method`` is not one of the supported names.
    """
    if community_method == 'GN':
        # At least one level, otherwise small graphs would produce no partition at all
        levels = max(1, int(0.3*len(G.nodes)))
        clusters = []
        for communities in itertools.islice(girvan_newman(G), levels):
            clusters = [sorted(c) for c in communities]
        return clusters

    if community_method == 'best':
        partition = community.best_partition(G)

        # Group the nodes by community id, keeping the communities ordered by id
        grouped = {}
        for node, community_id in partition.items():
            grouped.setdefault(community_id, []).append(node)
        return [grouped[community_id] for community_id in sorted(grouped)]

    raise ValueError("unknown community_method: " + repr(community_method) + " (expected 'GN' or 'best')")

# Plot the entities we found as communities on network (features) graph
def plot_communty_network(G, clusters, with_labels=True, save_path='foo.png'):
    """Draw the graph with one node color per cluster, save the figure and show it.

    Parameters
    ----------
    G : networkx graph whose edges carry a 'value' attribute (used as edge width).
    clusters : sequence of node lists; cluster i is drawn with the i-th color (colors repeat).
    with_labels : when truthy, node names are drawn next to the nodes.
    save_path : file the figure is written to; pass None (or '') to skip saving. The default
        keeps the previous behavior of writing 'foo.png' in the working directory.
    """
    plt.figure(figsize=(8,8))
    pos = nx.spring_layout(G, k=2)
    node_colors = ['green', 'red', 'yellow', 'black', 'blue', 'orange', 'pink', 'purple', 'gray', 'brown']

    # Edge width is proportional to the edge value
    weights = [attrs['value']*2 for _, _, attrs in G.edges(data=True)]
    nx.draw_networkx_edges(G, pos, width=weights, alpha=0.9)

    for i, cluster in enumerate(clusters):
        color = node_colors[i%len(node_colors)]
        nx.draw_networkx_nodes(G, pos, nodelist=cluster, node_color=color, node_size=150)
        if with_labels:
            nx.draw_networkx_labels(G, pos, {x:x for x in cluster}, font_size=8)

    if save_path:
        plt.savefig(save_path)
    plt.show()

# Main function that uses all previous functions to detect main entities for each table in schema_discovery
def find_entities(schema_discovery, data_path, sample_size=10000):
    """Detect the entities (groups of related columns) of every table.

    For each table a sample of its CSV is loaded, the column correlation graph is built,
    communities are detected and plotted, and ``table.entities`` is replaced by one
    EntityDTO per community.

    Returns
    -------
    The graph of the last processed table, or None when there are no tables.
    """
    G = None  # stays None when schema_discovery has no tables
    for table in schema_discovery.tables:
        path_to_data = data_path + table.name + '.csv'
        sampled_records = get_data(path_to_data, table, sample_size)

        corr_matrix = get_correlation_matrix(table, sampled_records)
        G = get_network_graph(corr_matrix)
        clusters = detect_communities(G)
        # The third positional parameter of plot_communty_network is with_labels, not a path
        plot_communty_network(G, clusters)
        print('finish finding clusters for table ' + table.name + ', num of clusters: ' + str(len(clusters)))

        table.entities = []
        for cluster in clusters:
            entity = EntityDTO()
            entity.columns = cluster
            table.entities.append(entity)

    return G
            
# Detect the entities of every table; G keeps the graph returned by find_entities.
G = find_entities(schema_discovery, data_path)

In [8]:
# Concatenate several PK columns into one unique PK column - done for comparison between the PKs of tables
# whose PK is built from a number of columns
def combain_pk_columns(data, pk_list):
    """Add a 'concat_pk_col' column to ``data`` and describe it as a single PK column.

    Parameters
    ----------
    data : DataFrame containing the PK columns; it is modified in place.
    pk_list : column objects (``name`` and ``rawType`` are read) forming the composite key.

    Returns
    -------
    (data, new_pk_list) where ``new_pk_list`` holds one ColumnDTO named 'concat_pk_col'
    whose raw type is the sorted raw types of the original PK columns joined by '_'.
    """
    pk_names = [col.name for col in pk_list]
    data['concat_pk_col'] = data[pk_names].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

    pk_raw_types = sorted(col.rawType for col in pk_list)

    # Keyword names must match the ColumnDTO constructor signature
    new_pk_list = [ColumnDTO(col_name='concat_pk_col',
                             col_raw_type='_'.join(pk_raw_types),
                             col_type='identifier code',
                             is_pk=True,
                             pk_source='Discovered',
                             is_fk=False,
                             fk_source='None')]

    return data, new_pk_list

# Get the cardinality of column given the data. cardinality can get one of the next values:
# 'One': each non-null value of this column appears only once
# 'Many': there is at least one non-null value of this column that appears more than one time
def get_cardinality(data, col_name):
    """Return 'One' or 'Many' for the column ``col_name`` of ``data``.

    Only the column itself is inspected (null values are ignored), so the result does not
    depend on nulls in other columns or on whether other columns exist at all.
    """
    values = data[col_name].dropna()
    return 'Many' if values.duplicated().any() else 'One'

# Look up the table object named table_name in schema_discovery.tables; None when no table has this name
def get_table_by_name(schema_discovery, table_name):
    """Return the first table whose ``name`` equals ``table_name``, or None."""
    matches = (table for table in schema_discovery.tables if table.name == table_name)
    return next(matches, None)

# Flag every non-PK column of the table named col_name as a foreign key and record the source of that decision
def set_col_keytype_to_FK(table, col_name, FK_source):
    """Set ``isFK`` and ``FKsource`` on the matching columns; primary-key columns are left untouched."""
    same_name_cols = [col for col in table.columns if col.name == col_name]
    for col in same_name_cols:
        if col.isPK == False:
            col.isFK = True
            col.FKsource = FK_source

# Extracts external dependencies between a pair of tables
# The returned list contains one DependencyDTO per pair of connected columns (table names, column names
# and the cardinality of each side)
def find_foreign_keys(table_1, table_2, data_path):
    """Discover dependencies between the key columns of two tables.

    The smaller table (by ``size``) is read completely, the larger one is sampled (1000
    records). A PK column of one table is compared with the PK / non-PK 'identifier code'
    columns of the other table that share its raw type. Composite PKs are compared only
    when both tables have a composite PK of the same length (after concatenation).

    Side effects: matching non-PK columns are flagged as FK on the table objects and every
    found dependency is printed.

    Returns
    -------
    list of DependencyDTO with ``dependencySource == 'Discovered'`` (possibly empty).
    """
    tables_dependencies = []

    # table_1 is always the smaller table
    if table_1.size > table_2.size:
        table_1, table_2 = table_2, table_1

    pk_table_1 = [col for col in table_1.columns if col.isPK]
    pk_table_2 = [col for col in table_2.columns if col.isPK]
    identifier_code_cols_1 = [col for col in table_1.columns if (col.type == 'identifier code') and (not col.isPK)]
    identifier_code_cols_2 = [col for col in table_2.columns if (col.type == 'identifier code') and (not col.isPK)]

    if not pk_table_1 and not pk_table_2:
        return tables_dependencies

    # If the tables have different sizes of PK we cannot compare between them -> check dependencies only between
    # PK of size 1 and identifier code columns
    if len(pk_table_1) != len(pk_table_2):
        if (len(pk_table_1) > 1) and (len(pk_table_2) > 1):
            return tables_dependencies
        elif len(pk_table_1) > 1:
            pk_table_1 = []
        elif len(pk_table_2) > 1:
            pk_table_2 = []

    # Nothing can match without a PK on at least one side: leave before touching the CSV files
    if not pk_table_1 and not pk_table_2:
        return tables_dependencies

    data_1 = get_data(data_path + table_1.name + '.csv', table_1, 0, to_sample=False)
    data_2 = get_data(data_path + table_2.name + '.csv', table_2, 1000, to_sample=True)

    # If the tables have the same size of PK then we can compare them - first we concatenate all pk columns
    # into one unique column
    if (len(pk_table_1) == len(pk_table_2)) and (len(pk_table_1) > 1):
        data_1, pk_table_1 = combain_pk_columns(data_1, pk_table_1)
        data_2, pk_table_2 = combain_pk_columns(data_2, pk_table_2)

    pk_identifier_code_cols_1 = pk_table_1 + identifier_code_cols_1
    pk_identifier_code_cols_2 = pk_table_2 + identifier_code_cols_2

    for col_1 in pk_identifier_code_cols_1:
        for col_2 in pk_identifier_code_cols_2:
            # At least one side of the pair has to be a PK
            if (not col_1.isPK) and (not col_2.isPK):
                continue
            if col_1.rawType != col_2.rawType:
                continue

            col_name_1 = col_1.name + '_1'
            col_name_2 = col_2.name + '_2'

            vals_col_1 = data_1[col_1.name].dropna().rename(col_name_1)
            vals_col_2 = data_2[col_2.name].dropna().rename(col_name_2)

            # Left join followed by dropping the unmatched rows keeps only the matching value pairs
            # NOTE(review): duplicated values on both sides inflate the joined row count, so the
            # equality test below can match (or miss) by coincidence - confirm the intended rule.
            join_on_cols = pd.merge(vals_col_1, vals_col_2, left_on=col_name_1, right_on=col_name_2, how='left')
            join_on_cols = join_on_cols.dropna(subset=[col_name_2])

            if (join_on_cols.shape[0] == len(vals_col_2)) or (join_on_cols.shape[0] == len(vals_col_1)):
                cardinality_col_1 = get_cardinality(data_1, col_1.name)
                cardinality_col_2 = get_cardinality(data_2, col_2.name)

                ref_left = TableRefDTO(table_name=table_1.name, column_name=col_1.name, cardinality_type=cardinality_col_1)
                ref_right = TableRefDTO(table_name=table_2.name, column_name=col_2.name, cardinality_type=cardinality_col_2)
                dependency = DependencyDTO(ref_left, ref_right, dependency_source='Discovered')
                tables_dependencies.append(dependency)

                set_col_keytype_to_FK(table_1, col_1.name, 'Discovered')
                set_col_keytype_to_FK(table_2, col_2.name, 'Discovered')

                print(dependency.left.tableName+'.'+dependency.left.columnName+'_'+dependency.left.cardinalityType+'_'+'  '+\
                      dependency.right.tableName+'.'+dependency.right.columnName+'_'+dependency.right.cardinalityType+'  '+dependency.dependencySource)

    return tables_dependencies

                    
# Iterate over the final dependencies_list and remove duplicate dependencies from it
def remove_duplicate_dependencies(dependencies_list):
    """Return the dependencies without duplicates, keeping the first occurrence of each.

    Two dependencies are duplicates when they connect the same two (table, column)
    references, in either direction.
    """
    seen_pairs = set()
    new_dependencies_list = []
    for dep in dependencies_list:
        ref_left = (dep.left.tableName, dep.left.columnName)
        ref_right = (dep.right.tableName, dep.right.columnName)

        # Tuples cannot collide the way '_'-joined names can (e.g. table 'a' + column 'b_c'
        # versus table 'a_b' + column 'c'); the frozenset makes the key direction independent.
        pair_key = frozenset((ref_left, ref_right))
        if pair_key in seen_pairs:
            continue

        seen_pairs.add(pair_key)
        new_dependencies_list.append(dep)

    return new_dependencies_list

# Change all 'identifier code' type columns that were not recognized as PK or FK to 'label' type
def change_identifier_code_to_lable(schema_discovery):
    """Downgrade identifier code columns that are neither PK nor FK to labels.

    The type is written as 'label', the same spelling calc_col_types uses for label
    columns, so that a single label type exists in the result.
    """
    for table in schema_discovery.tables:
        for col in table.columns:
            if (not col.isPK) and (not col.isFK) and (col.type == 'identifier code'):
                col.type = 'label'
            
# Iterate over the known (declared) foreign keys of a table and return them as a dependencies list that
# eventually will be added to schema_discovery.dependencies
def add_table_dependencies_from_metadata(table, table_metadata, schema_discovery, data_path):
    """Translate the foreign key constraints of ``table_metadata`` into DependencyDTO objects.

    Parameters
    ----------
    table : table object the constraints belong to.
    table_metadata : reflected table whose ``constraints`` are scanned.
    schema_discovery : used to look up the referenced (foreign) tables by name.
    data_path : prefix of the per-table CSV files used to compute the cardinalities.

    Side effects: the involved non-PK columns are flagged as FK with source 'Metadata'.

    Returns
    -------
    list of DependencyDTO with ``dependencySource == 'Metadata'``. Constraints that cannot
    be resolved (unknown foreign table, mismatching column lists) are skipped.
    """
    dependencies_list = []
    table_data = None  # loaded once, only when the first resolvable foreign key is met

    for constraint in table_metadata.constraints:
        if not isinstance(constraint, sqlalchemy.ForeignKeyConstraint):
            continue

        # foreign key columns as they appear in the current table
        table_columns_with_foreign_key = [(str(col.table.name), col.name) for col in constraint.columns]

        # foreign key columns as they appear in the foreign table
        all_foreign_keys = [(elemnt.column.table.name, elemnt.column.name) for elemnt in constraint.elements]

        # Skip only this constraint; the remaining constraints are still processed
        if len(table_columns_with_foreign_key) != len(all_foreign_keys):
            continue

        # iterate over all pairs of columns we found (current table column, foreign table column)
        for table_col, foreign_col in zip(table_columns_with_foreign_key, all_foreign_keys):
            foreign_table = get_table_by_name(schema_discovery, foreign_col[0])
            if foreign_table is None:
                continue

            if table_data is None:
                table_data = get_data(data_path + table.name + '.csv', table, 0, to_sample=False)
            foreign_table_data = get_data(data_path + foreign_table.name + '.csv', foreign_table, 0, to_sample=False)

            cardinality_left = get_cardinality(table_data, table_col[1])
            cardinality_right = get_cardinality(foreign_table_data, foreign_col[1])

            ref_left = TableRefDTO(table_name=table_col[0], column_name=table_col[1], cardinality_type=cardinality_left)
            ref_right = TableRefDTO(table_name=foreign_col[0], column_name=foreign_col[1], cardinality_type=cardinality_right)
            dependency = DependencyDTO(ref_left, ref_right, dependency_source='Metadata')

            dependencies_list.append(dependency)

            set_col_keytype_to_FK(table, table_col[1], 'Metadata')
            set_col_keytype_to_FK(foreign_table, foreign_col[1], 'Metadata')

    return dependencies_list


# Main function; extract dependencies between each possible combination of two tables from schema_discovery
def get_external_dependencies(schema_discovery, data_path, metadata=None):
    """Collect the declared and the discovered dependencies of the schema.

    Declared foreign keys are read from ``metadata`` (when given) for every table found in
    it; then every pair of tables is scanned by find_foreign_keys. The de-duplicated list
    is stored on ``schema_discovery.dependencies`` and identifier code columns that ended
    up neither PK nor FK are re-typed by change_identifier_code_to_lable.
    """
    all_tables_dependencies = []
    if metadata is not None:
        for table in schema_discovery.tables:
            # Tables that were not reflected have no declared constraints to read
            table_metadata = metadata.tables.get(table.name)
            if table_metadata is None:
                continue
            all_tables_dependencies.extend(
                add_table_dependencies_from_metadata(table, table_metadata, schema_discovery, data_path))

    for table_1, table_2 in itertools.combinations(schema_discovery.tables, 2):
        all_tables_dependencies.extend(find_foreign_keys(table_1, table_2, data_path))

    schema_discovery.dependencies = remove_duplicate_dependencies(all_tables_dependencies)
    change_identifier_code_to_lable(schema_discovery)
    
# Fill schema_discovery.dependencies (the call itself returns nothing).
get_external_dependencies(schema_discovery, data_path, metadata)

In [9]:
# List the name of every discovered table, one per line

for table in schema_discovery.tables:
    print(table.name)

In [10]:
# Per table: the table name, then one line per column with its name, type, raw type, isPK and isFK,
# then an empty line

for table in schema_discovery.tables:
    print(table.name)
    for col in table.columns:
        print(' '.join((col.name, col.type, col.rawType, str(col.isPK), str(col.isFK))))
    print('')

In [11]:
# Per table: the table name, the number of entities, the column list of each entity, then an empty line

for table in schema_discovery.tables:
    print(table.name)
    print('NUMBER OF ENTITIES:', len(table.entities))
    for entity in table.entities:
        print(entity.columns)
    print('')

In [12]:
# One line per dependency: "<table>.<column>_<cardinality>" of the left side, two spaces, then the same for the right side

for dep in schema_discovery.dependencies:
    print('  '.join(ref.tableName + '.' + ref.columnName + '_' + ref.cardinalityType
                    for ref in (dep.left, dep.right)))