## Lineage STEP 3: This notebook processes the spreadsheet produced by the lineage_analysis program

### The final ODM data lineage file will be stored in the following Box folder:
    
https://ibm.ent.box.com/folder/117380324021

In [None]:
import sys
sys.path.append('/odm_modules')
import pandas as pd
from common_func import odm_conn
from common_func import cloudant_conn
import re
sys.path.append('/app')
from BOX import box_oauth as box 

In [None]:
# Applications to process; each name is passed to generate_view_table_relation,
# which substitutes it into the input and output file names.
app_list  = ['odm', 'odmr']
# Default value of the `suffix` parameter of generate_view_table_relation
# (inserted into the file names it reads and writes).
suffix = ''

In [None]:
# Suffix appended to every Cloudant database name below; empty for production usage.
environ_suffix = ''

# Name of the Cloudant database to be ingested (not referenced again in this cell).
cloudant_db_name = 'ddl' + environ_suffix

cloudant_conn.cloudant_client.connect()

# Materialise all documents of the rz1 / rz3 databases into one DataFrame each.
df_rz1, df_rz3 = (
    pd.DataFrame(list(cloudant_conn.cloudant_client[db_prefix + environ_suffix]))
    for db_prefix in ('rz1', 'rz3')
)
df_rz1

In [None]:
# table_name -> _id lookups built from the rz1 and rz3 frames.
rz1_dict, rz3_dict = (
    frame.loc[:, ['_id', 'table_name']].set_index('table_name')['_id'].to_dict()
    for frame in (df_rz1, df_rz3)
)

# Combined lookup; when both frames share a table_name the rz3 entry wins.
rz_dict = dict(rz1_dict)
rz_dict.update(rz3_dict)

# Same keys, with every id rendered as ' (<id>) ' so it can be appended to a name.
rz_dict_converted = dict(
    (table_name, ' ({}) '.format(tid)) for table_name, tid in rz_dict.items()
)

# Spot check: the cell displays the converted entry for one table.
rz_dict_converted.get('ODMT_EMPLOYEE')


### process_cols and one_round are fixed helper functions called by the rest of the notebook; do not modify them

In [None]:
def process_cols(row):
    """Fold the `_2` columns of one self-merged lineage row into the primary columns.

    Applied row-wise by one_round. When ``source_2`` is the empty string the
    row is returned untouched. Otherwise:

    * ``source`` and ``source_2`` swap values,
    * ``bridge_2`` is appended to ``bridge`` with a ``'|'`` separator,
    * ``mapping_rule_2`` is appended to ``mapping_rule`` with ``' <==== '``.

    Parameters
    ----------
    row : pandas.Series
        Must carry ``source``, ``source_2``, ``bridge``, ``bridge_2``,
        ``mapping_rule`` and ``mapping_rule_2``.

    Returns
    -------
    pandas.Series
        The same row object, modified in place when ``source_2`` was non-empty.
    """
    if row.source_2 == '':
        return row

    previous_source, next_source = row.source, row.source_2
    row.source = next_source
    # The old source is parked in source_2, so the column stays non-empty for
    # rows that were extended.
    row.source_2 = previous_source
    row.bridge = '|'.join((row.bridge, row.bridge_2))
    row.mapping_rule = ' <==== '.join((row.mapping_rule, row.mapping_rule_2))
    return row

def row_split(row):
    """Split the dotted ``target`` and ``source`` values of a row into parts.

    Each value is split on ``'.'``: the first piece becomes ``<side>_schema``,
    the second ``<side>_tbvw``, and everything from the third piece onwards is
    re-joined with ``'.'`` into ``<side>_column``.

    Parameters
    ----------
    row : pandas.Series
        Must carry string values under ``target`` and ``source``.

    Returns
    -------
    pandas.Series
        The same row with the six new entries added (target ones first).

    Raises
    ------
    IndexError
        If a value contains no ``'.'`` (there is no second piece).
    """
    for side in ('target', 'source'):
        pieces = row[side].split('.')
        row[side + '_schema'] = pieces[0]
        row[side + '_tbvw'] = pieces[1]
        row[side + '_column'] = '.'.join(pieces[2:])
    return row

### The following function performs one round of lineage expansion; call it repeatedly until it reports `finished` as `True`

In [None]:
def one_round(df_odm):
    """Extend every lineage row by one hop and report whether anything changed.

    The frame is left-joined with itself (``source`` of the left side against
    ``target`` of the right side), so a row whose source is itself a target
    elsewhere picks up that row's ``source``/``mapping_rule``/``bridge`` in
    ``*_2`` columns. process_cols then folds those columns into the primary
    ones.

    Parameters
    ----------
    df_odm : pandas.DataFrame
        Needs the columns ``colno``, ``target``, ``source``, ``mapping_rule``
        and ``bridge``.

    Returns
    -------
    pandas.DataFrame
        The expanded frame with the same five columns.

    Side effects
    ------------
    Prints ``finished =  <bool>``; the flag is True when every row still has
    an empty ``source_2`` after process_cols ran.
    """
    joined = df_odm.merge(
        df_odm,
        how='left',
        left_on='source',
        right_on='target',
        suffixes=('', '_2'),
    )
    joined = joined.drop(columns=['target_2', 'colno_2'])
    # Unmatched rows get NaN in the *_2 columns; normalise them to ''.
    joined = joined.drop_duplicates().fillna('')

    df_temp = joined.apply(process_cols, axis=1)
    finished = all(df_temp.source_2 == '')
    print('finished = ', finished)
    return df_temp.drop(columns=['source_2', 'mapping_rule_2', 'bridge_2'])

# The real processing starts here

In [None]:
def attach_c09_2_odm(df):
    """Append lineage rows derived from the C09 authority table to ``df``.

    Runs a query against the 'prod' ad-hoc connection that reads
    ODMPRD.ODMT_GI_AUTHORITY joined to ODMPRD.ODMT_DDICT_TABLES. Entries with
    ``NITEM = '*'`` are expanded to one row per column name found in
    SYSIBM.SYSCOLUMNS for the joined table; entries with any other NITEM are
    taken as they are.

    The four result columns are renamed by position to ``tgt_tbvw``,
    ``tgt_col_final``, ``tid`` and ``src_tbvw`` (this relies on the result
    keeping the SELECT order -- TODO confirm for the odm_adhoc helper). Every
    generated row gets schema 'ODMPRD' on both sides, the target column name
    as source column, the pass-through rule '@TGT@ = @SRC@' and ``colno`` 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed view lineage; must contain the eight lineage columns listed in
        ``lineage_cols`` below.

    Returns
    -------
    pandas.DataFrame
        ``df`` reduced to the eight lineage columns, followed by the C09 rows.
    """
    print('add the contents of C09 into the result...')
    sql = '''
    WITH TB1 AS (
    SELECT CFDRSRC, CTABLE, 
    CTABNAME
    FROM 
    ODMPRD.ODMT_GI_AUTHORITY C09 LEFT OUTER JOIN
    ODMPRD.ODMT_DDICT_TABLES RZ1
    ON C09.CTABLE = RZ1.CTID
    WHERE NITEM = '*'
    )
    SELECT CFDRSRC, NAME AS NITEM, CTABLE, CTABNAME
    FROM TB1 LEFT OUTER JOIN
    SYSIBM.SYSCOLUMNS SYC
    ON 
    TB1.CTABNAME = SYC.TBNAME
    AND SYC.TBCREATOR = 'ODMPRD'
    where 
    NAME is not NULL

    UNION

    SELECT CFDRSRC, 
    NITEM
    , CTABLE, CTABNAME
    FROM 
    ODMPRD.ODMT_GI_AUTHORITY C09 LEFT OUTER JOIN
    ODMPRD.ODMT_DDICT_TABLES RZ1
    ON C09.CTABLE = RZ1.CTID
    WHERE NITEM <> '*'

    '''
    with odm_conn.odm_adhoc('prod') as odmprd_adhoc:
        fetched = odmprd_adhoc(sql)

    # Column layout shared by the parsed-view frame and the C09 frame.
    lineage_cols = ['colno', 'tgt_schema', 'tgt_tbvw', 'tgt_col_final',
                    'src_schema', 'src_tbvw', 'src_col', 'mapping_rule']

    df_c09 = pd.DataFrame(fetched)
    df_c09.columns = ['tgt_tbvw', 'tgt_col_final', 'tid', 'src_tbvw']
    df_c09 = df_c09.assign(
        tgt_schema='ODMPRD',
        src_col=lambda frame: frame['tgt_col_final'],
        src_schema=lambda frame: frame['tgt_schema'],
        mapping_rule='@TGT@ = @SRC@',
        colno=0,
    )
    # Trim whitespace from string cells only; other values pass through.
    df_c09 = df_c09.loc[:, lineage_cols].applymap(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

    return pd.concat([df.loc[:, lineage_cols], df_c09])

In [None]:
def generate_view_table_relation(app, suffix=suffix):
    """Build the end-to-end column lineage sheet for one application.

    Reads ``/result/<app>_views<suffix>.txt_parse.csv``, optionally appends
    the C09 rows (only for ``app == 'odm'``), expands the lineage with four
    calls to one_round, and writes the result to
    ``/result/lineage/<app>view_final<suffix>.xlsx``.

    Parameters
    ----------
    app : str
        Application name used in the input and output file names.
    suffix : str
        Extra text inserted into both file names; defaults to the
        module-level ``suffix`` at definition time.

    Returns
    -------
    None
        The result is written to disk; progress is printed.
    """
    print('Processing for application {}'.format(app))
    df_odm = pd.read_csv('/result/{}_views{}.txt_parse.csv'.format(app, suffix)).fillna('')
    if app == 'odm':
        df_odm = attach_c09_2_odm(df_odm)

    # Fully qualified schema.table.column keys for both ends of each mapping.
    df_odm['target'] = df_odm['tgt_schema'] + '.' + df_odm['tgt_tbvw'] + '.' + df_odm['tgt_col_final']
    df_odm['source'] = df_odm['src_schema'] + '.' + df_odm['src_tbvw'] + '.' + df_odm['src_col']
    df_odm['bridge'] = df_odm['tgt_tbvw'] + '<<' + df_odm['src_tbvw']

    def _readable_rule(row):
        # Replace the pass-through placeholder by "<target col> <== <source col>".
        if row.mapping_rule == '@TGT@ = @SRC@':
            return ' <== '.join([row.tgt_col_final, row.src_col])
        return row.mapping_rule

    df_odm['mapping_rule'] = df_odm.apply(_readable_rule, axis=1)

    df = df_odm.loc[:, ['colno', 'target', 'source', 'mapping_rule', 'bridge']]
    # Fixed number of expansion rounds; each call prints its own finished flag.
    for _ in range(4):
        df = one_round(df)

    print(df.shape)
    # Drop rows whose target key contains four consecutive underscores.
    df = df.loc[~df.target.str.contains('____')]
    print(df.shape)
    df = df.apply(row_split, axis=1)

    # Look up the id of each table/view; '' when it is not in rz_dict.
    for side in ('source', 'target'):
        df[side + '_tid'] = df[side + '_tbvw'].apply(lambda name: rz_dict.get(name, ''))

    df = df.loc[:, ['colno', 'target_schema', 'target_tbvw', 'target_tid', 'target_column',
                    'source_schema', 'source_tbvw', 'source_tid', 'source_column',
                    'mapping_rule', 'bridge']]

    def _render_chain(row):
        # Prefix a "target<<target" hop so the chain starts with the target
        # itself, then keep the right-hand name of every hop, annotated with
        # its id where one is known.
        hops = row.target_tbvw + '<<' + row.target_tbvw + '|' + row.bridge
        labels = []
        for hop in hops.split('|'):
            upstream = hop.split('<<')[1]
            labels.append(upstream + rz_dict_converted.get(upstream.strip(), ''))
        return ' <<< '.join(labels)

    df['bridge'] = df.apply(_render_chain, axis=1)

    df.to_excel('/result/lineage/{}view_final{}.xlsx'.format(app, suffix), index=False)
    print('the process COMPLETED!\n')

In [None]:
# Build the lineage workbook for every application; the list of return values
# is the cell's displayed output.
[generate_view_table_relation(app) for app in app_list]

In [None]:
# Load the ODM lineage workbook written by generate_view_table_relation('odm').
# The writer inserts `suffix` into the file name, so the reader must do the
# same; with the default suffix '' the path is unchanged.
df = pd.read_excel('/result/lineage/odmview_final{}.xlsx'.format(suffix))

## Discard non-important views from odmview_final.xlsx

In [None]:
def discard_views(view_name):
    """Decide whether a view stays in the filtered lineage output.

    Despite the name, the return value is a *keep* flag: ``True`` means the
    view is kept, ``False`` means it is discarded.

    Rules, checked in this order:

    1. the first five characters are one of the dropped prefixes -> False
    2. the name matches one of the dropped ODMV_ patterns -> False
    3. the name starts with ``ODMC_WF360`` -> True
    4. any other name starting with ``ODMC_`` -> False
    5. everything else -> True

    Parameters
    ----------
    view_name : str
        Table or view name to classify.

    Returns
    -------
    bool
    """
    dropped_prefixes = ('ODMA_', 'ODMP_', 'ODME_', 'ODMH_', 'ODMUE', 'ODMUA', 'ODMUP')
    dropped_patterns = (
        r"ODMV_[VAEFD][0-9][0-9]$",
        r"ODMV_[AEFD][0-9][0-9]_STAT$",
        r"ODMV_R[0-9A-Z][0-9A-Z]$",
        r"ODMV_REF_TABLE_..$",
    )

    if view_name[:5] in dropped_prefixes:
        return False
    if any(re.match(pattern, view_name) for pattern in dropped_patterns):
        return False
    # The WF360 family is the one ODMC_ exception that is kept.
    if re.match(r"ODMC_WF360", view_name):
        return True
    return re.match(r"ODMC_", view_name) is None
    
# Keep only the rows whose target view passes discard_views (True = keep).
df_final = df.loc[[discard_views(name) for name in df.target_tbvw]].fillna('')
# Remove rows whose source schema is the ODMPRD_HARDCODE marker.
df_final = df_final.loc[df_final.source_schema != 'ODMPRD_HARDCODE'].fillna('')

# Where no id was resolved, fall back to the table/view name itself.
for _side in ('target', 'source'):
    _tid_col, _name_col = _side + '_tid', _side + '_tbvw'
    df_final[_tid_col] = df_final.apply(
        lambda row: row[_tid_col] if row[_tid_col] != '' else row[_name_col],
        axis=1,
    )

df_final.to_excel('/result/lineage/odmviews_final_filtered.xlsx', index=False)

## Get the information from RZ8 that will be joined with the odmview table

In [None]:
# Fetch application id, feeder source and interface id for every interface row
# with direction 'D' and a non-empty feeder source.
with odm_conn.odm_adhoc('prod') as odmprd_adhoc:
    result = odmprd_adhoc("select CAPPLID, CFDRSRC ,TINTFID from odmprd.ODMT_IF_INTERFACE where CINTFDIR = 'D' and CFDRSRC <> '' ")
# Strip string cells only: a NULL or non-string value would make a bare
# x.strip() raise AttributeError. attach_c09_2_odm uses the same guard.
df_rz8 = pd.DataFrame(result).applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [None]:
# Attach the RZ8 interface rows to every lineage row via the target id.
# Rows without a match get '' in the RZ8 columns.
df_final_v1 = df_final.merge(df_rz8, how='left', left_on='target_tid', right_on='CFDRSRC').fillna('')

# The mask is computed once, before CFDRSRC is back-filled, so both updates
# see the same set of unmatched rows.
not_in_rz8 = df_final_v1['CFDRSRC'] == ''
df_final_v1.loc[not_in_rz8, 'CAPPLID'] = 'Unknown APP(not found in RZ8)'
df_final_v1.loc[not_in_rz8, 'CFDRSRC'] = df_final_v1.loc[not_in_rz8, 'target_tid']

col_list = ['source_schema', 'source_tid', 'source_column', 'CAPPLID', 'CFDRSRC', 'TINTFID']
# assign() returns a new frame, so the marker column is never written into a
# slice of the merged frame (avoids SettingWithCopyWarning).
df_final_v1 = df_final_v1.loc[:, col_list].assign(X='X')
df_final_v1.to_excel('temp1.xlsx')

In [None]:
# Collapse the source triple and the consumer triple into '|'-joined keys so
# the frame can be pivoted on two plain string columns.
df_final_v1['IDX'] = df_final_v1.apply(
    lambda row: '|'.join([row['source_schema'], row['source_tid'], row['source_column']]),
    axis=1,
)
df_final_v1['COL'] = df_final_v1.apply(
    lambda row: '|'.join([row['CAPPLID'], row['TINTFID'], row['CFDRSRC']]),
    axis=1,
)
df_final_v1 = df_final_v1.drop(
    columns=['source_schema', 'source_tid', 'source_column', 'CAPPLID', 'CFDRSRC', 'TINTFID']
)

# One row per source column, one column per consumer, 'X' where they meet.
df_final_v2 = (
    df_final_v1
    .drop_duplicates()
    .pivot(index='IDX', columns='COL', values='X')
    .fillna('')
)
# Number of consumers marked for each source column.
df_final_v2['Total|Total|Total'] = (df_final_v2 == 'X').sum(axis=1)

# Expand the '|'-joined keys back into multi-level row and column labels.
df_final_v2.index = pd.MultiIndex.from_tuples(
    [tuple(key.split('|')) for key in df_final_v2.index],
    names=('schema', 'tid', 'column'),
)
df_final_v2.columns = pd.MultiIndex.from_tuples(
    [tuple(label.split('|')) for label in df_final_v2.columns]
)
df_final_v2 = df_final_v2.reset_index()

xlsx_file = 'odm data lineage.xlsx'
df_final_v2.to_excel(xlsx_file)


In [None]:
from shutil import copyfile

# Local working copy of the ODMR lineage workbook. The source file is written
# by generate_view_table_relation('odmr'), which inserts `suffix` into the
# name, so the same suffix is applied here (no change while suffix == '').
xlsx_file_odmr = 'odmrview_final.xlsx'
copyfile('/result/lineage/odmrview_final{}.xlsx'.format(suffix), xlsx_file_odmr)

# Local working copy of the filtered ODM workbook (written without a suffix).
xlsx_file_odm_filtered = 'odmviews_final_filtered.xlsx'
copyfile("/result/lineage/odmviews_final_filtered.xlsx", xlsx_file_odm_filtered)

In [None]:
import openpyxl
from openpyxl.styles import Color, PatternFill, Font, Border, Alignment
from openpyxl.styles import colors
from openpyxl.cell import Cell
def _solid_fill(argb):
    """Return a solid PatternFill whose start and end colour are both ``argb``."""
    return PatternFill(start_color=argb, end_color=argb, fill_type='solid')


def _paint(rows, fill):
    """Apply ``fill`` to every cell of every row in ``rows``."""
    for cells in rows:
        for cell in cells:
            cell.fill = fill


# Named fills; only some of them are used by the cells visible in this notebook.
greenFill = _solid_fill('CCCCFF11')
lightRedFill = _solid_fill('00EEAAEE')
redFill = _solid_fill('00EE6600')
colFill = _solid_fill('CCCCCCFF')
rowFill = _solid_fill('AAEEEECC')

greyFill = _solid_fill('CCCCCCCC')
naFill = _solid_fill('EECCFFFF')
color = ['00DDDDDD', '00CCCCCC', '00AAAAAA', '00888888', '00666666']
colorFill = [_solid_fill(c) for c in color]

xfile = openpyxl.load_workbook(xlsx_file)
sheet = xfile['Sheet1']
# Delete the empty worksheet row 4 (one row).
sheet.delete_rows(4, 1)

# Rows 1-3 across all used columns get the green fill.
_paint(sheet.iter_rows(min_row=1, max_col=sheet.max_column, max_row=3), greenFill)
# Columns 1-4 from row 4 down get the light red fill.
_paint(sheet.iter_rows(min_row=4, min_col=1, max_col=4, max_row=sheet.max_row), lightRedFill)
# Freeze everything above and to the left of E4.
sheet.freeze_panes = sheet['E4']

xfile.save(xlsx_file)

In [None]:
# Style the local ODMR workbook copy: green fill on row 1, panes frozen at F2.
xfile = openpyxl.load_workbook(xlsx_file_odmr)
sheet = xfile['Sheet1']

# Walk row 1 column by column; each column tuple holds exactly one cell.
for (cell,) in sheet.iter_cols(min_row=1, max_row=1, max_col=sheet.max_column):
    cell.fill = greenFill
sheet.freeze_panes = sheet['F2']

xfile.save(xlsx_file_odmr)

In [None]:
folder_id = '117380324021'  # METATEST folder
client = box.get_box_client()

# Upload the ODM lineage matrix and the ODMR lineage workbook.
for pending_upload in (xlsx_file, xlsx_file_odmr):
    box.save2box_folder(client, folder_id, pending_upload)
# The filtered ODM workbook goes last; the call stays the cell's final
# expression so its return value is still what the cell displays.
box.save2box_folder(client, folder_id, xlsx_file_odm_filtered)

## Create the summary tables and post them to the Box folder

In [None]:
xlsx_file = 'odmrview_final.xlsx'
xlsx_file_summary = 'odmrview_final_summary.xlsx'

# Read the ODMR lineage workbook from the path generate_view_table_relation
# wrote it to. That writer inserts `suffix` into the file name, so the same
# suffix is applied here (no change while suffix == '').
df_x = pd.read_excel('/result/lineage/odmrview_final{}.xlsx'.format(suffix))

# One row per distinct (target table/view, source table/view) pair.
df_y = df_x[['target_schema', 'target_tbvw', 'target_tid', 'source_schema', 'source_tbvw', 'source_tid']]
df_y = df_y.drop_duplicates()

# Write the summary next to the other results, copy it locally and upload it.
df_y.to_excel('/result/lineage/{}'.format(xlsx_file_summary), index = False)
copyfile("/result/lineage/{}".format(xlsx_file_summary), xlsx_file_summary)
box.save2box_folder(client, folder_id, xlsx_file_summary)

In [None]:
xlsx_file = 'odmviews_final_filtered.xlsx'
xlsx_file_summary = 'odmviews_final_filtered_summary.xlsx'

# Full path of the summary workbook inside the shared result directory.
summary_path = '/result/lineage/' + xlsx_file_summary

df_x = pd.read_excel('/result/lineage/' + xlsx_file)
# One row per distinct (target table/view, source table/view) pair.
df_y = df_x.loc[:, ['target_schema', 'target_tbvw', 'target_tid',
                    'source_schema', 'source_tbvw', 'source_tid']].drop_duplicates()

# Write the summary, copy it into the working directory and upload it.
df_y.to_excel(summary_path, index=False)
copyfile(summary_path, xlsx_file_summary)
box.save2box_folder(client, folder_id, xlsx_file_summary)

### The following SQL identifies all invalid entries in the C09 table, i.e. entries whose TID has already been sunset

In [None]:
# df_x = pd.read_excel('/result/lineage/odmviews_final_filtered.xlsx')