<b>Sim Models Metadata Notebook</b><br>
 - JSONL file to be read line-by-line for each file case.

<b>Libraries</b>

In [2]:
import json
import ast
import string
import spacy
import pandas as pd
import os
print( os.getcwd() )


c:\Users\jchown\projects\metadata_cleanup


<b>Global Variables</b>

In [3]:
trained_nlp = spacy.load("./mpn_nlp_model")


<b>Read JSONL file contents into list</b>

In [4]:
with open('sim_models_api_feed.json', 'rt') as file_handle:
    file_data = json.loads( file_handle.read() )

metadata_case_list = file_data['hits']


In [5]:
metadata_case_list[0]

{'jcr:path': '/content/dam/micron/global/secure/products/sim-model/modules/ddr5/mtc20f2085s1rc64bd1-thermal.zip',
 'jcr:primaryType': 'dam:Asset',
 'jcr:mixinTypes': ['mix:referenceable'],
 'jcr:createdBy': 'sdanny@micron.com',
 'jcr:created': 'Fri Jan 26 2024 20:30:22 GMT+0000',
 'jcr:uuid': '432ca891-da31-4c4a-b600-80c3f6974162',
 'jcr:content': {'jcr:primaryType': 'dam:AssetContent',
  'jcr:mixinTypes': ['cq:ReplicationStatus'],
  'mt:documentID': '432ca891-da31-4c4a-b600-80c3f6974162',
  'cq:lastReplicationAction': 'Activate',
  'mt:documentSupport': 'web_support',
  'migratedFrom': 'micron-aws-s3-aem:prod/sites-dam/micron/global/secure-latest/products/sim-model/modules/ddr5/mtc20f2085s1rc64bd1-thermal.zip',
  'cq:lastReplicatedBy': 'workflow-process-service',
  'fmUuid': 'GUID-acedaa40-b2b5-45f7-b524-999fd31a8f7e',
  'jcr:lastModifiedBy': 'sdanny@micron.com',
  'mt:securityLevel': '5',
  'cq:lastReplicated': 'Fri Aug 09 2024 09:10:38 GMT+0000',
  'names': 'mtc20f2085s1rc64bd1-ther

<b>Remove non-printable characters</b>

In [6]:
def remove_non_printable(text: str):
    return ''.join( filter(str.isprintable, str(text)) )


<b>Simplify text removing punctuation and reducing whitespace</b>

In [7]:
def simplify_text(text: str) -> str:
    if text == 'none':
        return ''
    simplified_text = str(text).translate( str.maketrans( dict.fromkeys(string.punctuation, ' ') ) )
    return ' '.join( [ text_item for text_item in simplified_text.split() if len(text_item) > 0 ] )


<b>Parse the subject field into a list of subject strings</b>

In [8]:
def parse_subject(text: str) -> str:
    try:
        text_list = ast.literal_eval(text)
    
    except Exception as _e:
        text_list = [text]
    
    output_list = list()
    for text_item in text_list:
        if text_item == 'none' or len(text_item) == 0:
            continue
        clean_text_item = simplify_text(text_item)
        output_list += [ text for text in clean_text_item.split() if len(text) > 0 ]

    unique_output_list_ordered = list()
    for text_item in output_list:
        if text_item not in unique_output_list_ordered:
            unique_output_list_ordered.append(text_item)
    
    return ' '.join(unique_output_list_ordered)


<b>Parse the products field into a list of product strings</b>

In [9]:
def parse_products(text: str) -> list:
    try:
        text_list = ast.literal_eval(text)
    
    except Exception as _e:
        text_list = [text]
    
    output_list = list()
    for text_item in text_list:
        if text_item == 'none' or len(text_item) == 0:
            continue
        partial_text_item = ' '.join( text_item.split('/')[-2:] )
        clean_text_item = simplify_text(partial_text_item)
        output_list += [ text for text in clean_text_item.split() if len(text) > 0 ]

    unique_product_list = list()
    for text_item in output_list:
        if text_item not in unique_product_list:
            unique_product_list.append(text_item)

    return unique_product_list


<b>Parse the doc category for AEM docs</b>

In [10]:
def parse_aem_doc_category(text: str, url: str) -> str:
    
    if 'data sheet' in text or 'data-sheet' in url:
        return 'data sheet'
    
    if 'technical note' in text or 'technical-note' in url:
        return 'tech note'
    
    return 'other'


<b>Preprocessor for NER</b>

In [11]:
def preprocess_ner_text(text: str) -> str:
    output_text = remove_non_printable( text.lower() )
    output_text = simplify_text(output_text)
    output_text = output_text.replace('–', ' ').replace('\r\n', '').strip()
    return output_text


<b>Postprocessor for NER</b>

In [12]:
def postprocess_ner_extracts(ner_list: list) -> list:
    if len(ner_list) < 1:
        return list()
    
    output_list = list()
    for list_item in ner_list:
        output_list += list_item.split()
    
    return output_list


<b>NER for MPN</b>

In [13]:
def ner_mpn(input_data) -> list:
    if isinstance(input_data, str):
        input_data_list = [input_data]
    elif isinstance(input_data, list):
        input_data_list = input_data
    else:
        raise ValueError("input_data argument must be str or list type!")

    mpn_list = list()
    for text in input_data_list:
        for sub_text in str(text).split():
            if len(sub_text) > 0:
                doc = trained_nlp( str(sub_text) )
                mpn_list += [ str(mpn.text).strip() for mpn in doc.ents if doc.ents and "MPN" in mpn.label_ and len(mpn.text) > 4 and mpn.text != "manufacturer" ]
    
    return mpn_list


<b>Rule-Based NER</b>

In [14]:
def rule_based_ner(input_str: str, string_rules: list) -> list:
    output_list = list()
    for string_rule in string_rules:
        if str(string_rule).lower() in str(input_str).lower():
            output_list.append( str(string_rule).lower() )
    return output_list


In [28]:
def rule_based_ner_data_source(input_str_dict: dict, string_rules: list) -> str:

    output_list = list()

    # MT Content Type
    split_mt_content_type = input_str_dict['mt_content_type'].split('/')
    if split_mt_content_type:
        for string_rule in string_rules:
            if string_rule in split_mt_content_type:
                output_list.append( str(string_rule).lower() )
    
    # Thermal type is also in the jcr:path.
    if 'thermal' in input_str_dict['jcr_path']:
        output_list.append('thermal')
    
    unique_output_list = list( set(output_list) )
    return ' '.join(unique_output_list)


In [27]:
rules_df = pd.read_excel('entity_groups.xlsx')
BU_NER_RULES = ['ebu', 'sbu', 'cnbu', 'mbu', 'cpg']
FUNCTIONAL_TECHNOLOGY_RULES = rules_df.loc[~pd.isna(rules_df['Functional Technology']), 'Functional Technology'].to_list()
MARKET_SUBSEGMENT_RULES = rules_df.loc[~pd.isna(rules_df['Segments']), 'Segments'].to_list()
PACKAGE_TYPE_RULES = rules_df.loc[~pd.isna(rules_df['Package Type']), 'Package Type'].to_list()
DENSITY_RULES = rules_df.loc[~pd.isna(rules_df['Density']), 'Density'].to_list()
PRODUCT_SERIES_RULES = rules_df.loc[~pd.isna(rules_df['Product Series']), 'Product Series'].to_list()
PROCESS_REQUIREMENT_RULES = rules_df.loc[~pd.isna(rules_df['Process Requirements']), 'Process Requirements'].to_list()
DESIGN_ID_RULES = rules_df.loc[~pd.isna(rules_df['Design ID']), 'Design ID'].to_list()
DOC_CATEGORY_RULES = ['ibis', 'verilog', 'hspice', 'spice', 's-parameter', 'simulation-model', 'hyperlynx', 'ibis-ami', 'system-verilog', 'thermal', 'sisoft',]


<b>Build NER Data</b>

In [16]:
def build_ner_data(text_list: list, method: str) -> list:
    if method == 'mpn':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( ner_mpn( preprocess_ner_text(text_item) ) )
        return list( set(output_list) )

    elif method == 'bu':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), BU_NER_RULES) )
        return list( set(output_list) )
    
    elif method == 'functional_technology':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), FUNCTIONAL_TECHNOLOGY_RULES) )
        return list( set(output_list) )

    elif method == 'market_subsegment':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), MARKET_SUBSEGMENT_RULES) )
        return list( set(output_list) )

    elif method == 'package_type':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), PACKAGE_TYPE_RULES) )
        return list( set(output_list) )

    elif method == 'product_series':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), PRODUCT_SERIES_RULES) )
        return list( set(output_list) )

    elif method == 'process_requirement':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), PROCESS_REQUIREMENT_RULES) )
        return list( set(output_list) )

    elif method == 'design_id':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), DESIGN_ID_RULES) )
        return list( set(output_list) )

    elif method == 'density':
        output_list = list()
        for text_item in text_list:
            output_list += postprocess_ner_extracts( rule_based_ner(preprocess_ner_text(text_item), DENSITY_RULES) )
        return list( set(output_list) )

    raise ValueError("Method of NER extraction incorrect! Must be mpn, bu, functional_technology, market_subsegment, package_type, product_series, process_requirement, or design_id!")


<b>TEST Metadata Cleanup</b><br>
<code>},
    {
      "jcr:path": "/content/dam/micron/global/secure/products/sim-model/modules/ddr4/mta18asf2g72hz-3g2b1-ebd.zip",
       ],
      "jcr:created": "Fri Jan 26 2024 20:29:06 GMT+0000",
      "jcr:uuid": "1fe3ff3f-3cdc-4bf6-ad24-d07a34c06b1e",
        "mt:documentID": "1fe3ff3f-3cdc-4bf6-ad24-d07a34c06b1e",
        "mt:documentSupport": "dram_support",
        "names": "mta18asf2g72hz-3g2b1-ebd.zip",
        "jcr:lastModified": "Mon Dec 16 2024 18:32:50 GMT+0000",
        "names-2": "mta18asf2g72hz-3g2b1-ebd",
        metadata":
        "mt:rba": [
          "cnbu"
          ],
          "dc:description": "MTA18ASF2G72HZ-3G2B1",
          "mt:contentType": "micron:document type-categories-subcategories/sim-model/ibis",
          "dam:Content": "MTA18ASF2G72HZ-3G2B1_ebd/clk_termination.JPG\nMTA18ASF2G72HZ-3G2B1_ebd/MTA18ASF2G72HZ-3G2B1.ebd\nMTA18ASF2G72HZ-3G2B1_ebd/MTA18ASF2G72HZ-3G2B1.ibs\nMTA18ASF2G72HZ-3G2B1_ebd/MTA18ASF2G72HZ-3G2B1_ebd_readme.txt\nMTA18ASF2G72HZ-3G2B1_ebd/resistor_d.ibs\nMTA18ASF2G72HZ-3G2B1_ebd/z22a.ibs\nMTA18ASF2G72HZ-3G2B1_ebd/z22a_ext_model_quality_rpt.pdf\n",
          "dc:title": "DDR4 IBIS EBD Model"
</code>          

In [17]:
test_case = metadata_case_list[0]
test_case.keys()

dict_keys(['jcr:path', 'jcr:primaryType', 'jcr:mixinTypes', 'jcr:createdBy', 'jcr:created', 'jcr:uuid', 'jcr:content'])

In [24]:
print('JCR PATH: ', remove_non_printable( test_case.get('jcr:path', '') ).lower() )
print('DC:DESCRIPTION: ', test_case.get('jcr:content', {}).get('metadata', {}).get('dc:description', ''))
print('MT:CONTENTTYPE: ', test_case.get('jcr:content', {}).get('metadata', {}).get('mt:contentType', [''])[0])
print('DC:TITLE: ', test_case.get('jcr:content', {}).get('metadata', {}).get('dc:title', ''))


JCR PATH:  /content/dam/micron/global/secure/products/sim-model/modules/ddr5/mtc20f2085s1rc64bd1-thermal.zip
DC:DESCRIPTION:  
MT:CONTENTTYPE:  micron:document type-categories-subcategories/sim-model
DC:TITLE:  


In [None]:
case_number = 0
test_clean_metadata_list = list()
for test_case in metadata_case_list:
    case_number += 1
    print( f"Working on case {case_number} of {len(metadata_case_list)}")
    test_clean_case = dict()

    # Baseline keys.
    test_clean_case['id'] = test_case.get('jcr:uuid', '')
    test_clean_case['content'] = "DE TEAM TO POPULATE!!!" # GCS path + MIME

    # jsonData construction.
    test_clean_case_jsonData = dict()

    # Core metadata fields.
    test_clean_case_jsonData['doc_type'] = 'sim-model'
    test_clean_case_jsonData['file_name'] = remove_non_printable( test_case.get('jcr:path', '') ).split('/')[-1].replace('_', '-').replace('/', '-').replace('\r\n', '')
    test_clean_case_jsonData['url'] = 'https://www.micron.com' + remove_non_printable( test_case.get('jcr:path', '') )
    test_clean_case_jsonData['pillar'] = "sales"
    test_clean_case_jsonData['language'] = "{'English': 99.0}"
    test_clean_case_jsonData['deleted'] = "no"
    test_clean_case_jsonData['author'] = remove_non_printable( str( test_case.get('jcr:createdBy', '') ).replace("['", '').replace("']", '') )
    test_clean_case_jsonData['last_modified_time'] = remove_non_printable( test_case.get('jcr:created', '') )
    test_clean_case_jsonData['last_modified_unix_time'] = "DE TEAM TO POPULATE!!!"
    test_clean_case_jsonData['title'] = remove_non_printable( test_case.get('jcr:content', {}).get('names', '') ).replace('_', '-').replace('/', '-').replace('\r\n', '')
    test_clean_case_jsonData['file_type'] = "DE TEAM TO POPULATE!!!"
    test_clean_case_jsonData['doc_id'] = remove_non_printable( test_case.get('jcr:uuid', '') ) # DE TEAM MAY NEED TO POPULATE!!!

    # Doc IDs for DPC - DE TEAM TO VALIDATE AND POPULATE WITH CORRECT INFORMATION!!!
    test_clean_case_jsonData['aem_object_id'] = remove_non_printable( test_case.get('jcr:uuid', '') )
    test_clean_case_jsonData['aem_uuid'] = remove_non_printable( test_case.get('jcr:uuid', '') )

    # Doc category    
    test_clean_case_jsonData['doc_category'] = rule_based_ner_data_source(
        {
            'mt_content_type': test_case.get('jcr:content', {}).get('metadata', {}).get('mt:contentType', [''])[0],
            'jcr_path': remove_non_printable( test_case.get('jcr:path', '') ).lower(),
        },
        DOC_CATEGORY_RULES,
    )
    # parse_aem_doc_category(
    #     simplify_text( remove_non_printable( test_case['jsonData'].get('doc_category', '') ).lower().replace(',', ' ').replace('-', ' ').replace('–', ' ').strip() ),
    #     test_case['jsonData']['url'],
    # )

    # Optional AEM metadata fields.
    test_clean_case_jsonData['description'] = ' '.join([
        simplify_text( remove_non_printable( test_case.get('jcr:content', {}).get('metadata', {}).get('dc:description', '') ).lower().replace(',', ' ').replace('-', ' ').replace('–', ' ').strip() ),
    ])
    test_clean_case_jsonData['subject'] = ' '.join([
        simplify_text( remove_non_printable( test_case.get('jcr:content', {}).get('metadata', {}).get('dc:title', '') ).lower().replace(',', ' ').replace('-', ' ').replace('–', ' ').strip() ),
        simplify_text( remove_non_printable( test_case.get('jcr:content', {}).get('metadata', {}).get('dam:Content', '')[:100] ).lower().replace(',', ' ').replace('-', ' ').replace('–', ' ').strip() ),
    ])
    test_clean_case_jsonData['products'] = simplify_text( remove_non_printable( test_case.get('jcr:content', {}).get('metadata', {}).get('dam:Content', '')[100:200] ).lower().replace(',', ' ').replace('-', ' ').replace('–', ' ').strip() )
    test_clean_case_jsonData['optional_text_field_1'] = ' '.join(['',])
    test_clean_case_jsonData['optional_text_field_2'] = ' '.join(['',])
    test_clean_case_jsonData['optional_text_field_3'] = ' '.join(['',])
    test_clean_case_jsonData['optional_text_field_4'] = ' '.join(['',])
    test_clean_case_jsonData['optional_text_field_5'] = ' '.join(['',])

    # NER metadata.
    test_clean_case_jsonData['business_unit_list'] = build_ner_data(test_case.get('jcr:content', {}).get('metadata', {}).get('mt:rba', ''), 'bu')
    test_clean_case_jsonData['market_subsegment_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'market_subsegment',
    )
    test_clean_case_jsonData['mpn_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'mpn',
    )
    test_clean_case_jsonData['design_id_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'design_id',
    )
    test_clean_case_jsonData['functional_technology_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'functional_technology',
    )
    test_clean_case_jsonData['package_type_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'package_type',
    )
    test_clean_case_jsonData['density_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'density',
    )
    test_clean_case_jsonData['product_series_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'product_series',
    )
    test_clean_case_jsonData['process_requirement_list'] = build_ner_data(
        [
            test_clean_case_jsonData['file_name'],
            test_clean_case_jsonData['url'],
            test_clean_case_jsonData['description'],
            test_clean_case_jsonData['subject'],
            test_clean_case_jsonData['products'],
            test_clean_case_jsonData['optional_text_field_1'],
            test_clean_case_jsonData['optional_text_field_2'],
            test_clean_case_jsonData['optional_text_field_3'],
            test_clean_case_jsonData['optional_text_field_4'],
            test_clean_case_jsonData['optional_text_field_5'],
        ],
        'process_requirement',
    )

    test_clean_case['jsonData'] = test_clean_case_jsonData
    
    test_clean_metadata_list.append(test_clean_case)


Working on case 1 of 1093
Working on case 2 of 1093
Working on case 3 of 1093
Working on case 4 of 1093
Working on case 5 of 1093
Working on case 6 of 1093
Working on case 7 of 1093
Working on case 8 of 1093
Working on case 9 of 1093
Working on case 10 of 1093
Working on case 11 of 1093
Working on case 12 of 1093
Working on case 13 of 1093
Working on case 14 of 1093
Working on case 15 of 1093
Working on case 16 of 1093
Working on case 17 of 1093
Working on case 18 of 1093
Working on case 19 of 1093
Working on case 20 of 1093
Working on case 21 of 1093
Working on case 22 of 1093
Working on case 23 of 1093
Working on case 24 of 1093
Working on case 25 of 1093
Working on case 26 of 1093
Working on case 27 of 1093
Working on case 28 of 1093
Working on case 29 of 1093
Working on case 30 of 1093
Working on case 31 of 1093
Working on case 32 of 1093
Working on case 33 of 1093
Working on case 34 of 1093
Working on case 35 of 1093
Working on case 36 of 1093
Working on case 37 of 1093
Working on

In [33]:
list( set([ clean_test_case['jsonData']['doc_category'] for clean_test_case in test_clean_metadata_list ]) )


['',
 'hspice',
 'ibis',
 'spice',
 'thermal',
 'system-verilog',
 's-parameter',
 'ibis-ami',
 'verilog']