### Import Libraries

In [1]:
# For data manipulation
import re
import json
import pandas as pd
from pandas import json_normalize

# For NLP
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

### Data Preparation

In [3]:
with open('data.json', 'r') as f:
    data = json.load(f)

flat_data = json_normalize(data, record_path=['data']) # flatten the data

In [4]:
flat_data

Unnamed: 0,content,id,ref,type,bab,bagian,paragraf,pasal,level,context,additional_context,chunks,source_token_length,buku,alias,term
0,,,,,,,,,,,,,,,,
1,"BERITA NEGARA REPUBLIK INDONESIA No.920, 2017 ...",,,,,,,,,,,,,,,
2,PERATURAN MENTERI KEUANGAN REPUBLIK INDONESIA ...,,,,,,,,,,,,,,,
3,DENGAN RAHMAT TUHAN YANG MAHA ESA,,,,,,,,,,,,,,,
4,"MENTERI KEUANGAN REPUBLIK INDONESIA,",,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2342,SRI MULYANI INDRAWATI,,,,,,,,,,,,,,,
2343,Diundangkan di Jakarta pada tanggal 7 Juli 2017,,,,,,,,,,,,,,,
2344,DIREKTUR JENDERAL PERATURAN PERUNDANG-UNDANGAN...,,,,,,,,,,,,,,,
2345,ttd,,,,,,,,,,,,,,,


In [5]:
flat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2347 entries, 0 to 2346
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   content              2347 non-null   object 
 1   id                   1153 non-null   object 
 2   ref                  1201 non-null   object 
 3   type                 2044 non-null   object 
 4   bab                  1524 non-null   object 
 5   bagian               126 non-null    object 
 6   paragraf             0 non-null      float64
 7   pasal                1533 non-null   object 
 8   level                1153 non-null   float64
 9   context              801 non-null    object 
 10  additional_context   801 non-null    object 
 11  chunks               891 non-null    object 
 12  source_token_length  801 non-null    float64
 13  buku                 0 non-null      float64
 14  alias                90 non-null     object 
 15  term                 90 non-null     o

In [6]:
# Set up the IndoBERT NER model
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def extract_money(text):
    """
    Extracts money values from the given text.

    Args:
        text (str): The text from which to extract money values.

    Returns:
        list: A list of money values found in the text.
    """
    money_pattern = re.compile(r'Rp\s?\d{1,3}(?:\.\d{3})*(?:,\d{0,2})?\s?')
    return money_pattern.findall(text)

def extract_dates(text):
    """
    Extracts dates from the given text.

    Args:
        text (str): The text from which dates need to be extracted.

    Returns:
        list: A list of dates extracted from the text.
    """
    date_pattern = re.compile(r'\b(?:\d{1,2}\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\s\d{4})\b')
    return date_pattern.findall(text)

def extract_prohibitions(text):
    """
    Extracts the list of prohibitions from the given text.

    Args:
        text (str): The text from which to extract the prohibitions.

    Returns:
        list: A list of prohibitions extracted from the text.
    """
    prohibition_pattern = re.compile(r'\b(dilarang)\b:?\s*(\w\.\s*[^;]+;?|\w\.\s*[^;]+dan)', re.IGNORECASE | re.MULTILINE)
    prohibitions = prohibition_pattern.findall(text)
    prohibitions_list = [item[1].strip() for item in prohibitions]
    return prohibitions_list

def extract_key_terms(entity_list):
    """Extracts key terms from a list of entities for summarization.

    This function takes a list of entities and filters out the ones that belong to the 'TERM' or 'KEYWORD' entity group.
    It returns a list of the words corresponding to these key terms.

    Args:
        entity_list (list): A list of dictionaries representing entities.

    Returns:
        list: A list of words corresponding to the key terms.
    """
    return [entity['word'] for entity in entity_list if entity['entity_group'] in ('TERM', 'KEYWORD')]

def apply_ner(text, tokenizer, model):
    """Apply Named Entity Recognition (NER) on the given text.
       Updated apply_ner function to work safely with transformers pipeline
       
    Args:
        text (str): The input text to perform NER on.
        tokenizer (Tokenizer): The tokenizer used to tokenize the text.
        model (Model): The NER model used for prediction.

    Returns:
        list: A list of predicted named entities in the text.
    """
    ner_results = []
    if text.strip():  # Ensure text is not empty
        tokenized_input = tokenizer(text, truncation=True, max_length=512, padding='max_length', return_tensors="pt")
        output = model(**tokenized_input)
        ner_results = [model.config.id2label[prediction] for prediction in output.logits.argmax(dim=-1).flatten().tolist()]
        ner_results = ner_pipeline(tokenized_input)
    return ner_results

flat_data = flat_data.dropna(subset=['content'])

def process_record(record, tokenizer, model):
    """Process a record by extracting relevant information.

    Args:
        record (dict): A dictionary representing a record with 'content' as one of the keys.
        tokenizer: The tokenizer object used for tokenization.
        model: The model object used for NER (Named Entity Recognition).

    Returns:
        dict: A dictionary containing the processed record with the following keys:
            - 'content': The original content of the record.
            - 'money': A list of extracted money values.
            - 'dates': A list of extracted dates.
            - 'key_terms': A list of extracted key terms.
            - 'prohibitions': A list of extracted prohibitions.
    """
    content = record['content']

    # Check if content is empty and handle it appropriately
    if not content:
        return {
            'content': content,
            'money': [],
            'dates': [],
            'key_terms': [],
            'prohibitions': []
        }

    record['money'] = extract_money(content)
    record['dates'] = extract_dates(content)
    record['prohibitions'] = extract_prohibitions(content)
    ner_results = apply_ner(content, tokenizer, model)
    record['key_terms'] = extract_key_terms(ner_results)

    return record

# Apply the process_record function to each row in the DataFrame
processed_data = flat_data.apply(lambda x: process_record(x, tokenizer, model), axis=1)

# Display the results or continue with more 
print(processed_data[['content', 'money', 'dates', 'key_terms', 'prohibitions']])


ValueError: At least one input is required.