### Import Libraries

In [3]:
# For data manipulation
import re
import json
import pandas as pd
from pandas import json_normalize

# For NLP
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

### Data Preparation

In [4]:
with open('data.json', 'r') as f:
    data = json.load(f)

flat_data = json_normalize(data, record_path=['data']) # flatten the data

In [5]:
flat_data

Unnamed: 0,content,id,ref,type,bab,bagian,paragraf,pasal,level,context,additional_context,chunks,source_token_length,buku,alias,term
0,,,,,,,,,,,,,,,,
1,"BERITA NEGARA REPUBLIK INDONESIA No.920, 2017 ...",,,,,,,,,,,,,,,
2,PERATURAN MENTERI KEUANGAN REPUBLIK INDONESIA ...,,,,,,,,,,,,,,,
3,DENGAN RAHMAT TUHAN YANG MAHA ESA,,,,,,,,,,,,,,,
4,"MENTERI KEUANGAN REPUBLIK INDONESIA,",,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2342,SRI MULYANI INDRAWATI,,,,,,,,,,,,,,,
2343,Diundangkan di Jakarta pada tanggal 7 Juli 2017,,,,,,,,,,,,,,,
2344,DIREKTUR JENDERAL PERATURAN PERUNDANG-UNDANGAN...,,,,,,,,,,,,,,,
2345,ttd,,,,,,,,,,,,,,,


In [6]:
flat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2347 entries, 0 to 2346
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   content              2347 non-null   object 
 1   id                   1153 non-null   object 
 2   ref                  1201 non-null   object 
 3   type                 2044 non-null   object 
 4   bab                  1524 non-null   object 
 5   bagian               126 non-null    object 
 6   paragraf             0 non-null      float64
 7   pasal                1533 non-null   object 
 8   level                1153 non-null   float64
 9   context              801 non-null    object 
 10  additional_context   801 non-null    object 
 11  chunks               891 non-null    object 
 12  source_token_length  801 non-null    float64
 13  buku                 0 non-null      float64
 14  alias                90 non-null     object 
 15  term                 90 non-null     o

In [7]:
# Set up the IndoBERT NER model
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model="cahya/bert-base-indonesian-522M", tokenizer="cahya/bert-base-indonesian-522M")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at cahya/bert-base-indonesian-522M and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def extract_prohibitions(text):
    """
    Extracts the list of prohibitions from the given text.

    Args:
        text (str): The text from which to extract the prohibitions.

    Returns:
        list: A list of prohibitions extracted from the text.
    """
    prohibition_pattern = re.compile(r'\bDilarang:\s*([^;]+)', re.IGNORECASE)
    prohibitions = prohibition_pattern.findall(text)
    return prohibitions

def extract_dates(text):
    """
    Extracts dates from the given text.

    Args:
        text (str): The text from which dates need to be extracted.

    Returns:
        list: A list of dates extracted from the text.
    """
    date_pattern = re.compile(r'\b(?:\d{1,2}\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\s\d{4})\b')
    return date_pattern.findall(text)

import re

def extract_money(text):
    """
    Extracts money or monetary keywords from the given text, including multiple currencies.

    Args:
        text (str): The text from which to extract money or monetary keywords.

    Returns:
        list: A list of money or monetary keywords extracted from the text, including Rupiah, USD, and other major currencies.
    """
    patterns = {
        'RP': re.compile(r'\b(?:\(?Rp\s*(?:\d{1,3}(?:[,.]\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)\)?|\(?Rp\s*(?:nol|nol,? nol)\)?\s*\(?Rupiah\)?)\b', re.IGNORECASE),
        'USD': re.compile(r"(USD)([+-]?[0-9]{1,3}(,?[0-9]{3})*)(\.[0-9]{1,4})"),
        'EUR': re.compile(r"(€|EUR)([+-]?[0-9]{1,3}(,?[0-9]{3})*)(\.[0-9]{1,4})"),
        'GBP': re.compile(r"(£|GBP)([+-]?[0-9]{1,3}(,?[0-9]{3})*)(\.[0-9]{1,4})"),
        'JPY': re.compile(r"(¥|JPY)([+-]?[0-9]{1,3}(,?[0-9]{3})*)(\.?[0-9]{0,4})"),  # Yen typically doesn't use decimal places for common transactions
    }

    matches = []
    for currency, pattern in patterns.items():
        found_matches = pattern.findall(text)
        for match in found_matches:
            matches.append(''.join(match))

    # Convert word numbers to digits for Rupiah
    word_to_number = {
        'satu': '1', 'dua': '2', 'tiga': '3', 'empat': '4', 'lima': '5',
        'enam': '6', 'tujuh': '7', 'delapan': '8', 'sembilan': '9', 'nol': '0',
        'ribu': '000', 'juta': '000000', 'miliar': '000000000', 'triliun': '000000000000'
    }

    numerical_matches = []
    for match in matches:
        for word, value in word_to_number.items():
            if 'Rp' in match:  # Apply word-to-number conversion only for Rupiah
                match = match.replace(word, value)
        numerical_matches.append(match)

    return numerical_matches


def apply_ner(text, ner_pipeline, max_length=512):
    """Apply Named Entity Recognition (NER) using the transformers pipeline on the given text.
    This version supports processing of text longer than 512 tokens by splitting the text into
    manageable parts and then combining the results.

    Args:
        text (str): The input text to perform NER on.
        ner_pipeline (pipeline): The NER pipeline for prediction.
        max_length (int): Maximum length of tokens to process in a single call to the NER pipeline.

    Returns:
        list: A list of predicted named entities in the text.
    """
    if not text.strip():  # Check if text is empty
        return []

    # Initialize variables
    split_texts = [text[i:i+max_length] for i in range(0, len(text), max_length)]
    ner_results = []

    # Process each split text
    for split_text in split_texts:
        results = ner_pipeline(split_text)
        ner_results.extend(results)

    return ner_results


flat_data = flat_data.dropna(subset=['content'])

def process_record(record, ner_pipeline):
    """Process a record by extracting relevant information.

    Args:
        record (pd.Series): A pandas Series representing a record with 'content' as one of the keys.
        ner_pipeline: The NER pipeline object used for Named Entity Recognition.

    Returns:
        dict: A dictionary containing the processed record with the following keys:
            - 'content': The original content of the record.
            - 'money': A list of extracted money values.
            - 'dates': A list of extracted dates.
            - 'prohibitions': A list of extracted prohibitions.
            - 'named_entities': A list of named entities recognized by NER.
    """
    content = record['content']

    if not content:
        return {
            'content': content,
            'money': [],
            'dates': [],
            'prohibitions': [],
            'named_entities': []
        }

    result_dict = {
        'content': content,
        'money': extract_money(content),
        'dates': extract_dates(content),
        'prohibitions': extract_prohibitions(content),
    }
    ner_results = apply_ner(content, ner_pipeline)

    return result_dict



In [14]:
# Ensure you have a DataFrame 'flat_data' with a 'content' column available.
# Create a new DataFrame to store the processed records
processed_records = [process_record(row, ner_pipeline) for index, row in flat_data.iterrows()]
processed_df = pd.DataFrame(processed_records)

# Now `processed_df` contains the original content and the extracted information.
print(processed_df)  # To inspect the first few records.

                                                content money          dates  \
0                                                          []             []   
1     BERITA NEGARA REPUBLIK INDONESIA No.920, 2017 ...    []             []   
2     PERATURAN MENTERI KEUANGAN REPUBLIK INDONESIA ...    []             []   
3                     DENGAN RAHMAT TUHAN YANG MAHA ESA    []             []   
4                  MENTERI KEUANGAN REPUBLIK INDONESIA,    []             []   
...                                                 ...   ...            ...   
2342                              SRI MULYANI INDRAWATI    []             []   
2343    Diundangkan di Jakarta pada tanggal 7 Juli 2017    []  [7 Juli 2017]   
2344  DIREKTUR JENDERAL PERATURAN PERUNDANG-UNDANGAN...    []             []   
2345                                                ttd    []             []   
2346                                 WIDODO EKATJAHJANA    []             []   

     prohibitions named_entities  
0   

In [16]:
# #drop named_entities column
# processed_df = processed_df.drop(columns='named_entities')
for col in processed_df.columns:
    processed_df[col] = processed_df[col].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [18]:
#unique column money, dates, prohibitions and print all of them with for loop
for col in ['money', 'dates', 'prohibitions']:
    print(f'Unique values in {col} column:')
    print(processed_df[col].unique())
    print()

Unique values in money column:
['' 'Rp0, Rp0']

Unique values in dates column:
['' '8 Juni 2017' '5 Juli 2017' '7 Juli 2017' '16 Januari 2023'
 '17 Januari 2023' '23 April 2019' '18 Oktober 2019']

Unique values in prohibitions column:
[''
 'a. menggunakan kewenangan yang dimiliki untuk kepentingan pribadi, keluarga, dan/atau golongan']



In [19]:
processed_df.to_csv('processed_data.csv', index=False)