# Run run_ner_indobert.ipynb

Credits: the cleaned train and test data come from
https://www.kaggle.com/zeyalt/scl-2021-data-science-part-1-data-cleaning/comments?select=cleaned_test.csv

In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from functools import partial
# Rebind `tqdm` so that every progress bar created later in the notebook gets
# position=0 and leave=True without repeating those arguments at each call site.
tqdm = partial(tqdm, position=0, leave=True)

In [None]:
%%capture
# Download transformers  and install required packages.
# The %%capture magic above hides all output of this cell, including any
# git/pip error messages.
# NOTE(review): the clone is not pinned to a tag or commit, so the installed
# version depends on when the cell is run — confirm which release the scripts
# under /kaggle/input/scl2021-src/ expect.
!git clone https://github.com/huggingface/transformers
%cd transformers
!pip install .
!pip install -r ./examples/_tests_requirements.txt 
%cd ..

# Data

Attach this Kaggle dataset as a notebook input (the cells below read it from `/kaggle/input/scl2021data/`):
https://www.kaggle.com/numerator/scl2021data

In [None]:
# Read the cleaned training set, then replace every missing value with an
# empty string.
train_df = pd.read_csv('/kaggle/input/scl2021data/cleaned_train.csv')
train_df = train_df.fillna('')
train_df.head()

In [None]:
# Read the cleaned test set, then replace every missing value with an empty
# string.
test_df = pd.read_csv('/kaggle/input/scl2021data/cleaned_test.csv')
test_df = test_df.fillna('')
test_df.head()

In [None]:
# Train/dev split - 90:10.
# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame by position. Positional slicing yields the same two frames as
# np.split() did, without going through DataFrame.swapaxes, which pandas has
# deprecated.
DEV_SPLIT_AT = int(.9 * len(train_df))
shuffled_train_df = train_df.sample(frac=1, random_state=42)
train = shuffled_train_df.iloc[:DEV_SPLIT_AT]
dev = shuffled_train_df.iloc[DEV_SPLIT_AT:]

In [None]:
# Save the held-out dev rows to dev.csv (relative path, index column omitted).
dev.to_csv('dev.csv', index=False)

In [None]:
# Process raw data by assigning entity POI or ST or O to each word
def preprocess_train_data(raw_df, output_name):
    """Write one ``<word> <tag> `` line per address word to ``output_name``.

    For every row, ``raw_address`` has its commas removed and is split on
    whitespace. Each resulting word is tagged:

    * ``POI`` if it is a substring of any whitespace-separated word of the
      row's ``POI`` value,
    * otherwise ``ST`` if it is a substring of any word of the row's
      ``street`` value,
    * otherwise ``O``.

    Args:
        raw_df: DataFrame whose rows provide the string fields
            ``raw_address``, ``POI`` and ``street``. An empty string means the
            row has no such entity.
        output_name: Path of the text file to create. The file is opened in
            write mode, so re-running the cell regenerates it instead of
            appending a second copy of the data.

    Returns:
        None. The only effect is the file written to ``output_name``.
    """
    with open(output_name, 'w') as text_file:
        for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
            address = row['raw_address'].replace(",", "").split()
            # ''.split() is [], so a row without a POI/street never matches.
            # Both lists are rebuilt for every row: a row with an empty street
            # must not fall back to an undefined or previous-row value.
            poi = row['POI'].split()
            street = row['street'].split()
            for address_word in address:
                if any(address_word in p for p in poi):
                    tag = 'POI'
                elif any(address_word in s for s in street):
                    tag = 'ST'
                else:
                    tag = 'O'
                text_file.write(address_word + ' ' + tag + ' \n')

# Write the word-level tag files for both splits; the preprocessing cell later
# passes these two files to preprocess.py.
preprocess_train_data(train, 'train_temp.txt')
preprocess_train_data(dev, 'dev_temp.txt')

In [None]:
# Quick look at the generated training file.
# NOTE(review): read_table() is called without header=None, so the first line
# of the file is presumably consumed as the column header rather than shown as
# a data row — confirm.
train_temp_txt = pd.read_table('train_temp.txt')
print(train_temp_txt)

In [None]:
# Write each test address as one word per line and record its line span
def preprocess_test_data(raw_df, output_name):
    """Write one address word per line and record each row's word span.

    For every row, ``raw_address`` has its commas removed and is split on
    whitespace; each word is written on its own line of ``output_name``.
    Two running counters are stored per row:

    * ``start``: 1 + the number of words written before this row,
    * ``end``: the number of words written up to and including this row.

    A row whose address has no words therefore gets ``end == start - 1``.

    Args:
        raw_df: DataFrame with a string column ``raw_address``. It is modified
            in place: the columns ``start`` and ``end`` are added/overwritten.
        output_name: Path of the text file to create. The file is opened in
            write mode because the counters always restart at 1; appending to
            a file left over from an earlier run would make them point at the
            wrong lines.

    Returns:
        The same ``raw_df`` object, now carrying ``start`` and ``end``.
    """
    starts = []
    ends = []
    start_counter = 1
    end_counter = 0
    with open(output_name, 'w') as text_file:
        for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
            starts.append(start_counter)
            address = row['raw_address'].replace(",", "").split()
            for address_word in address:
                text_file.write(address_word + '\n')
            start_counter += len(address)
            end_counter += len(address)
            ends.append(end_counter)
    # Assign both columns once, by position, instead of one .loc write per row:
    # much faster on large frames and unaffected by duplicate index labels.
    raw_df['start'] = pd.Series(starts, index=raw_df.index, dtype='int64')
    raw_df['end'] = pd.Series(ends, index=raw_df.index, dtype='int64')
    return raw_df

# NOTE: preprocess_test_data adds the 'start'/'end' columns to test_df itself
# and returns that same object, so test_processed_df and test_df are one frame.
test_processed_df = preprocess_test_data(test_df, 'test_temp.txt')
test_processed_df.head()


# Preprocess data

In [None]:
%%time
# Set parameters
MAX_LENGTH = 128 #@param {type: "integer"}
# NOTE(review): preprocess.py is given indobert-lite-base-p1 here, while the
# fine-tuning cell sets MODEL to indobert-large-p1 — confirm that the two
# cells are meant to use different models.
MODEL = "indobenchmark/indobert-lite-base-p1" #@param ["chriskhanhtran/spanberta", "bert-base-multilingual-cased", "indobenchmark/indobert-lite-base-p1", "indobenchmark/indobert-large-p1"]
# Directory that holds preprocess.py and run_ner.py. The `!` commands below
# reference it as $PATH; presumably IPython substitutes this Python variable
# rather than the shell's own PATH — confirm.
PATH = "/kaggle/input/scl2021-src/"
# Run preprocess.py on each word-per-line file, redirecting its standard
# output into the file that run_ner.py reads from --data_dir.
!python3 $PATH/preprocess.py train_temp.txt $MODEL $MAX_LENGTH > train.txt
!python3 $PATH/preprocess.py dev_temp.txt $MODEL $MAX_LENGTH > dev.txt
!python3 $PATH/preprocess.py test_temp.txt $MODEL $MAX_LENGTH > test.txt
# Generate labels.txt
# Takes the second space-separated field of every train/dev line, drops empty
# results, and keeps each distinct value once.
!cat train.txt dev.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
# Preview the generated files.
# NOTE(review): read_table() is called without header=None, so the first line
# of each file is presumably shown as the column header — confirm.
labels_txt = pd.read_table('labels.txt')
print(labels_txt)
train_txt = pd.read_table('train.txt')
print(train_txt)

# Fine-tuning Model

In [None]:
%%capture
# install required packages
# The %%capture magic above hides pip's output, including failures.
# NOTE(review): neither package is version-pinned, so the installed versions
# depend on when the cell is run.
!pip install seqeval
!pip install datasets

In [None]:
%%time
# Set training parameters
# NOTE: MAX_LENGTH and MODEL are assigned again here, replacing the values set
# in the preprocessing cell (MODEL changes from indobert-lite-base-p1 to
# indobert-large-p1).
MAX_LENGTH = 128 #@param {type: "integer"}
MODEL = "indobenchmark/indobert-large-p1" #@param ["chriskhanhtran/spanberta", "bert-base-multilingual-cased", "indobenchmark/indobert-lite-base-p1", "indobenchmark/indobert-large-p1"]
OUTPUT_DIR = "/kaggle/working/indobert-ner" #@param ["spanberta-ner", "bert-base-ml-ner", "indobert-ner", "drive/MyDrive/Shopee"]
BATCH_SIZE = 16 #@param {type: "integer"}
NUM_EPOCHS = 3 #@param {type: "integer"}
SAVE_STEPS = 2000 #@param {type: "integer"}
LOGGING_STEPS = 1000 #@param {type: "integer"}
SEED = 42 #@param {type: "integer"}

# One run_ner.py invocation with --do_train, --do_eval and --do_predict.
# Data and labels.txt are read from the current directory; $PATH refers to the
# variable assigned in the preprocessing cell, so that cell must run first.
# --overwrite_output_dir is passed, so an existing OUTPUT_DIR is reused.
!python3 $PATH/run_ner.py \
  --data_dir ./ \
  --model_type bert \
  --labels ./labels.txt \
  --model_name_or_path $MODEL \
  --output_dir $OUTPUT_DIR \
  --max_seq_length  $MAX_LENGTH \
  --num_train_epochs $NUM_EPOCHS \
  --per_gpu_train_batch_size $BATCH_SIZE \
  --save_steps $SAVE_STEPS \
  --logging_steps $LOGGING_STEPS \
  --seed $SEED \
  --do_train \
  --do_eval \
  --do_predict \
  --overwrite_output_dir


# Process output to submission format

In [None]:
# Preview the prediction file once (the same read-and-print statement was
# repeated three times).
# NOTE(review): read_table() is called without header=None, so the first line
# of the file is presumably shown as the column header — confirm.
test_predictions = pd.read_table('indobert-ner/test_predictions.txt')
print(test_predictions)

# Cache of prediction-file contents: path -> ((mtime_ns, size), lines).
_PREDICTION_LINES_CACHE = {}


def _read_prediction_lines(file):
    '''Return the lines of ``file``, re-reading only when it changed on disk.'''
    import os  # local import keeps this block self-contained

    stat = os.stat(file)
    signature = (stat.st_mtime_ns, stat.st_size)
    cached = _PREDICTION_LINES_CACHE.get(file)
    if cached is None or cached[0] != signature:
        # `with` closes the handle; the original left it open on every call.
        with open(file) as f:
            cached = (signature, f.readlines())
        _PREDICTION_LINES_CACHE[file] = cached
    return cached[1]


def extract_word(file, row_start, row_end):
    '''
    Format model prediction output in submission format.

    Scans the lines at positions ``row_start`` .. ``row_end`` (both inclusive,
    used directly as 0-based indexes into the file's lines). A line is read as
    ``<word> <tag> ...``; words tagged ``POI`` and ``ST`` are collected in file
    order. Blank lines and lines with fewer than two whitespace-separated
    tokens are skipped instead of raising.

    NOTE(review): the caller passes the 'start'/'end' counters produced by
    preprocess_test_data, where 'start' begins at 1, while the indexing here is
    0-based — confirm the intended alignment with the prediction file.

    Args:
        file: Path of the prediction text file.
        row_start: Index of the first line to scan.
        row_end: Index of the last line to scan; a value smaller than
            ``row_start`` scans nothing.

    Returns:
        ``'<POI words joined by spaces>/<ST words joined by spaces>'``; either
        side is empty when no word carries that tag.

    Raises:
        IndexError: if a scanned index lies beyond the last line of the file.
    '''
    # The caller invokes this once per test row; caching avoids re-reading the
    # whole prediction file each time.
    lines = _read_prediction_lines(file)

    poi_words = []
    st_words = []
    for i in range(row_start, row_end + 1):
        tokens = lines[i].split()
        if len(tokens) < 2:
            # Blank or malformed line: same effect as an 'O' tag.
            continue
        word, tag = tokens[0], tokens[1]
        if tag == 'POI':
            poi_words.append(word)
        elif tag == 'ST':
            st_words.append(word)
    return ' '.join(poi_words) + '/' + ' '.join(st_words)

# Fill the submission column row by row from the prediction file, saving a
# checkpoint of submit.csv at a fixed interval and once more when finished.
CHECKPOINT_EVERY = 1000  # rows processed between intermediate saves

test_processed_df['POI/street']=''
printcounter = 0
for i in tqdm(range(0, len(test_processed_df))):
    row_start = test_processed_df.loc[i, 'start']
    row_end = test_processed_df.loc[i, 'end']
    test_processed_df.loc[i, 'POI/street'] = extract_word('indobert-ner/test_predictions.txt', row_start, row_end)
    # add checkpoints
    # The counter must advance on every row; when the increment sat inside the
    # `if` body it never ran, so no checkpoint was ever written.
    printcounter += 1
    if printcounter == CHECKPOINT_EVERY:
        test_processed_df[['id','POI/street']].to_csv('submit.csv', index=False)
        printcounter = 0
# Final save so submit.csv always holds every row, including those processed
# after the last checkpoint.
test_processed_df[['id','POI/street']].to_csv('submit.csv', index=False)
test_processed_df.head()