In [None]:
import argparse
import time
import numpy as np
import pandas as pd
import os
import gc
import math
import multiprocessing
import io
import logging 
import itertools
import shutil
import pysnooper
import warnings
import glob
import pendulum
import json
import sys
import subprocess
import matplotlib
import matplotlib.pyplot as plt
import scikitplot as skplt
import seaborn as sns
from icecream import ic
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm_notebook as tqdm
from pathlib import Path
from IPython.core.interactiveshell import InteractiveShell

# --- Notebook-wide runtime configuration ------------------------------------

# Seed value kept at module level for later cells to use.
seed = 9527

# Hide every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Print NumPy floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)

# Select matplotlib's 'Agg' backend.
matplotlib.use('Agg')

# Ask IPython to display the value of every expression statement in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Root logging is configured at WARNING level; `logger` is the handle for
# messages emitted from this notebook.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


In [None]:
# Command-line style options for the pre-training data generation step.
# The options are parsed from an explicit list (not sys.argv) so the cell
# works inside a notebook kernel; the result is exposed as `opt`.


def _str2bool(value):
    """Convert a command-line token into a real ``bool``.

    argparse stores option values as strings by default, so without a
    converter ``--flag False`` yields the truthy string ``"False"``.

    Args:
        value: A ``bool`` (returned unchanged) or a string such as
            ``"true"``, ``"False"``, ``"yes"``, ``"0"`` (case-insensitive).

    Returns:
        The corresponding ``bool``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean
            spelling; argparse reports this as a normal usage error.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser()

# File locations.
parser.add_argument('--input_file', default=None, help="Input raw text file (or comma-separated list of files).")
parser.add_argument('--output_file', default=None, help="Output TF example file (or comma-separated list of files).")
parser.add_argument('--vocab_file', default=None, help="The vocabulary file that the ALBERT model was trained on.")
parser.add_argument('--spm_model_file', default=None, help="The model file for sentence piece tokenization.")
parser.add_argument('--input_file_mode', default="r",  help="The data format of the input file.")

# Boolean switches: `type=_str2bool` makes "--flag False" really mean False.
parser.add_argument('--do_lower_case', type=_str2bool, default=True, help="Whether to lower case the input text. Should be True for uncased models and False for cased models.")
parser.add_argument('--do_whole_word_mask', type=_str2bool, default=True, help="Whether to use whole word masking rather than per-WordPiece masking.")
parser.add_argument('--do_permutation', type=_str2bool, default=False, help="Whether to do the permutation training.")
parser.add_argument('--favor_shorter_ngram', type=_str2bool, default=True, help="Whether to set higher probabilities for sampling shorter ngrams.")
parser.add_argument('--random_next_sentence', type=_str2bool, default=False, help="Whether to use the sentence that's right before the current sentence "
                    "as the negative sample for next sentence prection, rather than using "
                    "sentences from other random documents.")

# Numeric settings: explicit types so overridden values are int/float, not str.
parser.add_argument('--max_seq_length', type=int, default=512, help="Maximum sequence length.")
parser.add_argument('--ngram', type=int, default=3, help="Maximum number of ngrams to mask.")
parser.add_argument('--max_predictions_per_seq', type=int, default=20, help="Maximum number of masked LM predictions per sequence.")
parser.add_argument('--random_seed', type=int, default=12345, help="Random seed for data generation.")
parser.add_argument('--dupe_factor', type=int, default=5, help="Number of times to duplicate the input data (with different masks).")
parser.add_argument('--masked_lm_prob', type=float, default=0.15, help="Masked LM probability.")
parser.add_argument('--short_seq_prob', type=float, default=0.1, help="Probability of creating sequences which are shorter than the maximum length.")


# argparse expects a list of strings (as sys.argv would provide), so every
# value is spelled as text and every option name is written out in full
# rather than relying on prefix abbreviation.
opt = parser.parse_args(args=[
    '--input_file', '1995_income',
    '--output_file', 'MLP',
    '--spm_model_file', './wiki-ja_albert.model',
    '--vocab_file', './wiki-ja_albert.vocab',
    '--do_whole_word_mask', 'False',
    '--do_permutation', 'False',
    '--favor_shorter_ngram', 'False',
    '--random_next_sentence', 'False',
])


In [None]:
# --- Experiment configuration: cache folders, seed, and dataset locations ---

# Relative cache folders.
cache_data_folder = './data'
cache_model_folder = './model'

# NOTE: rebinds the module-level `seed` name for the cells that follow.
seed = 202105

# Project root and its top-level sub-trees.
main_path = Path('/home/jupyter/gogolook')
main_model_path = main_path.joinpath('models')
main_record_folders_path = main_path.joinpath('tf_record_folders')
main_data_path = main_path.joinpath('data', 'sms', 'experiment_data')

# general
cached_pretarin_model_folder = main_record_folders_path.joinpath('cache_pretrain_model')
saved_pytorch_model_dir = main_record_folders_path.joinpath('saved_tf_model')

# model
albert_zh_path = main_model_path.joinpath('albert_zh')

# data
regex_file_type = '*.csv'
extand_data_tag = 'bleu_score_9'
# Other values seen for data_tag: basic_data, extand_augment_data
data_tag = 'extand_augment_and_trust_data'
valid_data_tag = 'extand_augment_and_trust_data'
test_data_tag = 'extand_augment_and_trust_data'

# The training folder is `train_<data_tag>` whether or not data_tag is
# "basic_data"; nesting the non-basic case under `extand_data_tag` is
# currently disabled.
experiment_train_data_path = main_data_path.joinpath(f'train_{data_tag}')
experiment_valid_data_path = main_data_path.joinpath(f'valid_{valid_data_tag}')
experiment_test_data_path = main_data_path.joinpath(f'test_{test_data_tag}')

# Aliases for the three dataset splits.
training_data_path, validation_data_path, testing_data_path = (
    experiment_train_data_path,
    experiment_valid_data_path,
    experiment_test_data_path,
)


In [None]:
import torch
from transformers import AutoModel, AutoTokenizer, BertJapaneseTokenizer

# Load the "cl-tohoku/bert-base-japanese" tokenizer with MeCab word
# segmentation; files are cached under `cache_model_folder`.
mecab_tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese",
    word_tokenizer_type="mecab",
    cache_dir=cache_model_folder,
)

## Input Japanese Text
line = "アンパサンド (&、英語名：) とは並立助詞「…と…」を意味する記号である。ラテン語の の合字で、Trebuchet MSフォントでは、と表示され \"et\" の合字であることが容易にわかる。"

# Tokenize the sentence into PyTorch tensors, then decode the first (only)
# row of ids back to text and print it as a round-trip check.
mecab_inputs = mecab_tokenizer(line, return_tensors="pt")
print(
    mecab_tokenizer.decode(
        mecab_inputs['input_ids'][0]
    )
)


