In [1]:
import os

# Local directory that collects the exported tokenizer and model files.
out = 'xlnet-base-bahasa-cased'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(name=out, exist_ok=True)

In [2]:
from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline

In [3]:
# Build an XLNet tokenizer from the local vocabulary model file without
# lower-casing, then write its files into the export directory.
# The cell's last expression is the tuple of file paths that were written.
tokenizer = XLNetTokenizer(
    'sp10m.cased.v9.model',
    do_lower_case=False,
)
tokenizer.save_pretrained('xlnet-base-bahasa-cased')

('xlnet-base-bahasa-cased/spiece.model',
 'xlnet-base-bahasa-cased/special_tokens_map.json',
 'xlnet-base-bahasa-cased/added_tokens.json')

In [4]:
# Reload the tokenizer from the files that were just written to disk,
# again with lower-casing disabled.
tokenizer = XLNetTokenizer.from_pretrained(
    './xlnet-base-bahasa-cased',
    do_lower_case=False,
)

In [6]:
# Shell command that converts the TensorFlow checkpoint (model.ckpt-300000,
# with its config.json) into PyTorch weights under xlnet-base-bahasa-cased.
# NOTE(review): left commented out; presumably it was run once by hand before
# the cells below, which read files from that directory — TODO confirm.
# !transformers-cli convert --model_type xlnet \
#   --tf_checkpoint output-model/model.ckpt-300000 \
#   --config output-model/config.json \
#   --pytorch_dump_output xlnet-base-bahasa-cased

In [7]:
directory = 'xlnet-base-bahasa-cased'

# XLNetConfig's first positional parameter is `vocab_size`, not a file path.
# Passing f'{directory}/config.json' positionally therefore never read that
# file: it only stored the path string as the vocabulary size until a later
# attribute assignment replaced it. Build the config from explicit keyword
# arguments instead, so every field holds a valid value from construction
# onward. The resulting hyperparameters are exactly the ones the previous
# attribute-by-attribute overrides produced.
# (To load a saved config file, use XLNetConfig.from_json_file(path) instead.)
config = XLNetConfig(
    vocab_size=32000,
    d_model=768,
    d_inner=3072,
    n_head=12,
    n_layer=12,
)

In [8]:
# Display the assembled config so the overridden hyperparameters
# (vocab_size, d_inner, d_model, n_head, n_layer) can be checked by eye.
config

XLNetConfig {
  "architectures": null,
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": null,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "do_sample": false,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_ids": null,
  "ff_activation": "gelu",
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "num_beams": 1,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": null,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type

In [9]:
# Load the local PyTorch weight file into a language-modelling-head model,
# using the config object built above rather than one read from disk.
model = AutoModelWithLMHead.from_pretrained(
    './xlnet-base-bahasa-cased/pytorch_model.bin',
    config=config,
)

In [10]:
# Wrap the loaded model and tokenizer in a fill-mask pipeline for a quick
# sanity check of the converted weights.
fill_mask = pipeline(
    'fill-mask',
    model=model,
    tokenizer=tokenizer,
)

In [11]:
# Ask the pipeline for candidate fillers of the masked position; the cell
# output is the list of scored predictions.
masked_sentence = 'makan ayam dengan <mask>'
fill_mask(masked_sentence)

[{'sequence': 'makan ayam dengan sejenis<sep><cls>',
  'score': 0.08394942432641983,
  'token': 3038},
 {'sequence': 'makan ayam dengan dri<sep><cls>',
  'score': 0.04811568558216095,
  'token': 2777},
 {'sequence': 'makan ayam dengan kuy<sep><cls>',
  'score': 0.023890310898423195,
  'token': 7877},
 {'sequence': 'makan ayam dengan semestinya<sep><cls>',
  'score': 0.023234495893120766,
  'token': 6488},
 {'sequence': 'makan ayam dengan basically<sep><cls>',
  'score': 0.022633543238043785,
  'token': 22441}]

In [12]:
# Write the model weights and config into the export directory, next to
# the tokenizer files saved earlier.
model.save_pretrained(
    'xlnet-base-bahasa-cased'
)

In [13]:
# Shell command that uploads the export directory via transformers-cli.
# NOTE(review): left commented out; presumably run once by hand, since the
# following cells load 'huseinzol05/xlnet-base-bahasa-cased' by name — TODO confirm.
# !transformers-cli upload ./xlnet-base-bahasa-cased

In [14]:
# Load the published checkpoint by its hub identifier as a bare XLNet
# encoder (no language-modelling head).
model = XLNetModel.from_pretrained(
    'huseinzol05/xlnet-base-bahasa-cased'
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467039973.0, style=ProgressStyle(descri…




In [15]:
# Load the matching tokenizer from the same hub identifier, keeping case.
tokenizer = XLNetTokenizer.from_pretrained(
    'huseinzol05/xlnet-base-bahasa-cased',
    do_lower_case=False,
)

In [16]:
import torch

In [17]:
# Encode one sentence (special tokens included) and wrap the id list in an
# outer list so the resulting tensor has a leading batch dimension of 1.
token_ids = tokenizer.encode("husein tk suka mkan ayam", add_special_tokens=True)
input_ids = torch.tensor([token_ids])

In [18]:
# Run the encoder without gradient tracking and keep the first element of
# its output tuple, which the notebook names the last hidden states.
with torch.no_grad():
    model_outputs = model(input_ids)
last_hidden_states = model_outputs[0]

last_hidden_states


tensor([[[-1.2767,  1.1698,  0.1059,  ..., -3.4251, -1.1081, -1.8970],
         [ 4.7746,  1.6299, -2.1562,  ..., -1.9677, -0.8094, -0.2375],
         [ 4.6481, -1.1669, -3.3487,  ..., -1.6533, -2.2662,  2.5585],
         ...,
         [-1.8671,  2.4500, -1.3904,  ..., -1.0455, -0.4507, -1.0828],
         [ 3.3637,  1.3574, -2.3841,  ..., -1.8289,  7.4378, -1.6053],
         [ 2.4336,  2.5622, -4.5100,  ..., -1.1985, -2.3939,  2.5001]]])

In [19]:
# Reload the published checkpoint with its language-modelling head and
# repeat the fill-mask query through a fresh pipeline; the cell output is
# the list of scored predictions.
model = AutoModelWithLMHead.from_pretrained(
    'huseinzol05/xlnet-base-bahasa-cased'
)
fill_mask = pipeline(
    'fill-mask',
    model=model,
    tokenizer=tokenizer,
)
fill_mask('makan ayam dengan <mask>')

[{'sequence': 'makan ayam dengan sejenis<sep><cls>',
  'score': 0.08394942432641983,
  'token': 3038},
 {'sequence': 'makan ayam dengan dri<sep><cls>',
  'score': 0.04811568558216095,
  'token': 2777},
 {'sequence': 'makan ayam dengan kuy<sep><cls>',
  'score': 0.023890310898423195,
  'token': 7877},
 {'sequence': 'makan ayam dengan semestinya<sep><cls>',
  'score': 0.023234495893120766,
  'token': 6488},
 {'sequence': 'makan ayam dengan basically<sep><cls>',
  'score': 0.022633543238043785,
  'token': 22441}]