In [80]:
import pandas as pd
dataframe2 = pd.read_csv("en_nta.txt", sep='\t')

In [81]:
dataframe2.columns = ["english_sentence"]

In [82]:
dataframe1 = pd.read_csv("id_nta.txt", sep='\t')

In [83]:
dataframe1.columns = ["Indonesian_sentence"]

In [88]:
dataframe1.head(1)

Unnamed: 0,Indonesian_sentence
0,a. Pendirian Bank dan informasi umum


In [89]:
df = pd.concat([dataframe1, dataframe2], axis=1, join='inner')

In [90]:
df.head(1)

Unnamed: 0,Indonesian_sentence,english_sentence
0,a. Pendirian Bank dan informasi umum,a. Establishment of the Bank and general information


In [91]:
import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)


  from ipykernel import kernelapp as app


In [92]:
df.head(1)


Unnamed: 0,Indonesian_sentence,english_sentence
0,a. Pendirian Bank dan informasi umum,a. Establishment of the Bank and general information


In [93]:
pd.isnull(df).sum()


Indonesian_sentence    0
english_sentence       0
dtype: int64

In [94]:
df=df[~pd.isnull(df['english_sentence'])]

In [95]:
df.drop_duplicates(inplace=True)

In [96]:
df=df.sample(n=2500,random_state=42)
df.shape


(2500, 2)

In [97]:
df['Indonesian_sentence']=df['Indonesian_sentence'].apply(lambda x: x.lower())
df['english_sentence']=df['english_sentence'].apply(lambda x: x.lower())

In [98]:
df['Indonesian_sentence']=df['Indonesian_sentence'].apply(lambda x: re.sub("'", '', x))
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub("'", '', x))

In [99]:
exclude = set(string.punctuation) 
df['Indonesian_sentence']=df['Indonesian_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [101]:
remove_digits = str.maketrans('', '', digits)
df['Indonesian_sentence']=df['Indonesian_sentence'].apply(lambda x: x.translate(remove_digits))
df['english_sentence']=df['english_sentence'].apply(lambda x: x.translate(remove_digits))
df['english_sentence'] = df['english_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))
df['Indonesian_sentence']=df['Indonesian_sentence'].apply(lambda x: x.strip())
df['english_sentence']=df['english_sentence'].apply(lambda x: x.strip())
df['Indonesian_sentence']=df['Indonesian_sentence'].apply(lambda x: re.sub(" +", " ", x))
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [102]:
df['english_sentence'] = df['english_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [103]:
df.head(1)


Unnamed: 0,Indonesian_sentence,english_sentence
3089,a aset pajak tangguhan lanjutan,START_ a deferred tax assets continued _END


In [104]:
all_eng_words=set()
for eng in df['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_ind_words=set()
for ind in df['Indonesian_sentence']:
    for word in ind.split():
        if word not in all_ind_words:
            all_ind_words.add(word)


In [105]:
len(all_eng_words)

4612

In [106]:
len(all_ind_words)


4000

In [107]:
df['length_ind_sentence']=df['Indonesian_sentence'].apply(lambda x:len(x.split(" ")))
df['length_eng_sentence']=df['english_sentence'].apply(lambda x:len(x.split(" ")))

In [108]:
df.head(1)


Unnamed: 0,Indonesian_sentence,english_sentence,length_ind_sentence,length_eng_sentence
3089,a aset pajak tangguhan lanjutan,START_ a deferred tax assets continued _END,5,7


In [109]:
df[df['length_ind_sentence']>30].shape

(754, 4)

In [110]:
df=df[df['length_eng_sentence']<=20]
df=df[df['length_ind_sentence']<=20]


In [111]:
df.shape


(1212, 4)

In [112]:
print("maximum length of Indonesian Sentence ",max(df['length_ind_sentence']))
print("maximum length of English Sentence ",max(df['length_eng_sentence']))


maximum length of Indonesian Sentence  20
maximum length of English Sentence  20


In [113]:
max_length_src=max(df['length_ind_sentence'])
max_length_tar=max(df['length_eng_sentence'])


In [114]:
input_words = sorted(list(all_ind_words))
target_words = sorted(list(all_eng_words))
num_encoder_tokens = len(all_ind_words)
num_decoder_tokens = len(all_eng_words)
num_encoder_tokens, num_decoder_tokens

(4000, 4612)

In [115]:
num_decoder_tokens += 1 

In [116]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [117]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [118]:
df = shuffle(df)
df.head(2)

Unnamed: 0,Indonesian_sentence,english_sentence,length_ind_sentence,length_eng_sentence
1759,giro pada bank indonesia lanjutan,START_ current accounts with bank indonesia continued _END,5,8
1042,iv suratsurat berharga,START_ iv marketable securities _END,3,5


In [119]:
X, y = df['Indonesian_sentence'], df['english_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((969,), (243,))

In [120]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [121]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word]
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word]
                    if t>0:
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)


In [122]:
latent_dim=300

In [123]:
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

In [124]:
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [125]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [126]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 16
epochs = 50

In [127]:
import numpy as np

In [128]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

  """


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fd1188b9b10>

In [129]:
model.save_weights('nmt_weights.h5')

In [130]:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2= dec_emb_layer(decoder_inputs) 
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) 
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [131]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_token_index['START_']
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    return decoded_sentence

In [132]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [136]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Indonesian sentence:', X_train[k:k+1].values[0])
print('Actual English Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted English Translation:', decoded_sentence[:-4])


Input Indonesian sentence: susunan komite remunerasi dan nominasi pada tanggal desember dan adalah sebagai berikut
Actual Indonesian Translation:  the composition of the remuneration and nomination committee as of of december and are as follows 
Predicted English Translation:  the composition of the remuneration and nomina


In [138]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Indonesian sentence:', X_train[k:k+1].values[0])
print('Actual English Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted English Translation:', decoded_sentence[:-4])

Input Indonesian sentence: bersih
Actual English Translation:  fee and commission incomenet 
Predicted English Translation:  fee and commission incomenet 


In [139]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Indonesian sentence:', X_train[k:k+1].values[0])
print('Actual English Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted English Translation:', decoded_sentence[:-4])


Input Indonesian sentence: b dasar penyusunan laporan keuangan konsolidasian lanjutan
Actual English Translation:  b basis for preparation of the consolidated financial statements continued 
Predicted English Translation:  b basis for preparation of the consolidated finan


In [140]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input Indonesian sentence:', X_train[k:k+1].values[0])
print('Actual English Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted English Translation:', decoded_sentence[:-4])


Input Indonesian sentence: b pertimbangan akuntansi yang penting dalam menerapkan kebijakan akuntansi perseroan
Actual English Translation:  b critical accounting judgments in applying the companys accounting policies 
Predicted English Translation:  b critical accounting judgments in applying the comp


In [141]:
#MarianMT Model

In [41]:
!pip install transformers



In [42]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-mfqi1jg5
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-mfqi1jg5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [43]:
import nltk
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from transformers import MarianMTModel, MarianTokenizer

In [44]:
raw = pd.read_csv("id_nta.txt", sep='\t')

In [45]:
raw

Unnamed: 0,1. UMUM
0,a. Pendirian Bank dan informasi umum
1,PT Bank Pembangunan Daerah Jawa Timur Tbk ('Ba...
2,Peraturan Pemerintah Daerah tersebut disahkan ...
3,Perubahan status bentuk hukum tersebut sesuai ...
4,1. UMUM (lanjutan)
...,...
4259,1.a. Pendirian dan Informasi Umum PT Bumi Reso...
4260,Perubahan terakhir Anggaran Dasar Perusahaan a...
4261,Perubahan terakhir Anggaran Dasar terkait deng...
4262,"Sesuai dengan Anggaran Dasar Perusahaan, ruang..."


In [46]:
raw.columns = ["Text"]

In [47]:
raw

Unnamed: 0,Text
0,a. Pendirian Bank dan informasi umum
1,PT Bank Pembangunan Daerah Jawa Timur Tbk ('Ba...
2,Peraturan Pemerintah Daerah tersebut disahkan ...
3,Perubahan status bentuk hukum tersebut sesuai ...
4,1. UMUM (lanjutan)
...,...
4259,1.a. Pendirian dan Informasi Umum PT Bumi Reso...
4260,Perubahan terakhir Anggaran Dasar Perusahaan a...
4261,Perubahan terakhir Anggaran Dasar terkait deng...
4262,"Sesuai dengan Anggaran Dasar Perusahaan, ruang..."


In [48]:
len(raw)

4264

In [49]:
raw["Text"][0]

'a. Pendirian Bank dan informasi umum'

In [60]:
t = []
for i in range(len(raw)):
  if len(raw["Text"][i]) <= 256:
    t.append(raw["Text"][i])

In [61]:
df = pd.DataFrame(t, columns =['Text'])


In [62]:
df

Unnamed: 0,Text
0,a. Pendirian Bank dan informasi umum
1,1. UMUM (lanjutan)
2,a. Pendirian Bank dan informasi umum (lanjutan)
3,Entitas induk terakhir dari Bank adalah Pemeri...
4,Bank memperoleh ijin untuk beroperasi sebagai ...
...,...
3191,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...
3192,Penyusunan dan penyajian wajar laporan keuanga...
3193,1. UMUM
3194,Perubahan terakhir Anggaran Dasar terkait deng...


In [63]:
type(df)

pandas.core.frame.DataFrame

In [64]:
df["Language"] = 'Indonesian'


In [65]:
df

Unnamed: 0,Text,Language
0,a. Pendirian Bank dan informasi umum,Indonesian
1,1. UMUM (lanjutan),Indonesian
2,a. Pendirian Bank dan informasi umum (lanjutan),Indonesian
3,Entitas induk terakhir dari Bank adalah Pemeri...,Indonesian
4,Bank memperoleh ijin untuk beroperasi sebagai ...,Indonesian
...,...,...
3191,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...,Indonesian
3192,Penyusunan dan penyajian wajar laporan keuanga...,Indonesian
3193,1. UMUM,Indonesian
3194,Perubahan terakhir Anggaran Dasar terkait deng...,Indonesian


In [66]:
def clean_text(text):
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\n\n", " ", text)
    text = text.strip(" ")
    text = re.sub(' +',' ', text).strip()
    return text

In [67]:
def translator(text):
  input_ids = tokenizer.encode(text, return_tensors="pt", padding=True)
  outputs = model.generate(input_ids)
  decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return decoded_text

In [68]:
!pip install sentencepiece



In [69]:
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-id-en")
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")

In [70]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [71]:
df["Clean_Text"] = df['Text'].map(lambda text: clean_text(text))


In [72]:
df

Unnamed: 0,Text,Language,Clean_Text
0,a. Pendirian Bank dan informasi umum,Indonesian,a. Pendirian Bank dan informasi umum
1,1. UMUM (lanjutan),Indonesian,1. UMUM (lanjutan)
2,a. Pendirian Bank dan informasi umum (lanjutan),Indonesian,a. Pendirian Bank dan informasi umum (lanjutan)
3,Entitas induk terakhir dari Bank adalah Pemeri...,Indonesian,Entitas induk terakhir dari Bank adalah Pemeri...
4,Bank memperoleh ijin untuk beroperasi sebagai ...,Indonesian,Bank memperoleh ijin untuk beroperasi sebagai ...
...,...,...,...
3191,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...,Indonesian,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...
3192,Penyusunan dan penyajian wajar laporan keuanga...,Indonesian,Penyusunan dan penyajian wajar laporan keuanga...
3193,1. UMUM,Indonesian,1. UMUM
3194,Perubahan terakhir Anggaran Dasar terkait deng...,Indonesian,Perubahan terakhir Anggaran Dasar terkait deng...


In [73]:
df

Unnamed: 0,Text,Language,Clean_Text
0,a. Pendirian Bank dan informasi umum,Indonesian,a. Pendirian Bank dan informasi umum
1,1. UMUM (lanjutan),Indonesian,1. UMUM (lanjutan)
2,a. Pendirian Bank dan informasi umum (lanjutan),Indonesian,a. Pendirian Bank dan informasi umum (lanjutan)
3,Entitas induk terakhir dari Bank adalah Pemeri...,Indonesian,Entitas induk terakhir dari Bank adalah Pemeri...
4,Bank memperoleh ijin untuk beroperasi sebagai ...,Indonesian,Bank memperoleh ijin untuk beroperasi sebagai ...
...,...,...,...
3191,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...,Indonesian,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...
3192,Penyusunan dan penyajian wajar laporan keuanga...,Indonesian,Penyusunan dan penyajian wajar laporan keuanga...
3193,1. UMUM,Indonesian,1. UMUM
3194,Perubahan terakhir Anggaran Dasar terkait deng...,Indonesian,Perubahan terakhir Anggaran Dasar terkait deng...


In [74]:
df['Machine_Translation'] = df["Clean_Text"].map(lambda x: translator(x)).copy()

In [75]:
cols = ["Text", "Machine_Translation"]

In [76]:
df

Unnamed: 0,Text,Language,Clean_Text,Machine_Translation
0,a. Pendirian Bank dan informasi umum,Indonesian,a. Pendirian Bank dan informasi umum,a. Bank establishment and general information
1,1. UMUM (lanjutan),Indonesian,1. UMUM (lanjutan),1. COMMON (Continues)
2,a. Pendirian Bank dan informasi umum (lanjutan),Indonesian,a. Pendirian Bank dan informasi umum (lanjutan),a. Bank establishment and general information ...
3,Entitas induk terakhir dari Bank adalah Pemeri...,Indonesian,Entitas induk terakhir dari Bank adalah Pemeri...,The last parent entity of the Bank is the East...
4,Bank memperoleh ijin untuk beroperasi sebagai ...,Indonesian,Bank memperoleh ijin untuk beroperasi sebagai ...,The bank obtained permission to operate as a d...
...,...,...,...,...
3191,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...,Indonesian,38. TANGGUNG JAWAB MANAJEMEN DAN PERSETUJUAN A...,38. &amp;&gt;&lt;/i&gt; &lt;i&gt;&lt;/i&gt; &l...
3192,Penyusunan dan penyajian wajar laporan keuanga...,Indonesian,Penyusunan dan penyajian wajar laporan keuanga...,The natural drafting and presentation of the f...
3193,1. UMUM,Indonesian,1. UMUM,1. COMMON
3194,Perubahan terakhir Anggaran Dasar terkait deng...,Indonesian,Perubahan terakhir Anggaran Dasar terkait deng...,The last change in basic budgets related to th...
