# **Cross-lingual Embedding**

In this task, cross-lingual word embeddings are generated for Bahasa Jawa and Bahasa Indonesia. The cross-lingual word embeddings are created from monolingual corpora. The corpora used in this task are collections of Wikipedia articles. The output of this task will be used in the bilingual dictionary generation task.

## Data Preparation

In [None]:
from __future__ import print_function

import logging
import os.path
import sys

from gensim.corpora import WikiCorpus

# Name the logger after the running program (basename of argv[0]).
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

# Timestamped records on the root logger, reporting INFO and above.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)

# Record the full command line that started this process.
logger.info("running %s" % ' '.join(sys.argv))

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English
INFO:ipykernel_launcher.py:running /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py -f /root/.local/share/jupyter/runtime/kernel-759e05bd-5736-405a-a60b-cf70ee6a450d.json


### Merge Corpus

In [None]:
def load_merge(filename1, filename2):
    """Read two UTF-8 text files and return their lines as one list.

    Args:
        filename1: Path of the first file; its lines come first.
        filename2: Path of the second file; its lines are appended.

    Returns:
        list[str]: Lines of ``filename1`` followed by the lines of
        ``filename2``, exactly as returned by ``readlines()`` (line endings
        are kept).

    Raises:
        OSError: If either file cannot be opened for reading.
    """
    doc = []
    for filename in (filename1, filename2):
        # Read-only mode: nothing is written, so write permission must not be
        # required. `with` closes the handle even if reading raises.
        with open(filename, 'r', encoding='utf-8') as corpus_file:
            doc.extend(corpus_file.readlines())

    return doc

In [None]:
# Load both monolingual corpora and concatenate their lines into one list.
# NOTE(review): the file names look like the Javanese (jv) and Sundanese (su)
# Wikipedia dumps, while the introduction mentions Bahasa Indonesia — confirm
# which corpus is intended.
merged_corpus = load_merge('jvwiki.txt', 'suwiki.txt')

# Number of lines in the merged corpus.
print(len(merged_corpus))

53053


### Shuffle Corpus

In [None]:
import nltk
# Fetch the 'punkt' data package — presumably required by `word_tokenize`,
# which is used for tokenization below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize
import numpy as np

In [None]:
def to_tokens(data):
    """Tokenize every sentence in `data` and return one flat token list.

    Args:
        data: Iterable of sentence strings.

    Returns:
        list: Tokens of all sentences, in input order, as produced by
        `word_tokenize`.
    """
    tokens = []
    for sentence in data:
        tokens.extend(word_tokenize(sentence))

    return tokens

def shuffle(data, seed=None):
    """Tokenize `data` and return all of its tokens in random order.

    Args:
        data: Iterable of sentence strings; tokenized with `to_tokens`.
        seed: Optional integer seed. When given, the shuffle is reproducible;
            when None (default) NumPy's global random state is used.

    Returns:
        list[str]: Every token of `data`, randomly permuted.
    """
    tokens = to_tokens(data)

    # Permute positions rather than the tokens themselves: turning a large
    # list of strings into a NumPy array pads every entry to the width of the
    # longest token, which wastes a lot of memory on a full corpus.
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(len(tokens))

    return [tokens[position] for position in order]

In [None]:
# Tokenize the merged corpus and randomly permute all of its tokens.
# Requires `merged_corpus` and `shuffle` from the cells above.
# NOTE(review): the saved output of this cell is a NameError — presumably it
# was run before its dependencies were defined; confirm with Restart & Run All.
shuffled_corpus = shuffle(merged_corpus)

NameError: ignored

In [None]:
def divide_chunks(l, n):
    """Yield consecutive slices of `l`, each holding at most `n` items.

    Args:
        l: Sliceable sequence (list, string, ...).
        n: Chunk size, used as the step between slice starts.

    Yields:
        Slices ``l[start:start + n]``; the last one may be shorter than `n`.
    """
    chunk_starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in chunk_starts)

# Split the shuffled token list into chunks of 240 tokens each (the final
# chunk may be shorter).
# NOTE(review): this rebinds `shuffled_corpus` from a flat token list to a list
# of chunks, so re-running this cell on its own would chunk the chunks again.
shuffled_corpus = list(divide_chunks(shuffled_corpus, 240))
# Number of chunks produced.
print(len(shuffled_corpus))

49130


In [None]:
def generate_texts(corpus, output):
  """Write each token chunk as one space-separated line of a UTF-8 file.

  Args:
    corpus: Iterable of token sequences; each becomes one output line.
    output: Path of the file to create (an existing file is overwritten).

  Raises:
    OSError: If `output` cannot be opened for writing.
  """
  # `with` flushes and closes the file even if joining or writing a chunk
  # raises part-way through.
  with open(output, 'w', encoding='utf-8') as out:
    for text in corpus:
      out.write(" ".join(text) + "\n")
  print("all saved to new cospus file")

In [None]:
# Persist the shuffled chunks, one chunk per line; this file is read back in
# the "Generate Input Text" section.
generate_texts(shuffled_corpus, "merged_corpus.txt")

all saved to new cospus file


## BERT

### Loading Pre-trained BERT

In [None]:
# Install the `transformers` package, which provides the BertTokenizer and
# BertModel classes imported below.
# NOTE(review): no version is pinned, so results may depend on the release
# that happens to be installed.
!pip install transformers



In [None]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import matplotlib.pyplot as plt
% matplotlib inline

### bert-multilingual-uncased

In [None]:
# Load pre-trained model and tokenizer (vocabulary)
# Both come from the same 'bert-base-multilingual-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# output_hidden_states=True makes the model return the hidden states of every
# layer, which the embedding extraction below depends on.
model = BertModel.from_pretrained("bert-base-multilingual-uncased", output_hidden_states = True,)

In [None]:
# Switch the model to evaluation mode (feed-forward only; dropout disabled).
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

### Generate Input Text

In [None]:
def create_input_text(corpus):
  """Join sentences into one BERT-style string with [CLS]/[SEP] markers.

  The result has the form ``"[CLS] s0 [SEP] s1 [SEP] ... sN [SEP]"``.

  Args:
    corpus: Iterable of sentence strings (list, tuple, pandas Series, ...).

  Returns:
    str: The marked-up text, or ``""`` when `corpus` is empty.
  """
  # Iterate over the values instead of indexing by position, so containers
  # whose labels are not 0..n-1 are handled as well.
  sentences = list(corpus)
  if not sentences:
    return ""

  # A single join avoids rebuilding the growing string once per sentence.
  return "[CLS] " + " [SEP] ".join(sentences) + " [SEP]"

In [None]:
import pandas as pd

# Read the corpus one line per row. The file is plain text, not CSV: routing it
# through read_csv(sep="\n") applies CSV quote handling to the text and is
# rejected outright by newer pandas releases.
with open('merged_corpus.txt', 'r', encoding='utf-8') as corpus_file:
  corpus_lines = [line.rstrip('\n') for line in corpus_file]

# Keep the layout the following cells rely on: a single column labelled 0,
# with blank lines dropped.
jv_texts = pd.DataFrame({0: [line for line in corpus_lines if line.strip()]})

In [None]:
# Preview the first rows; each row holds one chunk of shuffled tokens.
jv_texts.head()

Unnamed: 0,0
0,hiji melayu jenis karo basa Kategori Sawetara ...
1,wite atawa kaluar Indomie di héjo loba the Ieu...
2,wayah ieu Vansanten like tina analogy bisa CEO...
3,Nana basa ku pangaweruh dodolan Artikel perang...
4,kidul the DPR Hida kualitas iku operasi astéro...


In [None]:
# Column 0 is the only column: one text chunk per row.
corpus = jv_texts[0]

In [None]:
# Use the first chunk as the sample text for the rest of the notebook.
tmp = corpus[0]

### Tokenization

In [None]:
# Tokenize our sentence with the BERT tokenizer.
# NOTE(review): `tmp` is tokenized as-is; no [CLS]/[SEP] markers are added and
# `create_input_text` is not applied to it — confirm this is intended.
tokenized_text = tokenizer.tokenize(tmp)

# Map the token strings to their vocabulary indices.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [None]:
# Display the first ten tokens next to their vocabulary indices.
print("token indeces: ")
for tup in zip(tokenized_text[:10], indexed_tokens[:10]):
    # Token left-aligned in 12 columns, index right-aligned with thousands separators.
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

token indeces: 
hiji         26,131
melayu       27,466
jenis        23,430
karo         23,223
basa         15,707
kategori     33,272
saw          16,289
##etar       90,240
##a          10,112
zion         37,074


In [None]:
def create_segment_id(list_tokens):
  """Assign a segment number to every token, advancing after each "[SEP]".

  Args:
    list_tokens: Iterable of token strings.

  Returns:
    list[int]: One id per token. Ids start at 0; a "[SEP]" token keeps the id
    of the segment it closes, and the token after it gets the next id.
  """
  ids = []
  seps_seen = 0

  for tok in list_tokens:
    ids.append(seps_seen)
    seps_seen += 1 if tok == "[SEP]" else 0

  return ids

In [None]:
# One segment id (always 1) per token: the whole text is treated as a single
# segment. `create_segment_id` is not used here.
segments_ids = [1] * len(tokenized_text)

### Extracting Embeddings

In [None]:
# Convert inputs to PyTorch tensors
# Each list is wrapped in an outer list, which adds a leading batch dimension
# of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [None]:
# Sanity check: shape is (1, number of tokens).
tokens_tensor.size()

torch.Size([1, 373])

In [None]:
# Sanity check: must match the shape of `tokens_tensor`.
segments_tensors.size()

torch.Size([1, 373])

In [None]:
def collect_hidden_states(tokens, segments):
  """Run the text through BERT and return the hidden states of all layers.

  Args:
    tokens: Tensor of token ids, passed to the model as its input ids.
    segments: Tensor of segment ids, passed as `token_type_ids`.

  Returns:
    The third element of the model output. With the model loaded using
    `output_hidden_states=True` this is the tuple of per-layer hidden states
    (presumably the embedding output plus one entry per encoder layer —
    confirm against the installed `transformers` version).
  """
  # Inference only: no gradients are needed.
  with torch.no_grad():
    # Pass the segment ids by keyword: the second positional parameter of
    # BertModel.forward is `attention_mask`, so a positional call would use
    # them as an attention mask instead of as segment ids.
    outputs = model(tokens, token_type_ids=segments)
    return outputs[2]

In [None]:
# Forward pass over the sample text; keeps the hidden states of every layer.
hidden_states = collect_hidden_states(tokens_tensor, segments_tensors)

### Create token embedding

In [None]:
# `hidden_states` is a tuple holding one entry per layer output.
print('      Type of hidden_states: ', type(hidden_states))

# Each entry of the tuple is a torch tensor of shape [batch x tokens x features].
print('Tensor shape for each layer: ', hidden_states[0].size())

      Type of hidden_states:  <class 'tuple'>
Tensor shape for each layer:  torch.Size([1, 373, 768])


In [None]:
def create_embeddings(hidden_states):
  """Arrange per-layer hidden states as one [token x layer x feature] tensor.

  Args:
    hidden_states: Sequence of tensors, one per layer, each shaped
      [1 x tokens x features] (a batch of size 1).

  Returns:
    torch.Tensor: Tensor shaped [tokens x layers x features].
  """
  # Stack along a new leading "layer" axis: [layers x 1 x tokens x features].
  stacked = torch.stack(hidden_states, dim=0)
  # Drop the batch axis (dim 1), then move the token axis to the front.
  return stacked.squeeze(dim=1).permute(1, 0, 2)

In [None]:
# Regroup the hidden states so the first axis runs over tokens.
token_embeddings = create_embeddings(hidden_states)

# Expected layout: [tokens x layers x features].
token_embeddings.size()

torch.Size([373, 13, 768])

In [None]:
def create_word_vectors(tokens_embeddings, n_layers=4):
  """Build one vector per token by summing its last `n_layers` layer outputs.

  Args:
    tokens_embeddings: Tensor shaped [tokens x layers x features].
    n_layers: Number of final layers to sum (default 4). Must be >= 1.

  Returns:
    list[torch.Tensor]: One [features]-sized vector per token, in token order.

  Raises:
    ValueError: If `n_layers` is smaller than 1.
  """
  if n_layers < 1:
    raise ValueError("n_layers must be at least 1")

  # Stores one summed vector per token.
  token_vecs_sum = []

  # For each token in the sentence...
  for token in tokens_embeddings:

    # `token` is a [layers x features] tensor.
    # Sum the vectors from the LAST `n_layers` layers. The slice must be
    # `[-n_layers:]`; `[:-n_layers]` would select every layer except those.
    sum_vec = torch.sum(token[-n_layers:], dim=0)

    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

  return token_vecs_sum

In [None]:
# One summed vector per token of the sample text.
token_vecs_sum = create_word_vectors(token_embeddings)

# Number of token vectors x size of each vector.
print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))
# NOTE(review): the next line is a bare tuple expression that is never
# assigned — it looks like leftover scratch content; confirm whether it can be
# removed or should be merged into the `words` list defined later.
'hiji', 'melayu', 'jenis', 'karo', 'basa', 'kategori', 'sawetara', 'sabab', 'karakter', 'service', 'super', 'arca', 'planet', 'lan', 'umum', 'disebut', 'presasti', 'kagandheng', 'jagung', 'jerman', 'ieu', 'nasional', 'dinasti', 'kang', 'dene', 'time', 'eksentrisitas', 'web', 'teori', 'lalampahan', 'balak', 'tanpa', 'suku', 'suket', 'dagang', 'laboratorium', 'teu', 'wonten', 'manawa', 'ing', 'masalah', 'aya', 'jupiter', 'panjang', 'gambar', 'dadi', 'sawijining', 'kulina', 'nalika', 'punika', 'cacah', 'bantu', 'amarga', 'kacatet', 'misuwur', 'wis', 'data', 'duwe', 'taun', 'jepang', 'jumlah', 'iku', 'jeung', 'wanara', 'saben'

Shape is: 373 x 768


In [None]:
# First token of the sample text; it corresponds to `token_vecs_sum[0]`.
tokenized_text[0]

'hiji'

In [None]:
# Hand-picked whole words from the sample text.
# NOTE(review): neither `words` nor `index_1` is used by any later cell visible
# here — confirm whether they are still needed.
words = ['hiji', 'melayu', 'jenis', 'karo', 'basa', 'kategori', 'sawetara', 'sabab', 'karakter', 'service', 'super', 'arca', 'planet', 'lan', 'umum', 'disebut', 'presasti', 'kagandheng', 'jagung', 'jerman', 'ieu', 'nasional', 'dinasti', 'kang', 'dene', 'time', 'eksentrisitas', 'web', 'teori', 'lalampahan', 'balak', 'tanpa', 'suku', 'suket', 'dagang', 'laboratorium', 'teu', 'wonten', 'manawa', 'ing', 'masalah', 'aya', 'jupiter', 'panjang', 'gambar', 'dadi', 'sawijining', 'kulina', 'nalika', 'punika', 'cacah', 'bantu', 'amarga', 'kacatet', 'misuwur', 'wis', 'data', 'duwe', 'taun', 'jepang', 'jumlah', 'iku']
index_1 = []

### Similarity

In [None]:
from scipy.spatial.distance import cosine

# Cosine similarity (1 - cosine distance) between the summed vectors of the
# tokens at positions 2 and 3 of `tokenized_text`.
diff = 1 - cosine(token_vecs_sum[2], token_vecs_sum[3])

# Label the result with the tokens that were actually compared, so the message
# cannot drift away from the indices used above.
print('Vector similarity for -%s- and -%s-:  %.2f' % (tokenized_text[2], tokenized_text[3], diff))

Vector similarity for -pergi- and -lunga-:  0.42


In [None]:
def most_similar(idx, token_vecs_sum):
  """Find the vector most similar (by cosine similarity) to the one at `idx`.

  Args:
    idx: Position of the query vector in `token_vecs_sum`.
    token_vecs_sum: Sequence of 1-D vectors.

  Returns:
    tuple: ``(index, similarity)`` of the best match among all positions other
    than `idx`. When several candidates tie, the first one wins. If there is
    no comparable candidate (no other vector, or every similarity is NaN),
    ``(0, 0)`` is returned.
  """
  # Start below any reachable similarity so that zero or negative similarities
  # still select a real neighbour instead of silently falling back to index 0,
  # which may be the query itself.
  best_score = float("-inf")
  best_idx = None

  for i, candidate in enumerate(token_vecs_sum):
    if i == idx:
      continue  # never compare the query with itself
    score = 1 - cosine(token_vecs_sum[idx], candidate)
    if score > best_score:
      best_score = score
      best_idx = i

  if best_idx is None:
    return (0, 0)

  return (best_idx, best_score)

In [None]:
# For every token position, record the position and the similarity of its
# nearest other token. `sim_idx[k]` / `sim_val[k]` describe token k.
# NOTE: `most_similar` scans all vectors on each call, so this loop performs a
# quadratic number of cosine computations.
sim_idx = []
sim_val = []
for idx in range(0, len(token_vecs_sum)):
  x, y = most_similar(idx, token_vecs_sum)
  sim_idx.append(x)
  sim_val.append(y)

In [None]:
# Write the nearest-neighbour results to an Excel (.xls) workbook using xlwt.
import xlwt 
from xlwt import Workbook 

# Workbook is created 
wb = Workbook() 

# add_sheet is used to create sheet. 
sheet1 = wb.add_sheet('BERT')

# Header row.
# NOTE(review): the 'similar' column receives the *position* of the nearest
# token (values of `sim_idx`), not the token text, and the 'distance' column
# receives `sim_val`, which holds similarities (1 - cosine distance) — confirm
# the intended column contents.
sheet1.write(0, 0, 'word')
sheet1.write(0, 1, 'similar')
sheet1.write(0, 2, 'distance')

# Column 0: every token of the sample text, starting at row 1.
for j in range(0,len(tokenized_text)):
  sheet1.write(j+1, 0, tokenized_text[j])

# Column 1: position of the most similar token; `j` tracks the row.
i = 1
j = 1

for idx in sim_idx:
  sheet1.write(j, i, idx)
  j += 1

# Column 2: similarity value of that nearest token.
i = 2
j = 1

for val in sim_val:
  sheet1.write(j, i, val)
  j += 1

# Save the workbook into the current working directory.
wb.save('Laporan_KP.xls')