In [1]:
"""
load data
"""
import json, requests
import numpy as np

# Source of the KLUE YNAT (topic classification) training split, fetched as raw JSON.
data_path = 'https://raw.githubusercontent.com/KLUE-benchmark/KLUE/main/klue_benchmark/ynat-v1.1/ynat-v1.1_train.json'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing JSON /
# KeyError further down.
url = requests.get(data_path, timeout=30)
url.raise_for_status()

# `dataset` is a 2-D array whose first row holds the column names and whose
# remaining rows hold one record each (same layout the later cells expect).
header = ['title', 'label', 'predefined_news_category']

# Collect the rows in a plain list and build the array once; growing an array
# with np.concatenate inside the loop copies the whole array on every record.
rows = [[line[column] for column in header] for line in json.loads(url.text)]
dataset = np.array([header] + rows)


In [2]:
"""
tokenizer
"""
import pandas as pd
from collections import Counter

# Row 0 of `dataset` carries the column names; the remaining rows are the records.
dataframe = pd.DataFrame(dataset[1:], columns=dataset[0]) # 9108 samples (author's note)

# Drop rows with missing values. dropna() returns a new frame (it is a no-op
# when nothing is missing), so the result has to be assigned back. The original
# call `dropna_()` is not a pandas method and raised AttributeError as soon as
# a null was actually present.
dataframe = dataframe.dropna()

# tokenizer: split each title on single spaces.
# Despite its name, `tokenizer` is the tokenised corpus: one list of tokens per title.
tokenizer = [title.split(' ') for title in dataframe['title']]

In [3]:
"""
vocab
"""
# Count token frequencies over the whole corpus. Feeding Counter a generator
# avoids `sum(tokenizer, [])`, which re-copies the growing list for every
# sentence (quadratic in corpus size). Tokens are visited in the same order.
counter = Counter(word for sentence in tokenizer for word in sentence) # 31851 words (author's note)

# (word, frequency) pairs, most frequent first; ties keep first-seen order.
vocab_sorted = counter.most_common()

# Word -> integer id, starting at 1 so that 0 stays free for padding.
vocab = {word: index for index, (word, _) in enumerate(vocab_sorted, start=1)}
# Reserve one extra id, after all corpus words, for out-of-vocabulary tokens.
vocab['OOV'] = len(vocab) + 1


# Replace every token by its id; tokens missing from the vocabulary map to OOV.
encoded_sentences = [
    [vocab.get(word, vocab['OOV']) for word in sentence]
    for sentence in tokenizer
]

In [4]:
"""
padding
"""
# Length of the longest encoded sentence; every shorter one is padded up to it.
max_len = max(map(len, encoded_sentences)) # 12 (author's note)

# Right-pad in place with the padding id 0 (vocabulary ids start at 1).
for tokens in encoded_sentences:
    shortfall = max_len - len(tokens)
    if shortfall > 0:
        tokens.extend([0] * shortfall)

In [5]:
"""
SGNS data processing
"""
from itertools import permutations, product
# Number of positions to the left and to the right of a centre token that count
# as its context.
window_size = 5

# Each entry is [token_a, token_b, label]: label 1 when token_b occurs within
# `window_size` positions of some occurrence of token_a in the same sentence,
# label 0 otherwise (negative sample drawn from the same sentence).
# NOTE(review): the sentences were right-padded with id 0 before this cell, so
# pairs that involve the padding id are generated too — confirm this is intended.
skip_gram = []
for encoded_sentence in encoded_sentences:
    # Build the set of in-window (centre, context) pairs ONCE per sentence.
    # Previously this collection was rebuilt (with a quadratic list `sum`) and
    # scanned linearly for every candidate pair.
    window_pairs = set()
    for i, center in enumerate(encoded_sentence):
        left = max(0, i - window_size)
        # Slicing clamps at the end of the list, so no explicit upper bound
        # (and no dependency on the global max_len) is needed.
        context = encoded_sentence[left:i] + encoded_sentence[i + 1:i + window_size + 1]
        window_pairs.update((center, neighbour) for neighbour in context)

    # Every distinct ordered pair of positions in the sentence is a candidate;
    # the set removes duplicates caused by repeated tokens.
    for pair in set(permutations(encoded_sentence, 2)):
        label = 1 if pair in window_pairs else 0
        skip_gram.append([*pair, label])

In [None]:
"""
embedding model-SGNS
"""