In [None]:
# !pip3 install kobert-transformers
# !pip3 install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
# !pip3 install gensim
# !pip3 install konlpy

In [1]:
import pandas as pd
import numpy as np
import ast
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from kobert_tokenizer import KoBERTTokenizer
from transformers import BertTokenizer, BertModel, DistilBertModel


In [2]:
df_naver = pd.read_csv('../data/filtered_naver.csv')
df_daum = pd.read_csv('../data/filtered_daum.csv')

In [3]:
df_naver = df_naver[['url', 'title', 'publication_date', 'content', 'platform_id', 'entities']]
df_daum = df_daum[['article_url', 'title', 'publication_date', 'content', 'platform_id', 'entities']]
df_daum.rename(columns={'article_url': 'url'}, inplace=True)

In [5]:
df = pd.concat([df_naver, df_daum], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86970 entries, 0 to 86969
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               86970 non-null  object
 1   title             86970 non-null  object
 2   publication_date  86970 non-null  object
 3   content           86970 non-null  object
 4   platform_id       86970 non-null  object
 5   entities          86970 non-null  object
dtypes: object(6)
memory usage: 4.0+ MB


In [6]:
df['entities'] = df['entities'].apply(lambda x: ast.literal_eval(x))
# print(ast.literal_eval(df['entities'].iloc[0]))

In [7]:
# Using Word2Vec to train extracted
model = Word2Vec(sentences=df['entities'], vector_size=100, window = 5, min_count=5, workers=1, sg=0)
model.wv.vectors.shape

(8644, 100)

In [8]:
# Function to vectorize a single sentence
def vectorize_sentence(sentence, model):
    # Filter words that are in the model's vocabulary
    words_in_vocab = [word for word in sentence if word in model.wv]
    if not words_in_vocab:
        return np.zeros(model.vector_size)  # Return a zero vector if no words are in the vocabulary
    # Compute the average of the word vectors
    return np.mean([model.wv[word] for word in words_in_vocab], axis=0)

In [9]:
# Vectorize all sentences in the DataFrame
df['sentence_vector'] = df['entities'].apply(lambda sentence: vectorize_sentence(sentence, model))

In [10]:
# Example: Access the vector for the first sentence
first_sentence_vector = df['sentence_vector'].iloc[0]
print(first_sentence_vector)

[-0.04816673  0.2127186  -0.01453179 -0.03493057  0.03977986 -0.49994612
  0.19581053  0.46102628 -0.26858893 -0.20721993 -0.10559356 -0.28797114
 -0.21497738  0.01436195  0.09616459 -0.26236767 -0.01631447 -0.32519728
  0.08266761 -0.4751123   0.23728502  0.10821801  0.30702502 -0.22461262
  0.04133242 -0.01995973 -0.08435918  0.03396496 -0.32426602 -0.10741277
  0.41913795 -0.03126465  0.18564647 -0.2999481  -0.02468124  0.2415332
  0.06063355 -0.26852983 -0.22428298 -0.31046468  0.03947989 -0.31375143
 -0.11042176  0.03847359  0.18404472 -0.11118976 -0.27571586 -0.017821
  0.01570749  0.25077862  0.15949066 -0.14106984 -0.03735182  0.05808438
 -0.1240892   0.2094386   0.22478369  0.1293802  -0.1398439   0.05922464
  0.0210564   0.17840818 -0.03111822 -0.04661287 -0.2330937   0.20819831
 -0.0195927   0.265367   -0.20482512  0.19415417 -0.24649572  0.29874712
  0.21585374 -0.08399407  0.3153918   0.04311277 -0.08530561  0.11351166
 -0.22085941 -0.09046302 -0.20125493 -0.14566287 -0.16

In [11]:
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertModel.from_pretrained('monologg/kobert')
# model = DistilBertModel.from_pretrained('monologg/distilkobert')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [13]:
# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
   

In [17]:
# Function to get a single vector from BERT embeddings (e.g., mean pooling)
def get_pooled_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over the sequence length dimension
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()


In [4]:
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

CUDA available: True
GPU Name: NVIDIA GeForce GTX 1660 SUPER


In [15]:
df_sample = df.sample(100)

In [18]:
# Get embeddings for each row
df_sample['content_vector'] = df_sample['content'].apply(get_pooled_bert_embedding)

In [51]:
# df_sample.to_csv('../data/embedded_sample.csv', index=False)

In [19]:
# Custom Dataset class
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['content']
        inputs = self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        return {key: value.squeeze(0) for key, value in inputs.items()}

In [20]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertModel.from_pretrained('monologg/kobert')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
   

In [21]:
# Create dataset and dataloader
dataset = TextDataset(df, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

In [22]:
# Function to get pooled BERT embeddings for a batch
def get_batch_embeddings(batch):
    inputs = {key: value.to(device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over the sequence length dimension
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

In [23]:
# Process the data in batches
content_vectors = []
for batch in dataloader:
    batch_embeddings = get_batch_embeddings(batch)
    content_vectors.extend(batch_embeddings)

In [24]:
df['content_vector'] = content_vectors

In [25]:
df['embedding'] = df.apply(lambda row: np.concatenate((row['sentence_vector'], row['content_vector'])), axis=1)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86970 entries, 0 to 86969
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               86970 non-null  object
 1   title             86970 non-null  object
 2   publication_date  86970 non-null  object
 3   content           86970 non-null  object
 4   platform_id       86970 non-null  object
 5   entities          86970 non-null  object
 6   sentence_vector   86970 non-null  object
 7   content_vector    86970 non-null  object
 8   embedding         86970 non-null  object
dtypes: object(9)
memory usage: 6.0+ MB


In [27]:
df.to_csv('../data/embedded.csv', index=False)