<a href="https://colab.research.google.com/github/rezu98/colab/blob/main/Colaboratory%EC%97%90_%EC%98%A4%EC%8B%A0_%EA%B2%83%EC%9D%84_%ED%99%98%EC%98%81%ED%95%A9%EB%8B%88%EB%8B%A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import sentencepiece as spm

In [None]:
import logging
import os
import unicodedata
from shutil import copyfile

from transformers import PreTrainedTokenizer


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer_78b3253a26.model",
                     "vocab_txt": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model"
    },
    "vocab_txt": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt"
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "monologg/kobert": 512,
    "monologg/kobert-lm": 512,
    "monologg/distilkobert": 512
}

PRETRAINED_INIT_CONFIGURATION = {
    "monologg/kobert": {"do_lower_case": False},
    "monologg/kobert-lm": {"do_lower_case": False},
    "monologg/distilkobert": {"do_lower_case": False}
}

SPIECE_UNDERLINE = u'▁'


class KoBertTokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:
            - requires `SentencePiece `_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
            self,
            vocab_file,
            vocab_txt,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs):
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

        # Build vocab
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, 'r', encoding='utf-8') as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.idx2token)

    def get_vocab(self):
        return dict(self.token2idx, **self.added_tokens_encoder)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string. """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A KoBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A KoBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return out_vocab_model, out_vocab_txt

In [None]:
pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
from kobert_tokenizer import KoBERTTokenizer
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
tokenizer.encode("한국어 모델을 공유합니다.")
[2, 4958, 6855, 2046, 7088, 1050, 7843, 54, 3]

In [None]:
print(tokenizer.encode("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))
print(tokenizer.tokenize("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))

In [None]:
print(tokenizer.tokenize("금융통화위원회는 다음 통화정책방향 결정시까지 한국은행 기준금리를 현 수준(3.50%)에서 유지하여 통화정책을 운용하기로 하였다. 물가상승률이 기조적인 둔화 흐름을 이어갈 것으로 전망되지만 주요국의 통화긴축 기조 장기화, 지정학적 리스크 증대 등으로 물가 및 성장 전망 경로의 불확실성이 크게 높아진 가운데 물가상승률의 둔화 속도가 당초 예상보다 완만해질 것으로 전망되고, 가계부채의 증가 흐름도 지켜볼 필요가 있는 만큼 현재의 긴축 기조를 유지하는 것이 적절하다고 보았다. 추가 인상 필요성은 대내외 정책 여건의 변화를 점검하면서 판단해 나갈 것이다. 세계경제는 주요국의 통화긴축 기조 장기화, 이스라엘·하마스 사태 등의 영향으로 경기 및 인플레이션 흐름과 관련한 불확실성이 증대되었다. 글로벌 경기는 성장세 둔화가 이어질 것으로 전망되며, 주요국 인플레이션은 점차 둔화되고 있지만 여전히 높은 수준인 가운데 국제유가 상승 등으로 상방 리스크가 증대되었다. 국제금융시장에서는 국채금리가 큰 폭 상승하고 미 달러화가 상당폭 강세를 나타내는 등 주요 가격변수의 변동성이 확대되었다. 앞으로 세계경제와 국제금융시장은 국제유가 움직임 및 글로벌 인플레이션의 둔화 흐름, 주요국의 통화정책 변화 및 파급효과, 이스라엘·하마스 사태의 전개양상 등에 영향받을 것으로 보인다. 국내경제는 소비 회복세가 다소 더딘 모습이지만 수출 부진이 완화되면서 성장세가 완만한 개선 흐름을 이어갔다. 고용은 낮은 실업률과 견조한 취업자수 증가가 이어지는 등 전반적으로 양호한 상황이다. 앞으로 국내경제는 수출 부진 완화로 성장세가 점차 개선되면서 금년 성장률도 지난 8월 전망치(1.4%)에 대체로 부합할 것으로 예상된다. 다만 지정학적 리스크 증대, 주요국의 통화긴축 기조 장기화 등의 영향으로 향후 성장 경로의 불확실성이 높아진 것으로 판단된다. 소비자물가는 에너지 및 농산물 가격 상승 등으로 9월중 상승률이 3.7%로 전월보다 높아졌지만, 근원인플레이션율(식료품 및 에너지 제외 지수)과 단기 기대인플레이션율은 모두 9월중 3.3%로 전월과 같은 수준을 나타내었다. 앞으로 소비자물가 상승률은 금년말에는 3%대 초반으로 낮아지고 내년에도 완만한 둔화 흐름을 이어갈 것으로 보인다. 다만 높아진 국제유가와 환율의 파급영향, 이스라엘·하마스 사태 등으로 물가의 상방 리스크가 높아짐에 따라 소비자물가 상승률이 목표수준으로 수렴하는 시기도 당초 예상보다 늦춰질 가능성이 커진 것으로 판단된다. 근원물가도 수요압력 약화 등으로 기조적인 둔화 흐름을 이어가겠으나 누적된 비용인상 압력의 파급영향 지속 등으로 둔화 속도는 당초 예상보다 완만해질 가능성이 높은 것으로 판단된다. 금융·외환시장은 미 연준의 높은 정책금리 장기화 시사, 지정학적 리스크 증대 등으로 변동성이 확대된 가운데 장기 국고채 금리와 원/달러 환율이 상당폭 상승하고 주가는 하락하였다. 일부 비은행부문의 리스크는 진정되는 모습이다. 주택가격은 수도권을 중심으로 상승세가 이어졌으며 가계대출은 주택관련대출을 중심으로 증가세가 지속되었다. 금융통화위원회는 앞으로 성장세를 점검하면서 중기적 시계에서 물가상승률이 목표수준에서 안정될 수 있도록 하는 한편 금융안정에 유의하여 통화정책을 운용해 나갈 것이다. 국내경제는 성장세가 점차 개선되는 가운데 정책 여건의 불확실성도 높아진 상황이다. 따라서 물가안정에 중점을 두고 긴축 기조를 상당기간 지속하면서 추가 인상 필요성을 판단해 나갈 것이다. 이 과정에서 인플레이션 둔화 흐름, 금융안정 측면의 리스크와 성장의 하방위험, 가계부채 증가 추이, 주요국의 통화정책 변화, 지정학적 리스크의 전개양상 등을 면밀히 점검해 나갈 것이다."))

In [None]:
!pip install konlpy

In [None]:
import pandas as pd
from konlpy.tag import Okt

In [None]:
# Load the KoSenticNet sentiment dictionary
kosenticnet_df = pd.read_csv('path_to_kosenticnet.csv')

In [None]:
# Initialize the KoNLPy tagger
okt = Okt()

# Example sentence
sentence = "이 영화는 정말 멋있어요."

# Tokenize the sentence
tokens = okt.morphs(sentence)

In [None]:
print(okt.pos(u'이 밤 그날의 반딧불을 당신의 창 가까이 보낼게요'))

In [None]:
# Load the sentiment data from kosenticnet.py
kosenticnet_data = {ksenticnet["가게"] = ['0', '0.124', '-0.05', '0.203', '#interest', '#admiration', 'positive', '0.09', '매점', '상점', '판매점']}

# Extract the sentiment values and create a dictionary
sentiment_dictionary = {}
for word, data in kosenticnet_data.items():
    sentiment_value = data[0]
    sentiment_dictionary[word] = float(sentiment_value)

# Example usage
sample_word = "불확실성"
if sample_word in sentiment_dictionary:
    sentiment_score = sentiment_dictionary[sample_word]
    print(f"The sentiment score for '{sample_word}' is: {sentiment_score}")
else:
    print(f"No sentiment score found for '{sample_word}' in the dictionary.")

# You can further save the sentiment dictionary to a file for later use
# For example, to save the dictionary to a CSV file
df = pd.DataFrame(list(sentiment_dictionary.items()), columns=['Word', 'Sentiment_Score'])
df.to_csv('custom_sentiment_dictionary.csv', index=False)

In [None]:
import re

# Example string
example_string = "가게0"

# Split the string into word and number
match = re.match(r'(\D+)(\d+)', example_string)

if match:
    word = match.group(1)
    number = match.group(2)
    print(f"Word: {word}, Number: {number}")
else:
    print("Pattern not found in the string.")

In [None]:
# Load the CSV file
df = pd.read_csv('ccc.csv', header=None, names=['text'])

In [86]:
# Load the CSV file
df = pd.read_csv('ccc.csv', header=None, names=['word', 'sentiment'])

# Display the dataframe
print(df)

      word  sentiment
0      가게       0.000
1      가격      -0.003
2      가계       0.000
3      가곡       0.045
4      가공       0.000
...    ...        ...
5460  희석하       0.000
5461   희열       0.635
5462   힐난      -0.540
5463  힐난하      -0.700
5464    힘       0.775

[5465 rows x 2 columns]


In [96]:
!pip install nltk
!pip install vaderSentiment

import nltk
nltk.download('vader_lexicon')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[31mERROR: Operation cancelled by user[0m[31m


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [101]:
# Example text
sample_text = "The Monetary Policy Board of the Bank of Korea decided today to leave the Base Rate unchanged at 3.50% for the intermeeting period. Although inflation is projected to continue its underlying trend of a slowdown, uncertainties regarding the future path of inflation and growth have risen significantly due to a prolongation of restrictive monetary policy stances in major countries and heightened geopolitical risks. In addition, it is forecast that the pace of inflation slowdown will moderate more than previously expected, and it is necessary to monitor household debt growth. The Board, therefore, sees that it is appropriate to maintain its current restrictive policy stance. Regarding the need to raise the Base Rate further, the Board will make a judgement while assessing the changes in domestic and external policy conditions. The currently available information suggests that uncertainties regarding economic and inflationary trends have increased across the global economy, driven by a prolongation of restrictive monetary policy stances in major countries and by the Israel-Hamas conflict. Global economic growth is projected to continue slowing. Inflation in major countries still remains high, though falling gradually, and upside risks have increased due to the rise in global oil prices. In global financial markets, volatility of major price variables has increased with government bond yields rising significantly and with the U.S. dollar strengthening considerably. Looking ahead, the Board sees global economic growth and global financial markets as likely to be affected by the movements of global oil prices and the global inflation slowdown, by monetary policy changes in major countries and their effects, and by developments in the Israel-Hamas conflict. Domestic economic growth has continued to improve at a modest pace owing to the easing of sluggishness in exports, although the recovery in private consumption has been somewhat slow. Labor market conditions have been generally favorable, as both a low unemployment rate and a robust increase in the number of persons employed have continued. Going forward, domestic economic growth is expected to improve gradually with the easing of the sluggishness in exports. GDP growth for the year is expected to be generally consistent with the August forecast of 1.4%. However, uncertainties surrounding the economic outlook are judged to be elevated, affected by heightened geopolitical risks and by the prolongation of restrictive monetary policy stances in major countries. Consumer price inflation has risen from August to 3.7% in September, due to the increase in the price of energy and of agricultural products. However, both core inflation (excluding changes in food and energy prices from the CPI) and short-term inflation expectations among the general public have stayed at 3.3% in September, the same as in August. Looking ahead, it is forecast that consumer price inflation will fall to the lower-3% range at the end of this year and will continue to gradually moderate in 2024. However, upside risks to inflation have increased due to the effects of higher global oil prices and exchange rates, and due to the Israel-Hamas conflict. Accordingly, it is judged that the timing of consumer price inflation converging on the target level is more likely to be delayed than previously expected. Meanwhile, core inflation is also projected to maintain its underlying slowing trend, owing to the weakening of demand-side pressures. However, the pace of the slowdown is likely to be more modest than previously forecast due to the continuing spillover effects of accumulated cost pressure. In financial and foreign exchange markets, volatility has increased as the U.S. Federal Reserve has signaled a prolongation of a high policy rate and as geopolitical risks have expanded. Long-term Korean Treasury bond yields and the Korean won to U.S. dollar exchange rate have risen significantly and stock prices have fallen. Meanwhile, the risks to some non-bank financial sectors have eased. Housing prices have continued their upward trend, especially in Seoul and its surrounding areas. Household loans have continued to increase, mainly driven by housing-related loans. The Board will continue to conduct monetary policy in order to stabilize consumer price inflation at the target level over the medium-term horizon as it monitors economic growth, while paying attention to financial stability. While domestic economic growth is forecast to gradually improve, uncertainties surrounding the policy decision have also risen. The Board, therefore, will maintain a restrictive policy stance for a considerable time with an emphasis on ensuring price stability, while making a judgement regarding the need to raise the Base Rate further. In this process, the Board will thoroughly assess the inflation slowdown, financial stability risks, economic downside risks, monetary policy changes in major countries, household debt growth, and developments in geopolitical risks."
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Perform sentiment analysis
scores = analyzer.polarity_scores(sample_text)

print(scores)

{'neg': 0.077, 'neu': 0.828, 'pos': 0.095, 'compound': 0.9534}


In [118]:
import datetime as dt
import sys

def load_masterdictionary(file_path, print_flag=False, f_log=None, get_other=False):
    start_local = dt.datetime.now()
    # Setup dictionaries
    _master_dictionary = {}

    # Loop through words and load dictionaries
    with open(file_path, encoding="utf-8") as f:
        _total_documents = 0
        _md_header = f.readline()  # Consume header line
        print()
        for line in f:
            cols = line.rstrip('\n').split(',')
            word = cols[0]
            uncertainty_value = int(cols[9])
            _master_dictionary[word] = uncertainty_value

    if print_flag:
        print('\r', end='')  # clear line
        print(f'\nMaster Dictionary loaded from file:\n  {file_path}\n')
        print(f'  master_dictionary has {len(_master_dictionary):,} words.\n')

    if get_other:
        return _master_dictionary, _md_header, _total_documents
    else:
        return _master_dictionary

if __name__ == '__main__':
    start = dt.datetime.now()
    print(f'\n\n{start.strftime("%c")}\nPROGRAM NAME: {sys.argv[0]}\n')
    md = r'd.csv'  # Update with your file path
    master_dictionary = load_masterdictionary(md, True)

    # Example word to check
    word_to_check = 'UNCERTAINTY'  # Replace with the word you want to check
    if word_to_check in master_dictionary:
        uncertainty_value = master_dictionary[word_to_check]
        if uncertainty_value == 0:
            print(f"The word '{word_to_check}' is not classified as an uncertain word.")
        else:
            print(f"The word '{word_to_check}' is classified as an uncertain word since the year {uncertainty_value}.")
    else:
        print(f"The word '{word_to_check}' is not found in the dictionary.")

    print(f'\nRuntime: {(dt.datetime.now()-start)}')
    print(f'\nNormal termination.\n{dt.datetime.now().strftime("%c")}\n')



Tue Oct 24 15:40:47 2023
PROGRAM NAME: /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py



Master Dictionary loaded from file:
  d.csv

  master_dictionary has 86,531 words.

The word 'UNCERTAINTY' is classified as an uncertain word since the year 2009.

Runtime: 0:00:00.218213

Normal termination.
Tue Oct 24 15:40:47 2023



In [123]:
def load_masterdictionary(file_path, print_flag=False, f_log=None, get_other=False):
    start_local = dt.datetime.now()
    # Setup dictionaries
    _master_dictionary = {}

    # Loop through words and load dictionaries
    with open(file_path, encoding="utf-8") as f:
        _total_documents = 0
        _md_header = f.readline()  # Consume header line
        print()
        for line in f:
            cols = line.rstrip('\n').split(',')
            word = cols[0]
            uncertainty_value = int(cols[9])
            _master_dictionary[word] = uncertainty_value

    if print_flag:
        print('\r', end='')  # clear line
        print(f'\nMaster Dictionary loaded from file:\n  {file_path}\n')
        print(f'  master_dictionary has {len(_master_dictionary):,} words.\n')

    if get_other:
        return _master_dictionary, _md_header, _total_documents
    else:
        return _master_dictionary

def evaluate_uncertainty_index(text, master_dictionary):
    words = text.split()
    uncertain_words_count = 0
    for word in words:
        if word in master_dictionary and master_dictionary[word] != 0:
            uncertain_words_count += 1
    uncertainty_index = uncertain_words_count / len(words)  # Calculate the uncertainty index
    return uncertainty_index

if __name__ == '__main__':
    start = dt.datetime.now()
    print(f'\n\n{start.strftime("%c")}\nPROGRAM NAME: {sys.argv[0]}\n')
    md = r'd.csv'  # Update with your file path
    master_dictionary = load_masterdictionary(md, True)

    # Example FOMC statement to evaluate
    fomc_statement = "The committee is uncertain about the economic outlook due to the recent market fluctuations."

    uncertainty_index = evaluate_uncertainty_index(fomc_statement, master_dictionary)
    print(f"\nUncertainty index for the FOMC statement: {uncertainty_index:.2f}")

    print(f'\nRuntime: {(dt.datetime.now()-start)}')
    print(f'\nNormal termination.\n{dt.datetime.now().strftime("%c")}\n')



Tue Oct 24 15:46:13 2023
PROGRAM NAME: /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py



Master Dictionary loaded from file:
  d.csv

  master_dictionary has 86,531 words.


Uncertainty index for the FOMC statement: 0.00

Runtime: 0:00:00.259261

Normal termination.
Tue Oct 24 15:46:13 2023



In [128]:
import datetime as dt
import sys
import nltk
nltk.download('punkt')

def load_masterdictionary(file_path, print_flag=False, f_log=None, get_other=False):
    start_local = dt.datetime.now()
    # Setup dictionaries
    _master_dictionary = {}

    # Loop through words and load dictionaries
    with open(file_path, encoding="utf-8") as f:
        _total_documents = 0
        _md_header = f.readline()  # Consume header line
        print()
        for line in f:
            cols = line.rstrip('\n').split(',')
            word = cols[0]
            uncertainty_value = int(cols[9])
            _master_dictionary[word] = uncertainty_value

    if print_flag:
        print('\r', end='')  # clear line
        print(f'\nMaster Dictionary loaded from file:\n  {file_path}\n')
        print(f'  master_dictionary has {len(_master_dictionary):,} words.\n')

    if get_other:
        return _master_dictionary, _md_header, _total_documents
    else:
        return _master_dictionary

def evaluate_uncertainty_index(text, master_dictionary):
    words = nltk.word_tokenize(text)  # Tokenize the text into words
    uncertain_words_count = 0
    for word in words:
        if word in master_dictionary and master_dictionary[word] != 0:
            uncertain_words_count += 1
    uncertainty_index = uncertain_words_count / len(words)  # Calculate the uncertainty index
    return uncertainty_index

if __name__ == '__main__':
    start = dt.datetime.now()
    print(f'\n\n{start.strftime("%c")}\nPROGRAM NAME: {sys.argv[0]}\n')
    md = r'd.csv'  # Update with your file path
    master_dictionary = load_masterdictionary(md, True)

    # Example FOMC statement to evaluate
    fomc_statement = "The committee is uncertain about the economic outlook due to the recent market fluctuations."

    uncertainty_index = evaluate_uncertainty_index(fomc_statement, master_dictionary)
    print(f"\nUncertainty index for the FOMC statement: {uncertainty_index:.2f}")

    print(f'\nRuntime: {(dt.datetime.now()-start)}')
    print(f'\nNormal termination.\n{dt.datetime.now().strftime("%c")}\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




Tue Oct 24 15:48:18 2023
PROGRAM NAME: /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py



Master Dictionary loaded from file:
  d.csv

  master_dictionary has 86,531 words.


Uncertainty index for the FOMC statement: 0.00

Runtime: 0:00:00.267558

Normal termination.
Tue Oct 24 15:48:18 2023



In [140]:
import datetime as dt
import sys
import nltk
nltk.download('punkt')

def load_masterdictionary(file_path, print_flag=False, f_log=None, get_other=False):
    start_local = dt.datetime.now()
    # Setup dictionaries
    _master_dictionary = {}

    # Loop through words and load dictionaries
    with open(file_path, encoding="utf-8") as f:
        _total_documents = 0
        _md_header = f.readline()  # Consume header line
        print()
        for line in f:
            cols = line.rstrip('\n').split(',')
            word = cols[0].upper()  # Convert word to uppercase
            negative_score = int(cols[8])
            positive_score = int(cols[9])
            uncertainty_value = int(cols[9])
            _master_dictionary[word] = {'negative': negative_score, 'positive': positive_score, 'uncertainty': uncertainty_value}

    if print_flag:
        print('\r', end='')  # clear line
        print(f'\nMaster Dictionary loaded from file:\n  {file_path}\n')
        print(f'  master_dictionary has {len(_master_dictionary):,} words.\n')

    if get_other:
        return _master_dictionary, _md_header, _total_documents
    else:
        return _master_dictionary

def evaluate_sentiment_indices(text, master_dictionary):
    words = nltk.word_tokenize(text)  # Tokenize the text into words
    words_uppercase = [word.upper() for word in words]  # Convert all words to uppercase
    negative_score, positive_score, uncertain_words_count = 0, 0, 0
    negative_words_count, positive_words_count = 0, 0
    for word in words_uppercase:
        if word in master_dictionary:
            if master_dictionary[word]['uncertainty'] != 0:
                uncertain_words_count += 1
            negative_score += master_dictionary[word]['negative']
            positive_score += master_dictionary[word]['positive']
            if master_dictionary[word]['negative'] > 0:
                negative_words_count += 1
            if master_dictionary[word]['positive'] > 0:
                positive_words_count += 1

    negative_index = negative_score / len(words)  # Calculate the negative index
    positive_index = positive_score / len(words)  # Calculate the positive index
    uncertainty_index = uncertain_words_count / len(words)  # Calculate the uncertainty index

    return negative_index, positive_index, uncertainty_index, uncertain_words_count, negative_words_count, positive_words_count

if __name__ == '__main__':
    start = dt.datetime.now()
#    print(f'\n\n{start.strftime("%c")}\nPROGRAM NAME: {sys.argv[0]}\n')
    md = r'd.csv'  # Update with your file path
    master_dictionary = load_masterdictionary(md, True)

    # Example FOMC statement to evaluate
    fomc_statement = "The Monetary Policy Board of the Bank of Korea decided today to leave the Base Rate unchanged at 3.50% for the intermeeting period. It is forecast that inflation will remain above the target level for a considerable time although it is projected to continue to slow. The Board, therefore, sees that it is appropriate to maintain its current restrictive policy stance. Regarding the need to raise the Base Rate further, the Board will make a judgement while assessing the changes in domestic and external policy conditions. The currently available information suggests that global economic growth has been more favorable than expected, but growth is projected to gradually slow due to the restrictive monetary policy stance being sustained in major countries and due to the contraction in bank credit supply. Global inflation still remains high, while continuing its slowdown, and core inflation is declining at a relatively slow pace. In global financial markets, the U.S. dollar initially weakened as the U.S. Federal Reserve signaled a potential end to rate hikes, but then it has fluctuated since mid-May affected by economic indicators exceeding market expectations and by developments in U.S. debt ceiling negotiations. Long-term government bond yields in major countries have risen after having fluctuated within a narrow range. Looking ahead, the Board sees global economic growth and global financial markets as likely to be affected by the pace of global inflation slowdown, monetary policy changes in major countries, U.S. dollar trends, risks to small and medium-sized U.S. banks, debt ceiling negotiations in the U.S., and the recovery in the Chinese economy. Domestic economic growth has continued to slow, with ongoing sluggishness of exports and investment, although private consumption has shown a modest recovery led by services. Labor market conditions have generally continued to be favorable, but the increase in the number of persons employed has declined due to the economic slowdown. Going forward, domestic economic growth is expected to remain weak for some time. From the second half of this year, however, it is expected to recover gradually with an easing of the sluggishness in the IT industry and the impact of the Chinese economic recovery. GDP growth for this year is projected to be 1.4%, lower than the February forecast of 1.6%, but uncertainties regarding the timing of a rebound in the IT industry, the domestic impact of the recovery in the Chinese economy, and economic growth in major advanced countries are all judged to be high. Consumer price inflation has continued to moderate as expected, declining from 4.2% in March to 3.7% in April. This is mainly because the decline in the price of petroleum products has widened and the rise in the prices of processed food products has weakened. Core inflation (excluding changes in food and energy prices from the CPI) has stayed at 4.0%, and short-term inflation expectations among the general public have moved down to 3.5% in May. Looking ahead, it is forecast that consumer price inflation will fall considerably owing to the base effect from the sharp rises in global oil prices last year, and then will rise slightly and fluctuate at around the 3% level until the end of this year. Consumer price inflation for this year is expected to be consistent with the February forecast of 3.5%. Meanwhile, it is judged that the pace of core inflation slowdown is likely to be more modest than previously forecast due to accumulated cost pressure and favorable demand in services. Core inflation is projected to be 3.3%, which is higher than the February forecast of 3.0%. The inflation path is likely to be affected by movements of global oil prices and exchange rates, the degree of economic slowdown at home and abroad, and any further increase in public utility fees. In financial and foreign exchange markets, the Korean won to U.S. dollar exchange rate has fluctuated considerably due to trends in the trade balance, expectations of an end to policy rate hikes by the U.S. Federal Reserve, and negotiations on the U.S. debt ceiling. Long-term Korean Treasury bond yields have shown a modest increase, influenced by the movements of government bond yields in major countries. Household loans have slightly increased and the extent of the decline in housing prices has narrowed. The Board will continue to conduct monetary policy in order to stabilize consumer price inflation at the target level over the medium-term horizon as it monitors economic growth, while paying attention to financial stability. Domestic economic growth is expected to remain low, but inflation is projected to remain above the target level for a considerable time. Moreover, uncertainties surrounding the policy decision are judged to be high. The Board, therefore, will maintain a restrictive policy stance for a considerable time with an emphasis on ensuring price stability. Regarding the need to raise the Base Rate further, the Board will make a judgement while thoroughly assessing the pace of inflation slowdown, the economic downside risks and financial stability risks, the effects of the Base Rate raises, and monetary policy changes in major countries."

    negative_index, positive_index, uncertainty_index, uncertain_words_count, negative_words_count, positive_words_count = evaluate_sentiment_indices(fomc_statement, master_dictionary)
    print(f"\nNumber of uncertain words: {uncertain_words_count}")
    print(f"Number of negative words: {negative_words_count}")
    print(f"Number of positive words: {positive_words_count}")
    print(f"Negative index for the FOMC statement: {negative_index:.2f}")
    print(f"Positive index for the FOMC statement: {positive_index:.2f}")
    print(f"Uncertainty index for the FOMC statement: {uncertainty_index:.2f}")

#    print(f'\nRuntime: {(dt.datetime.now()-start)}')
#    print(f'\nNormal termination.\n{dt.datetime.now().strftime("%c")}\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




Tue Oct 24 16:01:37 2023
PROGRAM NAME: /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py



Master Dictionary loaded from file:
  d.csv

  master_dictionary has 86,531 words.


Number of uncertain words: 11
Number of negative words: 8
Number of positive words: 11
Negative index for the FOMC statement: 17.38
Positive index for the FOMC statement: 23.89
Uncertainty index for the FOMC statement: 0.01

Runtime: 0:00:00.429868

Normal termination.
Tue Oct 24 16:01:37 2023



In [126]:
words(head)

NameError: ignored

In [None]:
# 마스크 인풋
valid_num = len(tokenizer.encode("전율을 일으키는 영화. 다시 보고싶은 영화")

In [None]:
def convert_data(data_df):
    global tokenizer

    SEQ_LEN = 64 #SEQ_LEN : 버트에 들어갈 인풋의 길이

    tokens, masks, segments, targets = [], [], [], []

    for i in tqdm(range(len(data_df))):
        # token : 문장을 토큰화함
        token = tokenizer.encode(data_df[DATA_COLUMN][i], truncation=True, padding='max_length', max_length=SEQ_LEN)

        # 마스크는 토큰화한 문장에서 패딩이 아닌 부분은 1, 패딩인 부분은 0으로 통일
        num_zeros = token.count(0)
        mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros

        # 문장의 전후관계를 구분해주는 세그먼트는 문장이 1개밖에 없으므로 모두 0
        segment = [0]*SEQ_LEN

        # 버트 인풋으로 들어가는 token, mask, segment를 tokens, segments에 각각 저장
        tokens.append(token)
        masks.append(mask)
        segments.append(segment)

        # 정답(긍정 : 1 부정 0)을 targets 변수에 저장해 줌
        targets.append(data_df[LABEL_COLUMN][i])

    # tokens, masks, segments, 정답 변수 targets를 numpy array로 지정
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)

    return [tokens, masks, segments], targets

# 위에 정의한 convert_data 함수를 불러오는 함수를 정의
def load_data(pandas_dataframe):
    data_df = pandas_dataframe
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    data_df[LABEL_COLUMN] = data_df[LABEL_COLUMN].astype(int)
    data_x, data_y = convert_data(data_df)
    return data_x, data_y

SEQ_LEN = 64
BATCH_SIZE = 32
# 긍부정 문장을 포함하고 있는 칼럼
DATA_COLUMN = "document"
# 긍정인지 부정인지를 (1=긍정,0=부정) 포함하고 있는 칼럼
LABEL_COLUMN = "label"

# train 데이터를 버트 인풋에 맞게 변환
train_x, train_y = load_data(train)

In [None]:
# 훈련 성능을 검증한 test 데이터를 버트 인풋에 맞게 변환
test_x, test_y = load_data(test)

In [None]:
model = TFBertModel.from_pretrained("monologg/kobert", from_pt=True)
# 토큰 인풋, 마스크 인풋, 세그먼트 인풋 정의
token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
# 인풋이 [토큰, 마스크, 세그먼트]인 모델 정의
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])

In [None]:
bert_outputs

In [None]:
bert_outputs = bert_outputs[1]

In [None]:
# Rectified Adam 옵티마이저 사용
!pip install tensorflow_addons
import tensorflow_addons as tfa
# 총 batch size * 4 epoch = 2344 * 4
opt = tfa.optimizers.RectifiedAdam(lr=5.0e-5, total_steps = 2344*2, warmup_proportion=0.1, min_lr=1e-5, epsilon=1e-08, clipnorm=1.0)


In [None]:
sentiment_drop = tf.keras.layers.Dropout(0.5)(bert_outputs)
sentiment_first = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(sentiment_drop)
sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
sentiment_model.compile(optimizer=opt, loss=tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])

In [None]:
sentiment_model.summary()


In [None]:
sentiment_model.fit(train_x, train_y, epochs=2, shuffle=True, batch_size=64, validation_data=(test_x, test_y))

In [None]:
def predict_convert_data(data_df):
    global tokenizer
    tokens, masks, segments = [], [], []

    for i in tqdm(range(len(data_df))):

        token = tokenizer.encode(data_df[DATA_COLUMN][i], max_length=SEQ_LEN, truncation=True, padding='max_length')
        num_zeros = token.count(0)
        mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros
        segment = [0]*SEQ_LEN

        tokens.append(token)
        segments.append(segment)
        masks.append(mask)

    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    return [tokens, masks, segments]

# 위에 정의한 convert_data 함수를 불러오는 함수를 정의
def predict_load_data(pandas_dataframe):
    data_df = pandas_dataframe
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    data_x = predict_convert_data(data_df)
    return data_x

In [None]:
test_set = predict_load_data(test)

In [None]:
test_set


In [None]:
preds = sentiment_model.predict(test_set)


In [None]:
# 부정이면 0, 긍정이면 1 출력
preds

In [None]:
from sklearn.metrics import classification_report
y_true = test['label']
# F1 Score 확인
print(classification_report(y_true, np.round(preds,0)))

In [None]:
import logging
tf.get_logger().setLevel(logging.ERROR)

In [None]:
def sentence_convert_data(data):
    global tokenizer
    tokens, masks, segments = [], [], []
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')

    num_zeros = token.count(0)
    mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros
    segment = [0]*SEQ_LEN

    tokens.append(token)
    segments.append(segment)
    masks.append(mask)

    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    return [tokens, masks, segments]

def movie_evaluation_predict(sentence):
    data_x = sentence_convert_data(sentence)
    predict = sentiment_model.predict(data_x)
    predict_value = np.ravel(predict)
    predict_answer = np.round(predict_value,0).item()

    if predict_answer == 0:
      print("(부정 확률 : %.2f) 부정적인 영화 평가입니다." % (1-predict_value))
    elif predict_answer == 1:
      print("(긍정 확률 : %.2f) 긍정적인 영화 평가입니다." % predict_value)

In [None]:
movie_evaluation_predict("보던거라 계속보고있는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 소극적인모습에 ")