In [19]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import urllib.request
import json

In [20]:
# Source location of the Korean finance sentiment corpus and the local file it is saved to.
DATA_URL = "https://raw.githubusercontent.com/ukairia777/finance_sentiment_corpus/main/finance_data.csv"
DATA_CSV = "finance_data.csv"

# Download the CSV next to the notebook (re-downloaded on every run), then load it.
urllib.request.urlretrieve(DATA_URL, filename=DATA_CSV)
data = pd.read_csv(DATA_CSV)

In [21]:
# Integer ids for the three sentiment classes; predict_sent uses the inverse of this mapping.
label_to_id = {'neutral': 0, 'positive': 1, 'negative': 2}

# Map explicitly instead of Series.replace: recent pandas releases warn about (and plan to
# remove) the implicit object -> int downcast that replace relied on, which would leave the
# column as dtype=object. Values that are not keys pass through unchanged, so the cell can be
# re-run on an already-encoded column, and the int64 cast raises on any unexpected label
# (or missing value) instead of silently keeping a string.
data['labels'] = (
    data['labels']
    .map(lambda label: label_to_id.get(label, label))
    .astype('int64')
)

# Keep only the first occurrence of each sentence so duplicates cannot end up on both
# sides of the train/test split.
data = data.drop_duplicates(subset=['kor_sentence'])

In [22]:
# Hold out 20% of the sentences for evaluation. Stratifying on the label keeps the class
# proportions the same in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['kor_sentence'],
    data['labels'],
    stratify=data['labels'],
    test_size=0.2,
    random_state=0,
)

In [23]:
# Tokenizer for the same multilingual cased BERT checkpoint that the classifier is loaded from.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
)

def encode(texts, labels):
    """Tokenize sentences and pair them with their labels as a ``tf.data.Dataset``.

    Args:
        texts: Collection of sentences exposing ``.tolist()`` (a pandas Series in
            this notebook).
        labels: Labels aligned element-wise with ``texts``.

    Returns:
        An unbatched dataset of ``(features, label)`` pairs, where ``features`` is
        the dict of tensors produced by the module-level ``tokenizer``.
    """
    sentences = texts.tolist()
    # All sentences are tokenized in one call with truncation and padding enabled
    # and max_length=128, returning TensorFlow tensors.
    tokenized = tokenizer(
        sentences,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='tf',
    )
    features = dict(tokenized)
    return tf.data.Dataset.from_tensor_slices((features, labels))

# Training pipeline: a shuffle buffer as large as the training set, then batches of 16.
train_dataset = (
    encode(X_train, y_train)
    .shuffle(buffer_size=len(X_train))
    .batch(batch_size=16)
)

# Evaluation pipeline: same batch size, original order preserved (no shuffling).
test_dataset = encode(X_test, y_test).batch(batch_size=16)

In [24]:
# The classifier exposes raw logits (see `outputs.logits` in predict_sent), so the loss
# is told to apply the softmax itself; labels are plain integer class ids.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Pretrained multilingual BERT encoder with a 3-way classification head
# (neutral / positive / negative).
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=3,
)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Early stopping only fires after `patience` consecutive epochs without a val_loss
# improvement. With a 3-epoch budget the previous patience=3 could never be reached, so
# the callback was a no-op. patience=1 stops as soon as an epoch fails to improve, and
# restore_best_weights=True rolls the model back to the weights of the best epoch so
# that the subsequently saved model is the best one seen, not merely the last one.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
)

# NOTE(review): the held-out test split doubles as the validation set here, so the
# early-stopping decision is tuned on the same data later used to judge the model.
# Consider carving a separate validation split out of the training data.
model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
    callbacks=[early_stop],
)

Epoch 1/3


KeyboardInterrupt: 

In [None]:
# Write the fine-tuned classifier and its tokenizer into the same directory so that
# predict_sent can restore both from './saved_model'.
model.save_pretrained(save_directory='./saved_model')
tokenizer.save_pretrained(save_directory='./saved_model')

In [26]:
# Tokenizer/model pairs already loaded from disk, keyed by their directory.
_PREDICTOR_CACHE = {}


def _load_predictor(model_dir):
    """Return the ``(tokenizer, model)`` pair stored in ``model_dir``, loading it only once."""
    if model_dir not in _PREDICTOR_CACHE:
        _PREDICTOR_CACHE[model_dir] = (
            BertTokenizer.from_pretrained(model_dir),
            TFBertForSequenceClassification.from_pretrained(model_dir),
        )
    return _PREDICTOR_CACHE[model_dir]


def predict_sent(text, model_dir='./saved_model'):
    """Classify the sentiment of ``text`` with the saved model and return it as JSON.

    The tokenizer and model are read from ``model_dir`` on the first call and reused
    afterwards; previously both were reloaded from disk on every call. After saving
    a new model into the same directory, re-run this cell (or clear
    ``_PREDICTOR_CACHE``) so the fresh weights are picked up.

    Args:
        text: Sentence to classify.
        model_dir: Directory containing the saved tokenizer and model. Defaults to
            the directory the notebook saves to.

    Returns:
        A JSON string with the keys ``"text"`` and ``"sentiment"``, where sentiment
        is one of ``'neutral'``, ``'positive'`` or ``'negative'``. Non-ASCII
        characters are emitted as-is rather than escaped.
    """
    # Local names differ from the notebook's global `tokenizer` / `model` on purpose,
    # so the training-time objects are not shadowed inside this function.
    saved_tokenizer, saved_model = _load_predictor(model_dir)

    inputs = saved_tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=128)
    logits = saved_model(inputs).logits
    # argmax yields a numpy integer; convert to a plain int before the dict lookup.
    pred_class = int(tf.math.argmax(logits, axis=1).numpy()[0])

    # Inverse of the label encoding applied to the training data.
    label_map = {0: 'neutral', 1: 'positive', 2: 'negative'}

    result = {
        "text": text,
        "sentiment": label_map[pred_class],
    }
    return json.dumps(result, ensure_ascii=False)

In [27]:
# Smoke-test the saved model on a short Korean headline and show the JSON result.
sample_prediction = predict_sent("비트코인 하락")
print(sample_prediction)

Some layers from the model checkpoint at ./saved_model were not used when initializing TFBertForSequenceClassification: ['dropout_151']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./saved_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


{"text": "비트코인 하락", "sentiment": "negative"}
