In [2]:
import json
import os

import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, BertConfig

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)


  from .autonotebook import tqdm as notebook_tqdm


cuda:0


In [3]:

def make_csv(path):
    """Convert ``./train_data/<path>.json`` into ``./train_data/<path>.csv``.

    The JSON file must contain a list of objects, each with a ``text`` entry and
    a ``labels`` sequence of five scores ordered as despair, optimism, concern,
    excitement, stability.  The CSV is written without an index column.

    Args:
        path: Base name of the dataset file (no directory, no extension),
            e.g. ``'train'``.

    Raises:
        ValueError: If an item does not yield exactly six columns
            (text plus five labels).
    """
    base = './train_data/' + path
    columns = ['text', 'despair', 'optimism', 'concern', 'excitement', 'stability']

    # Read the JSON dataset.
    with open(base + '.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Build every row first and create the frame once: growing a DataFrame with
    # df.loc[len(df)] reallocates on each insert and is quadratic in the row count.
    rows = [[item['text'], *item['labels']] for item in data]
    df = pd.DataFrame(rows, columns=columns)

    # Save as CSV.
    df.to_csv(base + '.csv', index=False)

    print("Succesfully :", base + '.csv')


# Convert both dataset splits from JSON to CSV.
for split_name in ('train', 'test'):
    make_csv(split_name)

Succesfully : ./train_data/train.csv
Succesfully : ./train_data/test.csv


In [4]:
# Load the training split: the `text` column holds the inputs, every column
# after the first holds a label score.
df_train = pd.read_csv("./train_data/train.csv")

text_train = df_train["text"].values
label_columns_train = df_train.columns[1:]
labels_train = df_train[label_columns_train]
labels_train

Unnamed: 0,despair,optimism,concern,excitement,stability
0,0.7,0.2,0.8,0.1,0.2
1,0.1,0.9,0.2,0.8,0.7
2,0.2,0.6,0.6,0.9,0.5
3,0.0,0.9,0.1,0.7,0.8
4,0.3,0.7,0.7,0.6,0.4
...,...,...,...,...,...
1411,0.2,0.8,0.4,0.7,0.7
1412,0.3,0.8,0.4,0.7,0.6
1413,0.3,0.8,0.4,0.7,0.6
1414,0.9,0.1,0.8,0.1,0.2


In [5]:
# Load the test split the same way: `text` column as input, remaining columns as labels.
df_test = pd.read_csv('./train_data/test.csv')

text_test = df_test["text"].values
label_columns_test = df_test.columns[1:]
labels_test = df_test[label_columns_test]
labels_test

Unnamed: 0,despair,optimism,concern,excitement,stability
0,0.2,0.8,0.4,0.7,0.6
1,0.3,0.7,0.5,0.6,0.6
2,0.4,0.6,0.7,0.5,0.6
3,0.2,0.8,0.3,0.7,0.6
4,0.3,0.7,0.5,0.6,0.6
...,...,...,...,...,...
125,0.3,0.7,0.4,0.6,0.7
126,0.2,0.8,0.3,0.7,0.7
127,0.3,0.7,0.5,0.6,0.6
128,0.2,0.8,0.3,0.7,0.7


In [6]:
# Tokenizer belonging to the pretrained Japanese BERT checkpoint.
TOKENIZER_CHECKPOINT = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [7]:
# Show how the first training text is split into tokens and mapped to vocabulary ids.
first_text = text_train[0]
first_tokens = tokenizer.tokenize(first_text)
print(first_text)
print(first_tokens)
print(tokenizer.convert_tokens_to_ids(first_tokens))

ソニーグループ(6758.T)の株価が、期待を下回る四半期利益報告後に10%下落。市場の不安定さを反映し、業績見通しの下方修正が発表された。年内の売上予測が前年比で5%減少すると予想されている。
['ソニー', 'グループ', '(', '67', '##5', '##8', '.', 'T', ')', 'の', '株価', 'が', '、', '期待', 'を', '下回る', '四', '##半期', '利益', '報告', '後', 'に', '10', '%', '下落', '。', '市場', 'の', '不安定', 'さ', 'を', '反映', 'し', '、', '業績', '見通し', 'の', '下方', '修正', 'が', '発表', 'さ', 'れ', 'た', '。', '年内', 'の', '売上', '予測', 'が', '前年', '比', 'で', '5', '%', '減少', 'する', 'と', '予想', 'さ', 'れ', 'て', 'いる', '。']
[6369, 1091, 23, 5815, 28498, 28501, 143, 260, 24, 5, 17059, 14, 6, 3252, 11, 19997, 755, 17555, 5161, 1888, 83, 7, 121, 648, 18369, 8, 2304, 5, 8499, 26, 11, 6256, 15, 6, 6624, 15887, 5, 20659, 4971, 14, 602, 26, 20, 10, 8, 26334, 5, 6446, 7055, 14, 3065, 701, 12, 76, 648, 2643, 34, 13, 4663, 26, 20, 16, 33, 8]


In [8]:
# Longest tokenized text per split. Each corpus is tokenized exactly once and the
# result reused, instead of re-tokenizing everything again to derive max_len.
max_train_len = max(len(tokenizer.tokenize(t)) for t in text_train)
print('max train len : ', max_train_len)
max_test_len = max(len(tokenizer.tokenize(t)) for t in text_test)
print('max test len : ', max_test_len)

# +2 leaves room for the two special tokens the tokenizer adds around each text
# when encoding with add_special_tokens=True.
max_len = max(max_train_len, max_test_len) + 2
print(max_len)

max train len :  186
max test len :  192
194


In [9]:

# Accumulators that mask_ids() fills with one tensor per text.
input_ids_train, attention_masks_train = [], []
input_ids_test, attention_masks_test = [], []

# Padded sequence length: the longest tokenized text across both splits, plus 2.
longest_test = max(len(tokenizer.tokenize(t)) for t in text_test)
longest_train = max(len(tokenizer.tokenize(t)) for t in text_train)
max_len = int(max(longest_test, longest_train)) + 2

def mask_ids(text, input_ids, attention_masks):
    """Encode each text and append its token ids and attention mask in place.

    Relies on the module-level ``tokenizer`` and ``max_len``; every text is
    padded to ``max_len``.

    Args:
        text: Iterable of strings to encode.
        input_ids: List that receives one ``input_ids`` tensor per text.
        attention_masks: List that receives one ``attention_mask`` tensor per text.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    for sentence in text:
        encoding = tokenizer(sentence, **encode_options)
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
        
# Encode both splits into the accumulator lists.
mask_ids(text_train, input_ids_train, attention_masks_train)
mask_ids(text_test, input_ids_test, attention_masks_test)

# Concatenate the per-text tensors along the first (batch) dimension.
input_ids_train = torch.cat(input_ids_train, dim=0)
input_ids_test = torch.cat(input_ids_test, dim=0)

attention_masks_train = torch.cat(attention_masks_train, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)

# Cast the label scores to float32 explicitly: np.array() on the label frame
# yields float64, which would otherwise give double-precision targets next to
# the model's float32 logits.
labels_train = torch.tensor(np.array(labels_train), dtype=torch.float32)
labels_test = torch.tensor(np.array(labels_test), dtype=torch.float32)


# Print the first text together with its input ids and labels.
print(text_train[0])
print(input_ids_train[0])
print(labels_train[0])

ソニーグループ(6758.T)の株価が、期待を下回る四半期利益報告後に10%下落。市場の不安定さを反映し、業績見通しの下方修正が発表された。年内の売上予測が前年比で5%減少すると予想されている。
tensor([    2,  6369,  1091,    23,  5815, 28498, 28501,   143,   260,    24,
            5, 17059,    14,     6,  3252,    11, 19997,   755, 17555,  5161,
         1888,    83,     7,   121,   648, 18369,     8,  2304,     5,  8499,
           26,    11,  6256,    15,     6,  6624, 15887,     5, 20659,  4971,
           14,   602,    26,    20,    10,     8, 26334,     5,  6446,  7055,
           14,  3065,   701,    12,    76,   648,  2643,    34,    13,  4663,
           26,    20,    16,    33,     8,     3,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,  

In [10]:
train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# Hold out 10% of the training examples for validation.
train_size = int(0.9 * len(train_dataset))
valid_size = len(train_dataset) - train_size

# Seed the split so the same examples end up in the validation set on every run;
# without it, validation scores from different runs are not comparable.
SPLIT_SEED = 42
train_dataset, valid_dataset = random_split(
    train_dataset,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print('train size', train_size)
print('valid size ', valid_size)

batch_size = 32

# Training batches are drawn in random order each epoch.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

# Validation and test batches keep a fixed order.
valid_dataloader = DataLoader(
    valid_dataset,
    sampler=SequentialSampler(valid_dataset),
    batch_size=batch_size
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False
)

train size 1274
valid size  142


In [11]:
# Pretrained Japanese BERT with a freshly initialised 5-output classification head.
model = BertForSequenceClassification.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking',
    num_labels = 5,
    # The labels are five independent float scores, so state the multi-label
    # objective explicitly instead of relying on it being inferred from the
    # label dtype at the first forward pass.
    problem_type = "multi_label_classification",
    output_attentions = False,
    output_hidden_states = False
)

# Move to the device selected at the top of the notebook. model.cuda() would
# raise on a machine without CUDA even though `device` falls back to the CPU.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
def calculate_accuracy(model, dataloader, device):
    """Return the share of examples whose top predicted label equals the top true label.

    For each example the index of the largest logit is compared with the index
    of the largest label score; the result is the fraction of matches.

    Args:
        model: Callable returning an object with a ``logits`` attribute when
            called with ``(input_ids, token_type_ids=..., attention_mask=...)``.
            It is switched to eval mode and left there.
        dataloader: Iterable of batches ``(input_ids, attention_mask, labels)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when the dataloader yields
        no examples (instead of raising ``ZeroDivisionError``).
    """
    model.eval()  # evaluation mode
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Only the logits are needed, so the labels are not passed to the
            # model; this avoids computing a loss that is never used.
            logits = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask
            ).logits

            # Softmax is monotonic, so taking argmax of the raw logits selects
            # the same class without the extra normalisation step.
            predicted_labels = torch.argmax(logits, dim=1)
            true_labels = torch.argmax(b_labels, dim=1)

            # Count the matching predictions.
            correct_predictions += (predicted_labels == true_labels).sum().item()
            total_predictions += b_labels.size(0)

    if total_predictions == 0:
        return 0.0
    return correct_predictions / total_predictions


In [13]:
# AdamW over all model parameters with a small fine-tuning learning rate.
LEARNING_RATE = 5e-6
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

def save_model(model, optimizer, accuracy, file_path="./model/"):
    """Save model and optimizer state to ``<file_path>bert_<accuracy>.pt``.

    The accuracy is formatted with four decimals and the dot replaced by an
    underscore, e.g. ``0.9567`` becomes ``bert_0_9567.pt``.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model_state_dict'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        accuracy: Numeric score embedded in the file name.
        file_path: Prefix prepended verbatim to the file name; use a trailing
            slash to denote a directory.

    Returns:
        The path the checkpoint was written to.
    """
    accuracy_tag = f"{accuracy:.4f}".replace('.', '_')
    full_path = f"{file_path}bert_{accuracy_tag}.pt"

    # torch.save does not create missing directories, so make sure the target
    # directory exists before writing (a fresh checkout has no ./model/).
    target_dir = os.path.dirname(full_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, full_path)
    print(f"Model saved to {full_path}")
    return full_path
    

def load_model(model, optimizer, file_path, map_location=None):
    """Restore model and optimizer state from a checkpoint written by ``save_model``.

    Args:
        model: Module that receives ``checkpoint['model_state_dict']``.
        optimizer: Optimizer that receives ``checkpoint['optimizer_state_dict']``.
        file_path: Path of the checkpoint file.
        map_location: Device the stored tensors are mapped to while loading.
            Defaults to the device of the model's first parameter (CPU for a
            parameterless model), so a checkpoint saved on a GPU can also be
            loaded on a CPU-only machine.
    """
    if map_location is None:
        first_param = next(model.parameters(), None)
        map_location = first_param.device if first_param is not None else "cpu"

    # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
    checkpoint = torch.load(file_path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Model loaded from {file_path}")

def train(model, optimizer, dataloader=None, device=None):
    """Run one training epoch and return the summed batch loss.

    Args:
        model: Module that returns an object with a ``loss`` attribute when
            called with input ids, attention mask and labels.
        optimizer: Optimizer stepped once per batch.
        dataloader: Iterable of batches ``(input_ids, attention_mask, labels)``.
            Defaults to the module-level ``train_dataloader``.
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter.

    Returns:
        Sum of the per-batch losses. This is a sum, not a mean, so it scales
        with the number of batches.
    """
    if dataloader is None:
        dataloader = train_dataloader
    if device is None:
        device = next(model.parameters()).device

    model.train()
    train_loss = 0

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()

        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        loss = outputs.loss
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item()

    return train_loss


def test(model):
    model.eval()
    test_loss = 0
    
    with torch.no_grad():
        for batch in valid_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels
            )
            loss = outputs.loss
            logits = outputs.logits
            test_loss += loss.item()
    
    return test_loss


In [37]:

max_epoch = 10
train_loss_ = []
test_loss_ = []

for epoch in range(max_epoch):
    train_ = train(model, optimizer)
    train_loss_.append(train_)

    # Record the validation loss after every epoch so test_loss_ lines up with
    # train_loss_ and over-fitting becomes visible.
    test_ = test(model)
    test_loss_.append(test_)
    print(f"epoch {epoch + 1}/{max_epoch}  train loss: {train_:.4f}  valid loss: {test_:.4f}")

# Score and checkpoint the final weights once training has finished. Doing this
# after the loop (rather than inside an "is this the last epoch" branch) keeps
# valid_accuracy defined even when max_epoch is 0.
valid_accuracy = calculate_accuracy(model, valid_dataloader, device)
save_model(model, optimizer, valid_accuracy)  # file name carries the validation accuracy

# The checkpoint just written holds exactly the in-memory weights, so there is
# no need to load it back before evaluating.
train_accuracy = calculate_accuracy(model, train_dataloader, device)
test_accuracy = calculate_accuracy(model, test_dataloader, device)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

KeyboardInterrupt: 

In [14]:
def predict_text_probabilities(model, tokenizer, text, device, max_length=None, multi_label=False):
    """Score a text with the model and return the per-label scores as a NumPy array.

    Args:
        model: Callable returning an object with a ``logits`` attribute when
            called with the tokenizer output as keyword arguments. It is
            switched to eval mode and left there.
        tokenizer: Callable that turns ``text`` into a mapping of tensors.
        text: The text to score.
        device: Device the input tensors are moved to.
        max_length: Truncation length passed to the tokenizer. Defaults to the
            module-level ``max_len``.
        multi_label: When ``False`` (default) the logits are normalised with
            softmax, so the scores sum to 1. When ``True`` an element-wise
            sigmoid is applied instead, giving one independent score per label;
            use this for a model fine-tuned with independent per-label targets.

    Returns:
        The scores with singleton dimensions squeezed out, on the CPU.
    """
    if max_length is None:
        max_length = max_len

    # Tokenize the text and move the tensors to the target device.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Evaluation mode.
    model.eval()

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
        if multi_label:
            probabilities = torch.sigmoid(logits)
        else:
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Convert the scores to a NumPy array.
    return probabilities.squeeze().cpu().numpy()

# Earlier checkpoints, kept for reference.
modelname_1, modelname_2 = (
    './model/bert_0_9261.pt',
    './model/bert_0_9391.pt',
)

# This checkpoint performs quite well.
modelname = './model/bert_0_9567.pt'
load_model(model, optimizer, modelname)

# Usage example. Output order follows the label columns:
# ['text', 'despair', 'optimism', 'concern', 'excitement', 'stability']
text_to_classify = "彼のプレゼンテーションは期待外れだった。"
probabilities = predict_text_probabilities(
    model=model,
    tokenizer=tokenizer,
    text=text_to_classify,
    device=device,
)
print(probabilities)



Model loaded from ./model/bert_0_9567.pt
[0.38150236 0.05312215 0.38559145 0.05664619 0.12313791]


In [15]:
# Sample input: a Reuters report on a fatal helicopter crash.
text = (
    '［ドバイ　２０日　ロイター］ -     イラン当局者は２０日、ライシ大統領とアブドラヒアン外相が、搭乗していたヘリコプター墜落で死亡したとロイターに述べた。'
)

predict = predict_text_probabilities(model=model, tokenizer=tokenizer, text=text, device=device)
print(predict)

[0.36090022 0.06583396 0.42395124 0.0684452  0.08086935]


In [16]:
# Sample input: a breaking-news headline about NVIDIA's earnings.
# Output order follows the label columns (after 'text'):
# ['text', 'despair', 'optimism', 'concern', 'excitement', 'stability']
text = (
    '【速報】世界が注目するNVIDIA(エヌビディア)が決算発表「最終的な利益 前年比7.3倍2兆3300億円」勢い止まらず'
)

predict = predict_text_probabilities(model=model, tokenizer=tokenizer, text=text, device=device)
print(predict)

[0.04141054 0.4326825  0.06948418 0.22149023 0.23493247]
