In [None]:
# Mount Google Drive at /content/drive; the notebook reads and writes its
# CSV/model files under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Load the crawled fault-ratio data.
origin_df = pd.read_csv('/content/drive/MyDrive/big_aivle/fault_ratio_crawling.csv')

print('Before Shape : ', origin_df.shape)

# Rescale the regression target by 1/100
# (presumably percent -> fraction; confirm against the crawler output).
origin_df['chungu_bi'] = origin_df['chungu_bi']/100

# Remove invalid / NaN rows.
# dropna() runs while 'pi_bi' is still present, so rows that are missing only
# 'pi_bi' are removed as well; the column itself is dropped afterwards because
# it is not used downstream (scaling it first was dead work and was removed).
origin_df = origin_df.dropna()
origin_df = origin_df.drop(index=[2225, 621, 622, 398])  # manually identified invalid rows
origin_df = origin_df.drop(columns='pi_bi')

print('After Shape : ', origin_df.shape)

# Shuffle the dataset.
# The shuffled frame used to be overwritten by `final_df = origin_df`, so the
# split was taken in file order. Keep the shuffled frame and fix the seed so
# the split is reproducible across runs.
SPLIT_SEED = 71
final_df = origin_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# 80 / 10 / 10 train / validation / test split by position.
n_rows = len(final_df)
train_end = round(n_rows*0.8)
val_end = round(n_rows*0.9)

train_df = final_df.iloc[:train_end]
val_df = final_df.iloc[train_end:val_end]
test_df = final_df.iloc[val_end:]

print('Train Shape : ', train_df.shape)

print('Validation Shape : ', val_df.shape)


print('Test Shape : ', test_df.shape)

Before Shape :  (2316, 3)
After Shape :  (2311, 2)
Train Shape :  (1849, 2)
Validation Shape :  (231, 2)
Test Shape :  (231, 2)


In [None]:
# Per-column count of missing values remaining in the cleaned frame.
origin_df.isna().sum()

content      0
chungu_bi    0
dtype: int64

In [None]:
# Save the train / validation / test splits as CSV files (without the index column).

split_outputs = {
    '/content/drive/MyDrive/big_aivle/fault_ratio_train_df.csv': train_df,
    '/content/drive/MyDrive/big_aivle/fault_ratio_val_df.csv': val_df,
    '/content/drive/MyDrive/big_aivle/fault_ratio_test_df.csv': test_df,
}

for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Install the required packages.
# NOTE(review): only accelerate is version-pinned below; the other installs
# resolve to whatever version is current at run time.

!pip install transformers datasets
!pip install evaluate
# KoBERT tokenizer, installed from the kobert_hf subdirectory of the SKTBrain/KoBERT repository.
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip install sentencepiece

!pip install accelerate==0.20.1

In [None]:
from datasets import load_dataset

# Build the dataset used for training.
# Point at the split files this notebook actually writes
# (fault_ratio_{train,val,test}_df.csv). The previous paths (train_df.csv, ...)
# referred to files the notebook never creates, so a fresh run either failed
# or silently trained on stale data.

data_path = {
    'train': '/content/drive/MyDrive/big_aivle/fault_ratio_train_df.csv',
    'validation': '/content/drive/MyDrive/big_aivle/fault_ratio_val_df.csv',
    'test': '/content/drive/MyDrive/big_aivle/fault_ratio_test_df.csv',
}

dataset = load_dataset('csv', data_files=data_path)

print(dataset)


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-246302159bcec47c/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-246302159bcec47c/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'chungu_bi'],
        num_rows: 1849
    })
    validation: Dataset({
        features: ['content', 'chungu_bi'],
        num_rows: 231
    })
    test: Dataset({
        features: ['content', 'chungu_bi'],
        num_rows: 231
    })
})


In [None]:
# Declare the tokenizer and apply it to every split
from kobert_tokenizer import KoBERTTokenizer

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')


def tokenize_function(examples):
    """Tokenize the "content" field of a batch.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    encoded = tokenizer(
        examples["content"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded


# Tokenize in batches, then rename the target column:
# the column must be called "labels" (plural), not "label".
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("chungu_bi", "labels")
)

Map:   0%|          | 0/1849 [00:00<?, ? examples/s]

Map:   0%|          | 0/231 [00:00<?, ? examples/s]

Map:   0%|          | 0/231 [00:00<?, ? examples/s]

In [None]:
# Declare the training hyper-parameters

from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=50,             # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=10,   # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for the learning-rate scheduler
    weight_decay=0.005,              # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=200,               # how often to print logs
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",     # evaluate after each epoch
    gradient_accumulation_steps=64,  # batches accumulated per optimizer step (1 x 64 = effective batch of 64 per device)
    fp16=True,                       # use mixed precision - CUDA only
    fp16_opt_level="O2",             # Apex mixed-precision level; letter "O", not the digit zero ("02" is not a valid level)
    run_name="big_project",          # experiment name
    seed=71                          # seed for experiment reproducibility
)

In [None]:
# 학습용 Metric 설정

import evaluate
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for the Trainer's evaluation loop.

    Args:
        eval_pred: a (predictions, labels) pair of array-likes holding one
            value per evaluated sample.

    Returns:
        dict with the keys "mse", "rmse", "mae", "r2" and "smape"
        (sMAPE is expressed in percent).
    """
    logits, labels = eval_pred
    # Force BOTH arrays into column vectors. Reshaping only the labels would
    # let 1-D predictions broadcast against (N, 1) labels into an (N, N)
    # matrix in the sMAPE term below.
    logits = np.asarray(logits).reshape(-1, 1)
    labels = np.asarray(labels).reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    # Derive RMSE from MSE instead of passing `squared=False`, which newer
    # scikit-learn releases deprecate and then remove.
    rmse = float(np.sqrt(mse))
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)

    # sMAPE: mean of 2*|pred - label| / (|label| + |pred|), in percent.
    # A sample whose label and prediction are both 0 has a zero denominator;
    # count it as zero error instead of letting a NaN poison the mean.
    abs_err = np.abs(logits - labels).astype(np.float64)
    denom = (np.abs(labels) + np.abs(logits)).astype(np.float64)
    ratio = np.divide(2.0 * abs_err, denom, out=np.zeros_like(denom), where=denom != 0)
    smape = float(ratio.mean() * 100)

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [None]:
# Set up the model and the Trainer used for fine-tuning

import torch
from transformers import BertModel,  BertForSequenceClassification

# num_labels=1 gives the sequence-classification head a single output,
# which is used here as the predicted 'chungu_bi' value.
model = BertForSequenceClassification.from_pretrained(
    'skt/kobert-base-v1',
    num_labels=1,
)

# Train on the 'train' split, evaluate on the 'validation' split, and report
# the regression metrics defined in compute_metrics_for_regression.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics_for_regression,
)
trainer = Trainer(**trainer_kwargs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training: run the Trainer's training loop with the arguments configured above.

trainer.train()



In [None]:
# Evaluation: called without arguments, so it uses the eval_dataset given to the Trainer (the 'validation' split).

trainer.evaluate()

{'eval_loss': 0.07656465470790863,
 'eval_mse': 0.07656465470790863,
 'eval_rmse': 0.2767031788825989,
 'eval_mae': 0.21590591967105865,
 'eval_r2': 0.01901749711655376,
 'eval_smape': 57.3897329883658,
 'eval_runtime': 0.758,
 'eval_samples_per_second': 304.737,
 'eval_steps_per_second': 31.661,
 'epoch': 48.46}

In [None]:
# Save the training log and the fine-tuned model

import pickle

# Persist the Trainer's log history as a pickle file.
log_history = trainer.state.log_history
with open("/content/drive/MyDrive/big_aivle/log_baseline_0627.pickle", "wb") as log_file:
    pickle.dump(log_history, log_file)

# Write the model to Drive so it can be reloaded later with from_pretrained().
trainer.save_model('/content/drive/MyDrive/big_aivle/aivle_model_0627_1024_shuffle')


# 학습된 모델 불러오기

In [None]:
from transformers import BertForSequenceClassification
import torch

# Reload the fine-tuned model from the directory this notebook writes with
# trainer.save_model(...). The previous path ('aivle_model_shuffle') did not
# match the saved directory, so the test evaluation ran on a different or
# missing checkpoint.
trained_model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/big_aivle/aivle_model_0627_1024_shuffle")

In [None]:
# Compute the RMSE on the test split
import numpy as np

inputs = tokenized_datasets['test']

# Derive the sample count from the data instead of hard-coding 231.
total_n = len(inputs)

# Take the targets from the same dataset that supplies the model inputs, so
# predictions and labels cannot get out of sync with an in-memory DataFrame.
targets = np.asarray(inputs['labels'], dtype=np.float64)

lst = []
trained_model.eval()
# Inference only: disable autograd so no computation graph is kept per sample.
with torch.no_grad():
    for i in range(total_n):
        inputs_cur = inputs[i:i+1]  # one-sample batch
        out = trained_model(
            input_ids=torch.tensor(inputs_cur['input_ids']),
            attention_mask=torch.tensor(inputs_cur['attention_mask']),
        )

        # logits has shape (1, 1) for a one-sample batch with a single output.
        lst.append(out['logits'].numpy()[0][0])


del inputs


print('RMSE of Test : ', np.sqrt(np.mean((np.array(lst) - targets) ** 2)))
