## 모델 로드 및 실행 노트북

In [4]:
"""main func for training"""
import os
import re
import time
import random
import pprint
import datetime

import ujson
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader

In [5]:
import config as cfg

import model as m
import utils
import dataset

I0511 14:42:47.740202 139623672854272 file_utils.py:41] PyTorch version 1.4.0 available.


## config
- INPUT_BASE는 dev42.csv 를 가지고 있음
    - dev42.csv는 mkdev.py를 통해 생성된 파일이며, 다 귀찮고 결과 파일이 필요할 경우 그에게 문의

In [13]:
cfg.MODEL_TYPE, cfg.MAX_SEQ_LEN

('roberta', 160)

In [18]:
os.listdir(cfg.INPUT_BASE)

['train42.csv',
 'test.csv',
 'train_folds.csv.zip',
 'tweet-sentiment-extraction.zip',
 'dev42.csv',
 'train_folds.csv',
 'train.csv',
 'train_folds_v2_num2zero.csv',
 'sample_submission.csv',
 '.ipynb_checkpoints']

## Global params
- MODEL_PATH ; 학습 weight 경로 (model_x.pt)

In [21]:
GPU_ID = 0
MODEL_PATH = '/DATA/image-search/kgg/best/normal_29105s_160t_16b_3e-5lr_roberta'

VERBOSE = True

In [15]:
os.listdir(MODEL_PATH)

['dataset-metadata.json',
 'model_1.pt',
 'config.py',
 'res_29105_roberta_score_0.7123.json',
 'model_4.pt',
 'model_2.pt',
 'model_3.pt',
 'model_0.pt']

In [10]:
if torch.cuda.is_available():
    device = torch.device('cuda:{}'.format(GPU_ID))
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(GPU_ID))
else:
    print('no gpus available')
    device = torch.device('cpu')

2
Tesla P100-PCIE-16GB


## Test
- 폴더명에서 메타정보 추출

In [20]:
max_seq_len = int(MODEL_PATH.split('t_')[-2].split('_')[-1])

model_path = MODEL_PATH

['//DATA/image-search/kgg/best/normal_29105s_160t_16b_3e-5lr_roberta']

In [25]:
finished = sum([1 for path in os.listdir(model_path) if 'res' in path]) > 0
if not finished:
    print('not finished yet', model_path)

if finished:
    for path in os.listdir(model_path):
        if 'res' in path:
            print('CV -> ', path)

CV ->  res_29105_roberta_score_0.7123.json


### Model 로드 (K-fold)

In [26]:
models = []

print('Cur model path:', model_path)

root = cfg.INPUT_BASE
test_data = pd.read_csv('{}dev42.csv'.format(root))

test_data['text'] = test_data.apply(lambda row: str(row.text).strip(), axis=1)

electra = cfg.MODEL_STR[cfg.ELECTRA]
roberta = cfg.MODEL_STR[cfg.ROBERTA]
if electra in model_path:
    model_type = electra
elif roberta in model_path:
    model_type = roberta

for i in range(cfg.K_FOLD):
    if 'normal' in model_path:
        print('normal model')
        _model = m.SentimentExtractor(model=model_type, device=device)
    elif 'cnn' in model_path:
        print('cnn model')
        _model = m.SentimentExtractorCNN(model=model_type, device=device)
    _model.to(device)
    _model.load_state_dict(torch.load(f'{model_path}/model_{i}.pt'))
    _model.eval()
    models.append(_model)

I0511 14:59:10.858190 139623672854272 configuration_utils.py:284] loading configuration file ./roberta/roberta-base-config.json
I0511 14:59:10.859737 139623672854272 configuration_utils.py:322] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "nu

Cur model path: //DATA/image-search/kgg/best/normal_29105s_160t_16b_3e-5lr_roberta
normal model


I0511 14:59:16.418360 139623672854272 configuration_utils.py:284] loading configuration file ./roberta/roberta-base-config.json
I0511 14:59:16.419954 139623672854272 configuration_utils.py:322] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "nu

normal model


I0511 14:59:21.848900 139623672854272 configuration_utils.py:284] loading configuration file ./roberta/roberta-base-config.json
I0511 14:59:21.850666 139623672854272 configuration_utils.py:322] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "nu

normal model


I0511 14:59:27.681416 139623672854272 configuration_utils.py:284] loading configuration file ./roberta/roberta-base-config.json
I0511 14:59:27.683131 139623672854272 configuration_utils.py:322] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "nu

normal model


I0511 14:59:33.436320 139623672854272 configuration_utils.py:284] loading configuration file ./roberta/roberta-base-config.json
I0511 14:59:33.438283 139623672854272 configuration_utils.py:322] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "nu

normal model


### 토크나이저 사용전 초기화

In [27]:
m.init_tokenizer()

In [28]:
test_dataset = dataset.TweetDataset(
    tweet=test_data.text.values,
    sentiment=test_data.sentiment.values,
    selected_text=test_data.selected_text.values,
    tokenizer=m.tokenizer,
    max_seq_len=max_seq_len,
    model_type=model_type,
)

## Dev셋에 대한 스코어 구하기!
- VERBOSE True일 경우, 1.0 점 미만인 경우 출력
- 후처리 로직은 아래 로직에 추가하여, 스코어 상승 여부 확인 가능

In [31]:
id_list = []
answer = []
sentiments = ['positive', 'negative', 'neutral']

scores = []

with torch.no_grad():
    for idx, d in enumerate(test_dataset):

        if idx%100 == 0:
            print('[{}]'.format(idx))

        uniq_id =  test_data.textID.iloc[idx]
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy()

        ids = torch.unsqueeze(ids, dim=0).to(device, dtype=torch.long)
        token_type_ids = torch.unsqueeze(token_type_ids, dim=0).to(device, dtype=torch.long)
        mask = torch.unsqueeze(mask, dim=0).to(device, dtype=torch.long)

        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        c = [] # sentiment classification
        s = [] # start idx
        e = [] # end idx
        for _model in models:
            start_logits, end_logits, _ = _model(ids,
                token_type_ids=token_type_ids,
                attention_mask=mask)
            s.append(start_logits)
            e.append(end_logits)

        s_merged_logits = sum(s)/len(s)
        e_merged_logits = sum(e)/len(e)

        outputs_start = torch.softmax(s_merged_logits, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(e_merged_logits, dim=1).cpu().detach().numpy()

        idx_start = np.argmax(outputs_start[0, :])
        idx_end = np.argmax(outputs_end[0, :])

        score, output_sentence = utils.calculate_jaccard_score(
            original_tweet=orig_tweet,
            target_string=orig_selected,
            sentiment_val=sentiment,
            idx_start=idx_start,
            idx_end=idx_end,
            offsets=offsets
        )

        scores.append(score)

        if VERBOSE and sentiment != 'neutral' and score < 1.0:
            print()
            print(uniq_id, '({}, {})'.format(sentiment, score))
            print(orig_tweet)
            print('Answ:', orig_selected)
            print('Pred:', output_sentence)

        id_list.append(uniq_id)
        answer.append(output_sentence)
    print('=> avg score:', sum(scores)/len(scores))
    print('------------------------------------------------------')

[0]

3d9d4b0b55 (negative, 0.2222222222222222)
 i donbt like to peel prawns, i also dont like going shopping, running out of money and crawling round the car looking for more
Answ:  dont like go
Pred:  i donbt like to peel prawns, i also dont like

bffa3ddd61 (negative, 0.5)
 i miss you bby wish you were going tomorrow to make me do good.
Answ:  i miss you bby
Pred:  i miss

401869d615 (negative, 0.0)
 graduation is done im a little sad.. anyone want to hang out???
Answ:  sad.
Pred:  sad..

5f93cc70ff (negative, 0.2222222222222222)
 Bugger. forgot I still have washing in my machine
Answ:  Bugger. forgot I still have washing in my machine
Pred:  Bugger. forgot

e4e9b8713a (negative, 0.2)
 My back hurts...really bad
Answ:  .really bad
Pred:  My back hurts...really bad

9d3a1e0269 (negative, 0.25)
 really hopes her car`s illness is not terminal...
Answ:  illness
Pred:  illness is not terminal...

a0a306868a (negative, 0.0)
 lost my tooth 2day whilst i was eating gum...oww
Answ:  oww
Pred:

KeyboardInterrupt: 