In [1]:
!pip install transformers
!pip install nlp

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.7MB/s 
[?25hCollecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 14.2MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 27.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz 

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive (interactive: asks for an authorization code).
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AdamW

import nlp

  import pandas.util.testing as tm


In [4]:
def seed_set(seed):
    """
    Fix the random seeds of every library used in this notebook.

    Parameters
    --------------
    seed : int
        Seed applied to `random`, NumPy and PyTorch (CPU and, when available,
        CUDA); also exported as the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Seed the CPU-side generators with the same value.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    # Seed the CUDA generators (current device, then every device).
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # https://qiita.com/chat-flip/items/c2e983b7f30ef10b91f6
    torch.backends.cudnn.deterministic = True  # request deterministic cuDNN behaviour
    torch.backends.cudnn.benchmark = False

In [5]:
# Report the name of the active CUDA device; prints nothing when no GPU is available.
if torch.cuda.is_available():
    current_device = torch.cuda.current_device()
    print("Device:", torch.cuda.get_device_name(current_device))

Device: Tesla P100-PCIE-16GB


## Config

In [6]:
# Switch to the project folder on the mounted Drive; the relative paths used
# in this notebook (./dataset, ./models, ./figures, ./result, ./output) resolve from here.
%cd /content/drive/My\ Drive/fueki/text/

/content/drive/My Drive/fueki/text


In [7]:
SEED = 0  # seed passed to seed_set() and used as random_state of the fold split
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # first GPU when available, otherwise CPU
TRAIN_FILE = "./dataset/train.csv"  # labelled training data
TEST_FILE = "./dataset/test.csv"  # rows to predict at inference time
MODELS_DIR = "./models/"  # directory receiving the best checkpoint of each fold
MODEL_NAME = 'bert-base-uncased'  # pretrained checkpoint used for both tokenizer and encoder
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 128  # also used for the test dataloader
NUM_CLASSES = 4  # size of the classifier's output layer
EPOCHS = 5  # training epochs per fold
NUM_SPLITS = 5  # number of cross-validation folds

In [8]:
# Fix all random seeds before any data handling or model construction.
seed_set(SEED)

## Dataset

In [21]:
# Peek at the raw training data (columns: id, description, jobflag).
df = pd.read_csv(TRAIN_FILE)
df.head()

Unnamed: 0,id,description,jobflag
0,0,"Executes and writes portions of testing plans,...",2
1,1,Maintain Network Performance by assisting with...,3
2,2,Supports the regional compliance manager with ...,4
3,3,Keep up to date with local and national busine...,1
4,4,Assist with Service Organization Control (SOC)...,4


In [22]:
def df_preprocessing(csv_path, original_labels='jobflag'):
    """
    Load a CSV file and attach integer-encoded class labels.

    Parameters
    --------------
    csv_path : str
        Path of the CSV file to read.
    original_labels : str
        Name of the column holding the raw labels.

    Returns
    --------------
    pandas.DataFrame
        The loaded frame where the raw label column has been replaced by an
        'original_labels' column, plus a 'labels' column with the
        LabelEncoder codes of those raw labels.
    """
    frame = pd.read_csv(csv_path)

    # Outlier removal (placeholder: nothing is removed yet)

    # Label encoding: keep the raw values under a fixed column name,
    # then derive the encoded 'labels' column from them.
    frame = (
        frame
        .assign(original_labels=frame[original_labels])
        .drop(columns=original_labels)
    )
    encoder = LabelEncoder()
    frame['labels'] = encoder.fit_transform(frame['original_labels'])

    return frame

def make_folded_df(csv_path, num_splits=5, seed=SEED):
    """
    Load the training CSV and assign every row to a stratified CV fold.

    Parameters
    --------------
    csv_path : str
        Path of the CSV file, forwarded to df_preprocessing().
    num_splits : int
        Number of folds.
    seed : int
        random_state of the shuffled StratifiedKFold split.

    Returns
    --------------
    pandas.DataFrame
        Output of df_preprocessing() with an extra 'kfold' column holding the
        index of the fold in which the row belongs to the validation part.
    """
    df = df_preprocessing(csv_path)
    label = df['labels'].tolist()

    # NaN-initialised float array, so the resulting column keeps the float
    # dtype that the per-row assignment used to produce.
    fold_ids = np.full(len(df), np.nan)

    skfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
    for fold, (_, valid_indexes) in enumerate(skfold.split(range(len(label)), label)):
        # One vectorised write per fold instead of one .iloc write per row.
        fold_ids[valid_indexes] = fold

    df['kfold'] = fold_ids

    return df

# Re-seed, then build the fold-annotated training frame (default 5 folds) and preview it.
seed_set(SEED)
df = make_folded_df(TRAIN_FILE)
df.head()

Unnamed: 0,id,description,original_labels,labels,kfold
0,0,"Executes and writes portions of testing plans,...",2,1,0.0
1,1,Maintain Network Performance by assisting with...,3,2,3.0
2,2,Supports the regional compliance manager with ...,4,3,0.0
3,3,Keep up to date with local and national busine...,1,0,4.0
4,4,Assist with Service Organization Control (SOC)...,4,3,3.0


In [23]:
# Show the mapping between raw labels and their encoded values.
df[['original_labels', 'labels']].drop_duplicates()

Unnamed: 0,original_labels,labels
0,2,1
1,3,2
2,4,3
3,1,0


In [24]:
def make_dataset(df, tokenizer, device=DEVICE):
    """
    Convert a DataFrame into a tokenized `nlp.Dataset` that yields torch tensors.

    Parameters
    --------------
    df : pandas.DataFrame
        Must contain a 'description' column (text) and a 'labels' column.
    tokenizer : callable
        Tokenizer applied to each description; called with
        padding="max_length", truncation=True, max_length=128.
    device : torch.device
        Device on which the formatted tensors are placed.

    Returns
    --------------
    nlp.Dataset
        Dataset formatted as torch tensors exposing 'input_ids',
        'token_type_ids', 'attention_mask' and 'labels'.
    """
    dataset = nlp.Dataset.from_pandas(df)
    # Tokenize the 'description' field, padded/truncated to a fixed length of 128.
    dataset = dataset.map(
        lambda example: tokenizer(example["description"],
                                  padding="max_length",
                                  truncation=True,
                                  max_length=128))
    # Use the `device` argument; previously the global DEVICE was passed here,
    # which silently ignored whatever the caller asked for.
    dataset.set_format(type='torch',
                       columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
                       device=device
                       )
    return dataset

# Load the tokenizer matching MODEL_NAME and tokenize the whole training frame once.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
dataset = make_dataset(df, tokenizer)

HBox(children=(FloatProgress(value=0.0, max=2931.0), HTML(value='')))




## Model

In [25]:
class Classifier(nn.Module):
    """
    Pretrained encoder followed by dropout and a linear classification head.

    Parameters
    --------------
    num_classes : int
        Number of output logits.
    """
    def __init__(self, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.1)
        # Take the width from the loaded config rather than hard-coding 768,
        # so a different MODEL_NAME does not need a code change here.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the logits computed from the hidden state at sequence position 0."""
        encoded = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids)
        # Index the first element instead of unpacking exactly two values, so the
        # code does not depend on how many items the encoder returns.
        output = encoded[0]
        output = output[:, 0, :]
        output = self.dropout(output)
        output = self.linear(output)
        return output

## 学習

In [26]:
def train_fn(dataloader, model, criterion, optimizer, device, epoch):
    """
    Train `model` for one epoch.

    Parameters
    --------------
    dataloader : iterable of dict
        Yields batches with the keys 'input_ids', 'attention_mask',
        'token_type_ids' and 'labels'; must expose `.dataset`.
    model : torch.nn.Module
        Called as model(input_ids, attention_mask, token_type_ids).
    criterion : callable
        Loss function called as criterion(outputs, labels).
    optimizer : torch.optim.Optimizer
    device : torch.device
        Device the batch tensors are moved to before the forward pass.
    epoch : int
        Zero-based epoch index, used only for the progress-bar caption.

    Returns
    --------------
    (float, float, float)
        Mean batch loss, accuracy over len(dataloader.dataset), macro F1.
    """

    model.train()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    progress = tqdm(dataloader, total=len(dataloader))

    for i, batch in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")

        # Look tensors up by key: unpacking batch.values() positionally only
        # works while the dict happens to be ordered alphabetically.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs, labels)  # compute the loss
        _, preds = torch.max(outputs, 1)  # predicted class = index of the largest logit

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Accumulate as a plain int so the final division also works when the
        # loader yielded no batch (an int has no .double()).
        total_corrects += torch.sum(preds == labels).item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()

        progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    train_loss = total_loss / len(dataloader)
    train_acc = total_corrects / len(dataloader.dataset)
    train_f1 = f1_score(all_labels, all_preds, average="macro")

    return train_loss, train_acc, train_f1

In [27]:
def eval_fn(dataloader, model, criterion, device, epoch):
    """
    Evaluate `model` on a validation dataloader without computing gradients.

    Parameters
    --------------
    dataloader : iterable of dict
        Yields batches with the keys 'input_ids', 'attention_mask',
        'token_type_ids' and 'labels'; must expose `.dataset`.
    model : torch.nn.Module
        Called as model(input_ids, attention_mask, token_type_ids).
    criterion : callable
        Loss function called as criterion(outputs, labels).
    device : torch.device
        Device the batch tensors are moved to before the forward pass.
    epoch : int
        Zero-based epoch index, used only for the progress-bar caption.

    Returns
    --------------
    (float, float, float)
        Mean batch loss, accuracy over len(dataloader.dataset), macro F1.
    """
    model.eval()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, batch in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")

            # Look tensors up by key: unpacking batch.values() positionally only
            # works while the dict happens to be ordered alphabetically.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)

            total_loss += loss.item()
            # Plain int accumulator: no tensor method is needed after the loop.
            total_corrects += torch.sum(preds == labels).item()

            all_labels += labels.tolist()
            all_preds += preds.tolist()

            progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    valid_loss = total_loss / len(dataloader)
    valid_acc = total_corrects / len(dataloader.dataset)

    valid_f1 = f1_score(all_labels, all_preds, average="macro")

    return valid_loss, valid_acc, valid_f1

In [28]:
def _save_metric_plot(train_values, valid_values, epoch, path):
    """Draw the train/valid curves of one metric on a fresh figure and save it to `path`."""
    metric_df = pd.DataFrame({"Train": train_values,
                              "Valid": valid_values},
                             index=range(1, epoch+2))
    # Explicit figure/axes: do not draw on whatever figure is current, and close
    # it afterwards so no empty figure is left open for the notebook to display.
    fig, ax = plt.subplots()
    sns.lineplot(data=metric_df, ax=ax)
    fig.savefig(path, dpi=300)
    plt.close(fig)


def plot_training(train_losses, train_accs, train_f1s,
                  valid_losses, valid_accs, valid_f1s,
                  epoch, fold):
    """
    Save loss, accuracy and F1 learning curves for one fold as PNG files.

    Parameters
    --------------
    train_losses, train_accs, train_f1s : list of float
        Per-epoch training metrics collected so far.
    valid_losses, valid_accs, valid_f1s : list of float
        Per-epoch validation metrics collected so far.
    epoch : int
        Zero-based index of the last finished epoch; each list must hold
        epoch + 1 values (the x axis runs from 1 to epoch + 1).
    fold : int
        Fold index, embedded in the output file names.
    """
    # savefig does not create missing directories.
    os.makedirs("./figures", exist_ok=True)

    _save_metric_plot(train_losses, valid_losses, epoch,
                      f"./figures/loss_plot_fold={fold}.png")
    _save_metric_plot(train_accs, valid_accs, epoch,
                      f"./figures/acc_plot_fold={fold}.png")
    _save_metric_plot(train_f1s, valid_f1s, epoch,
                      f"./figures/f1_plot_fold={fold}.png")

In [29]:
def trainer(fold, df):
    """
    Train and validate one cross-validation fold, checkpointing the best model.

    Parameters
    --------------
    fold : int
        Rows with df.kfold == fold form the validation set; all other rows
        form the training set.
    df : pandas.DataFrame
        Frame with 'description', 'labels' and 'kfold' columns.

    Returns
    --------------
    float
        Best validation macro F1 reached over the EPOCHS epochs. The weights
        of that epoch are saved to MODELS_DIR as best_{MODEL_NAME}_{fold}.pth.
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

    model = Classifier(num_classes=NUM_CLASSES)
    model = model.to(DEVICE)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)

    train_losses = []
    train_accs = []
    train_f1s = []
    valid_losses = []
    valid_accs = []
    valid_f1s = []

    best_loss = np.inf
    best_acc = 0
    best_f1 = 0

    # torch.save does not create missing directories.
    os.makedirs(MODELS_DIR, exist_ok=True)

    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_fn(train_dataloader, model, criterion, optimizer, DEVICE, epoch)
        valid_loss, valid_acc, valid_f1 = eval_fn(valid_dataloader, model, criterion, DEVICE, epoch)
        print(f"Loss: {valid_loss}  Acc: {valid_acc}  f1: {valid_f1}  ", end="")

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)
        valid_f1s.append(valid_f1)

        plot_training(train_losses, train_accs, train_f1s,
                      valid_losses, valid_accs, valid_f1s,
                      epoch, fold)

        # Running bests. Only best_f1 drives checkpointing and is returned.
        # (best_acc was previously assigned to a misspelled name and never updated.)
        best_loss = min(best_loss, valid_loss)
        best_acc = max(best_acc, valid_acc)
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            print("model saving!", end="")
            torch.save(model.state_dict(), MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth")
        print("\n")

    return best_f1

## 学習の実行

In [30]:
# Run the full cross-validation: train one model per fold and collect the
# best validation F1 that trainer() returns for each fold.
seed_set(SEED)
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)
f1_scores = []
for fold in range(NUM_SPLITS):
    print(f"fold {fold}", "="*80)
    f1 = trainer(fold, df)
    f1_scores.append(f1)
    print(f"<fold={fold}> best score: {f1}\n")



HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=587.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))

  return function(data_struct)





HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.9313422441482544  Acc: 0.645655877342419  f1: 0.563009378401895  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8718214631080627  Acc: 0.686541737649063  f1: 0.5988524909970732  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.960471773147583  Acc: 0.6320272572402045  f1: 0.5670423123079  



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.0203007578849792  Acc: 0.6592844974446337  f1: 0.5976123213886136  



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.116522216796875  Acc: 0.645655877342419  f1: 0.5787999921477989  

<fold=0> best score: 0.5988524909970732



HBox(children=(FloatProgress(value=0.0, max=2345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=586.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8983591556549072  Acc: 0.6416382252559727  f1: 0.48247193323215404  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.9062906265258789  Acc: 0.6706484641638225  f1: 0.565953092437281  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.9928636074066162  Acc: 0.6604095563139932  f1: 0.5571529527237243  



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.0285635709762573  Acc: 0.6723549488054608  f1: 0.5896148955118384  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.323345708847046  Acc: 0.6450511945392492  f1: 0.5753726277374702  

<fold=1> best score: 0.5896148955118384



HBox(children=(FloatProgress(value=0.0, max=2345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=586.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.860054349899292  Acc: 0.6723549488054608  f1: 0.5656367372433594  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8349729776382446  Acc: 0.6928327645051194  f1: 0.5644601678168147  



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8855448603630066  Acc: 0.689419795221843  f1: 0.6038127931401198  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.9972138166427612  Acc: 0.6860068259385665  f1: 0.6115682274468761  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.137154221534729  Acc: 0.6791808873720137  f1: 0.6056803792581611  

<fold=2> best score: 0.6115682274468761



HBox(children=(FloatProgress(value=0.0, max=2345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=586.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.9220407843589783  Acc: 0.658703071672355  f1: 0.5296805316472252  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8400395274162292  Acc: 0.689419795221843  f1: 0.6053241090775494  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.9071160078048706  Acc: 0.6774744027303754  f1: 0.5996184484697487  



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.9549265027046203  Acc: 0.6757679180887372  f1: 0.6010594193186913  



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.2424732208251954  Acc: 0.6450511945392492  f1: 0.606068515071446  model saving!

<fold=3> best score: 0.606068515071446



HBox(children=(FloatProgress(value=0.0, max=2345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=586.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8866625547409057  Acc: 0.6501706484641638  f1: 0.5525459683670787  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8517239809036254  Acc: 0.658703071672355  f1: 0.5688085689444384  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 0.8900692343711853  Acc: 0.689419795221843  f1: 0.5918822972232586  model saving!



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.0090437531471252  Acc: 0.6689419795221843  f1: 0.5821095045449446  



HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Loss: 1.039434051513672  Acc: 0.6860068259385665  f1: 0.6142271728824954  model saving!

<fold=4> best score: 0.6142271728824954



<Figure size 432x288 with 0 Axes>

In [31]:
# Cross-validation score: unweighted mean of the per-fold best F1 values.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

CV: 0.6040662603819459


In [34]:
# Write one line per fold score followed by the CV mean to a text report.
report_rows = [f"fold={i}: {f1}\n" for i, f1 in enumerate(f1_scores)]
report_rows.append(f"CV    : {cv}")
lines = "".join(report_rows)
with open(f"./result/{MODEL_NAME}_result.txt", mode='w') as f:
    f.write(lines)

## 推論

In [36]:
# Rebuild one classifier per fold and load that fold's best checkpoint.
models = []
for fold in range(NUM_SPLITS):
    # Same head size as in trainer(); the class default only matched by coincidence.
    model = Classifier(num_classes=NUM_CLASSES)
    # map_location lets checkpoints saved from a GPU run load on the current DEVICE
    # (e.g. a CPU-only session).
    state_dict = torch.load(MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth", map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()
    models.append(model)

In [37]:
# Build the test dataloader with the same tokenization as training.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
# Placeholder labels: make_dataset() formats a 'labels' column, so one is added here with a dummy value.
test_df["labels"] = -1
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

HBox(children=(FloatProgress(value=0.0, max=1743.0), HTML(value='')))




In [38]:
# Ensemble inference: average the logits of all fold models and take the arg-max class.
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader))
    final_output = []

    for batch in progress:
        progress.set_description("<Test>")

        # Look tensors up by key: unpacking batch.values() positionally only
        # works while the dict happens to be ordered alphabetically.
        # The placeholder 'labels' entry is not needed here.
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']

        outputs = []
        for model in models:
            output = model(input_ids, attention_mask, token_type_ids)
            outputs.append(output)

        # Mean of the per-fold logits, then softmax, then the most probable class.
        outputs = sum(outputs) / len(outputs)
        outputs = torch.softmax(outputs, dim=1).cpu().numpy()
        outputs = np.argmax(outputs, axis=1)

        final_output.extend(outputs)

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




In [40]:
# Build the submission from the sample file (headerless CSV read as id, labels columns).
submit = pd.read_csv("./dataset/submit_sample.csv", names=["id", "labels"])
submit["labels"] = final_output
# Shift the 0-based predictions back to the original labels; this relies on the
# encoder mapping original label k -> k-1 shown in the mapping table earlier in the notebook.
submit["labels"] = submit["labels"] + 1
try:
    # Tag the file name with the leading digits of the CV score (dot removed).
    submit.to_csv("./output/submission_cv{}.csv".format(str(cv).replace(".", "")[:10]), index=False, header=False)
except NameError:
    # `cv` is undefined when the CV cell was not run in this session: fall back to a fixed name.
    submit.to_csv("./output/submission.csv", index=False, header=False)
submit.head()

Unnamed: 0,id,labels
0,2931,4
1,2932,3
2,2933,3
3,2934,1
4,2935,3
