In [18]:
# Smoke-test switch: the `if debug:` cells further down run a single
# training pass and a single validation pass only when this is True.
debug = False

In [15]:
import numpy as np 
import pandas as pd 
# import pandas_profiling as pdp 
import os 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm 
import matplotlib.pyplot as plt 
import transformers
from transformers import AdamW, AutoModel, AutoTokenizer, AutoModelForSequenceClassification, get_constant_schedule_with_warmup, get_linear_schedule_with_warmup
import random 
from sklearn.metrics import mean_squared_error
import warnings
import platform  # required by the CPU fallback message below (was missing -> NameError without CUDA)

warnings.simplefilter('ignore')

# Select the compute device once; later cells read the notebook-level `device`.
if torch.cuda.is_available():
    print('[INFO] Using GPU: {}\n'.format(torch.cuda.get_device_name()))
    device = torch.device('cuda:0')
else:
    print('\n[INFO] GPU not found. Using CPU: {}'.format(platform.processor()))
    device = torch.device('cpu')

# Gradient scaler used by the mixed-precision training step. It is explicitly
# disabled when CUDA is absent so the CPU path is a plain (unscaled) update.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

[INFO] Using GPU: GeForce RTX 3090



In [3]:
SEED = 42


def seed_everything(SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        SEED: integer seed applied to all generators.
    """
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # The auto-tuner may pick a different convolution algorithm on each run,
    # which breaks run-to-run reproducibility; it must be off when
    # `deterministic` is requested (the original set it to True).
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [4]:
# Read the three competition CSV files in one pass: training set, test set
# and the sample submission.
train, test, sample = map(
    pd.read_csv,
    (
        './input/train.csv',
        './input/test.csv',
        './input/sample_submission.csv',
    ),
)

In [5]:
# Sort by target and deal rows round-robin into 5 folds, so every fold
# receives rows from the whole target range.
train = (
    train
    .sort_values('target')
    .reset_index(drop=True)
    .assign(kfold=lambda frame: frame.index % 5)
)

# Fold 0 is the hold-out split; the other four folds are used for training.
p_train = train[train['kfold'].ne(0)].reset_index(drop=True)
p_valid = train[train['kfold'].eq(0)].reset_index(drop=True)

In [6]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        sentences: sequence of raw texts, indexed positionally with ``[idx]``
            (a pandas Series must therefore carry a 0..n-1 index).
        targets: sequence of numeric targets aligned with ``sentences``.
        tokenizer: object exposing ``encode_plus``. When omitted, the
            notebook-level global ``tokenizer`` is used at item-access time.
        max_length: fixed token length of every item. When omitted, the
            notebook-level global ``max_sense`` is used at item-access time.

    Each item is a dict with the keys 'ids', 'mask', 'token_type_ids'
    (long tensors) and 'targets' (float tensor).
    """

    def __init__(self, sentences, targets, tokenizer=None, max_length=None):
        self.sentences = sentences
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        # Fall back to the notebook globals so existing two-argument
        # constructions keep working.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        active_max_length = self.max_length if self.max_length is not None else max_sense

        encoded = active_tokenizer.encode_plus(
            self.sentences[idx],
            add_special_tokens=True,
            max_length=active_max_length,
            # `pad_to_max_length` is deprecated; this is its replacement.
            padding='max_length',
            # Without truncation a long text yields more than `max_length`
            # tokens and the default collate function cannot stack the batch.
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float),
        }
    

In [7]:
# Wrap the train and hold-out frames in datasets (text column 'excerpt',
# regression column 'target').
train_dataset, valid_dataset = (
    BERTDataset(split['excerpt'], split['target'])
    for split in (p_train, p_valid)
)

In [8]:
# Batch sizes for the training and validation dataloaders.
train_batch, valid_batch = 16, 32

In [9]:
# Training batches are reshuffled on every pass over the loader.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# Validation keeps the dataset order.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=valid_batch,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [10]:
# Load pretrained 'bert-base-uncased' with num_labels=1; the training and
# validation code squeezes the last logits dimension and uses it as the
# regression prediction.
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
# Moves the model to `device`; as the last expression of the cell this also
# echoes the module structure in the cell output.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Optimizer for the stand-alone model loaded above (the one exercised by the
# `debug` smoke-test cells). `AdamW` is already imported in the import cell,
# so the redundant mid-notebook re-import was dropped.
LR = 2e-5
# betas aligned with every other optimizer in this notebook, which all use
# (0.9, 0.999); this cell alone used 0.99 for the second moment.
optimizer = AdamW(model.parameters(), lr=LR, betas=(0.9, 0.999), weight_decay=1e-2)

In [13]:
def loss_fn(output, target, eps=1e-8):
    """Root-mean-squared error between ``output`` and ``target``.

    Args:
        output: predicted values (tensor).
        target: reference values, same shape as ``output``.
        eps: small constant added under the square root. The derivative of
            ``sqrt`` at 0 is infinite, so a perfect prediction would otherwise
            produce NaN gradients.

    Returns:
        Scalar tensor ``sqrt(mean((output - target) ** 2) + eps)``.
    """
    mse = nn.functional.mse_loss(output, target)
    return torch.sqrt(mse + eps)

In [16]:
def training(train_dataloader, model, optimizer, scheduler):
    """Train ``model`` for one pass over ``train_dataloader`` with mixed precision.

    Uses the notebook-level ``device``, ``scaler`` and ``loss_fn``.

    Args:
        train_dataloader: iterable of batches; each batch is a dict holding
            'ids', 'mask' and 'targets' tensors.
        model: called as ``model(ids, mask)``; the result must expose
            ``['logits']``.
        optimizer: optimizer stepped once per batch through ``scaler``.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        tuple: (mean of the per-batch losses over the whole pass,
        RMSE computed over every sample seen in the pass).
    """
    model.train()
    # Accumulated across the whole pass. The original re-created this list
    # inside the loop, so the reported mean covered only the final batch.
    batch_losses = []
    all_preds = []
    all_targets = []

    for batch in train_dataloader:
        ids = batch['ids'].to(device, non_blocking=True)
        mask = batch['mask'].to(device, non_blocking=True)
        # reshape(-1) keeps the target 1-D even for a batch of one sample;
        # squeeze(-1) collapsed it to 0-d and broke np.concatenate below.
        target = batch['targets'].to(device, non_blocking=True).reshape(-1)
        # 'token_type_ids' is intentionally not transferred: the model call
        # never received it.

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            output = model(ids, mask)['logits'].squeeze(-1)
            loss = loss_fn(output, target)

        scaler.scale(loss).backward()  # backward pass on the scaled loss
        scaler.step(optimizer)  # optimizer update (skipped on inf/NaN gradients)
        scaler.update()  # adjust the loss scale for the next step
        scheduler.step()  # advance the learning-rate schedule

        batch_losses.append(loss.item())
        # Cast to float32: under autocast the predictions may be half precision.
        all_preds.append(output.detach().float().cpu().numpy())
        all_targets.append(target.detach().float().cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)

    losses = np.mean(batch_losses)
    train_rme_loss = np.sqrt(mean_squared_error(all_targets, all_preds))

    return losses, train_rme_loss

In [19]:
if debug:
    # Smoke test: one training pass with the stand-alone model/optimizer.
    # The schedule length is derived here because `num_steps` / `train_steps`
    # are only assigned by later cells (NameError on a fresh kernel).
    debug_total_steps = len(train_dataloader)  # one scheduler step per batch
    debug_warmup_steps = int(debug_total_steps * 0.1)
    scheduler = get_linear_schedule_with_warmup(optimizer, debug_warmup_steps, debug_total_steps)
    losses, train_rme_loss = training(train_dataloader, model, optimizer, scheduler)
    print(losses, train_rme_loss)

In [20]:
def validating(valid_dataloader, model):
    """Evaluate ``model`` on ``valid_dataloader`` without gradient tracking.

    Uses the notebook-level ``device`` and ``loss_fn``.

    Args:
        valid_dataloader: iterable of batches; each batch is a dict holding
            'ids', 'mask' and 'targets' tensors.
        model: called as ``model(ids, mask)``; the result must expose
            ``['logits']``.

    Returns:
        tuple: (concatenated predictions as a NumPy array,
        mean of the per-batch losses over the whole pass,
        RMSE computed over every validation sample).
    """
    model.eval()
    # Accumulated across the whole pass. The original re-created this list
    # inside the loop, so the reported mean covered only the final batch.
    batch_losses = []
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in valid_dataloader:
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            # reshape(-1) keeps the target 1-D even for a batch of one sample;
            # squeeze(-1) collapsed it to 0-d and broke np.concatenate below.
            target = batch['targets'].to(device).reshape(-1)

            output = model(ids, mask)['logits'].squeeze(-1)
            loss = loss_fn(output, target)

            batch_losses.append(loss.item())
            all_preds.append(output.float().cpu().numpy())
            all_targets.append(target.float().cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)

    losses = np.mean(batch_losses)
    valid_rme_loss = np.sqrt(mean_squared_error(all_targets, all_preds))

    return all_preds, losses, valid_rme_loss

In [21]:
if debug:
    # Smoke test: one validation pass, then show the first three predictions,
    # the mean batch loss and the overall RMSE, one per line.
    all_preds, losses, valid_rme_loss = validating(valid_dataloader, model)
    print(all_preds[:3], losses, valid_rme_loss, sep='\n')

In [None]:
# ---- Fold 0 setup: data, model, optimizer, schedule ----
# The fold column is created as 'kfold' (lower case); 'Kfold' raised KeyError.
p_train = train[train['kfold'] != 0].reset_index(drop=True)
p_valid = train[train['kfold'] == 0].reset_index(drop=True)

# Rebuild datasets from the split, then the loaders from the datasets. The
# original overwrote the dataset variables with DataLoader objects and left
# `train_dataloader` / `valid_dataloader` untouched.
train_dataset = BERTDataset(p_train['excerpt'], p_train['target'])
valid_dataset = BERTDataset(p_valid['excerpt'], p_valid['target'])

train_dataloader = DataLoader(train_dataset, batch_size=train_batch, shuffle=True, num_workers=4, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=valid_batch, shuffle=False, num_workers=4, pin_memory=True)

model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
model.to(device)

LR = 2e-5
optimizer = AdamW(model.parameters(), lr=LR, betas=(0.9, 0.999), weight_decay=1e-2)

# NOTE(review): `epochs` is not assigned in any visible cell - it must be
# defined in the configuration before this cell runs.
# The scheduler is stepped once per batch, so its horizon is
# batches-per-epoch * epochs (the last, partial batch counts as a step too).
train_steps = len(train_dataloader) * epochs
num_steps = int(train_steps * 0.1)  # warm-up over the first 10% of the steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

In [None]:
# ---- Fold 0: build everything and train ----
# This cell was an exact copy of the fold-0 setup cell, while the fold-0
# training loop (which must define `train_losses`, `val_losses`,
# `train_scores`, `valid_scores` and `bestscore` for the plotting and
# cross-validation cells below) was missing. It now performs the setup and
# runs that loop, mirroring the per-fold loop used for folds 1-4.

# The fold column is created as 'kfold' (lower case); 'Kfold' raised KeyError.
p_train = train[train['kfold'] != 0].reset_index(drop=True)
p_valid = train[train['kfold'] == 0].reset_index(drop=True)

train_dataset = BERTDataset(p_train['excerpt'], p_train['target'])
valid_dataset = BERTDataset(p_valid['excerpt'], p_valid['target'])

train_dataloader = DataLoader(train_dataset, batch_size=train_batch, shuffle=True, num_workers=4, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=valid_batch, shuffle=False, num_workers=4, pin_memory=True)

model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
model.to(device)

LR = 2e-5
optimizer = AdamW(model.parameters(), lr=LR, betas=(0.9, 0.999), weight_decay=1e-2)

# NOTE(review): `epochs` is not assigned in any visible cell - it must be
# defined in the configuration before this cell runs.
# The scheduler is stepped once per batch, so its horizon is
# batches-per-epoch * epochs.
train_steps = len(train_dataloader) * epochs
num_steps = int(train_steps * 0.1)  # warm-up over the first 10% of the steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

train_losses, val_losses = [], []
train_scores, valid_scores = [], []
bestscore = None

for epoch in tqdm(range(epochs)):
    train_loss, train_score = training(train_dataloader, model, optimizer, scheduler)
    preds, valid_loss, valid_score = validating(valid_dataloader, model)

    train_losses.append(train_loss)
    train_scores.append(train_score)
    val_losses.append(valid_loss)
    valid_scores.append(valid_score)

    print("epoch " + str(epoch) + ": trainscore is " + str(train_score) + ", valscore is " + str(valid_score))

    # Checkpoint whenever the validation RMSE improves (lower is better).
    if bestscore is None or valid_score < bestscore:
        bestscore = valid_score
        state = {
            'state_dict': model.state_dict(),
            'optimizer_dict': optimizer.state_dict(),
            'bestscore': bestscore,
        }
        torch.save(state, "model0.pth")

In [None]:
# Per-epoch loss curves for the training and validation splits.
x = np.arange(epochs)
fig, ax = plt.subplots()
ax.plot(x, train_losses, label='train')
ax.plot(x, val_losses, label='valid')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.legend()

In [None]:
# Per-epoch score (RMSE) curves for the training and validation splits.
x = np.arange(epochs)
fig, ax = plt.subplots()
ax.plot(x, train_scores, label='train')
ax.plot(x, valid_scores, label='valid')
ax.set_xlabel('epoch')
ax.set_ylabel('score')
ax.legend()

In [None]:
# Cross-validation over folds 1-4. The list starts with `bestscore`, i.e. the
# best validation score of the fold-0 run.
# NOTE(review): `bestscore` (fold 0) and `epochs` must already exist when this
# cell runs; neither is assigned in a visible cell above.
bestscores = [bestscore]

for fold in range(1, 5):

    # initializing the data
    # The fold column is created as 'kfold' (lower case); "Kfold" raised KeyError.
    p_train = train[train["kfold"] != fold].reset_index(drop=True)
    p_valid = train[train["kfold"] == fold].reset_index(drop=True)

    train_dataset = BERTDataset(p_train["excerpt"], p_train["target"])
    valid_dataset = BERTDataset(p_valid["excerpt"], p_valid["target"])

    train_dataloader = DataLoader(train_dataset, batch_size=train_batch, shuffle=True, num_workers=4, pin_memory=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=valid_batch, shuffle=False, num_workers=4, pin_memory=True)

    # Fresh model, optimizer and schedule for every fold.
    model = transformers.BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
    model.to(device)

    LR = 2e-5
    optimizer = AdamW(model.parameters(), LR, betas=(0.9, 0.999), weight_decay=1e-2)  # AdamW optimizer

    # The scheduler is stepped once per batch, so its horizon is
    # batches-per-epoch * epochs (the last, partial batch counts as a step too).
    train_steps = len(train_dataloader) * epochs
    num_steps = int(train_steps * 0.1)  # warm-up over the first 10% of the steps
    scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

    trainlosses = []
    vallosses = []
    bestscore = None

    trainscores = []
    validscores = []

    for epoch in tqdm(range(epochs)):

        print("---------------" + str(epoch) + "start-------------")

        trainloss, trainscore = training(train_dataloader, model, optimizer, scheduler)
        trainlosses.append(trainloss)
        trainscores.append(trainscore)
        print("trainscore is " + str(trainscore))

        preds, validloss, valscore = validating(valid_dataloader, model)
        vallosses.append(validloss)
        validscores.append(valscore)
        print("valscore is " + str(valscore))

        # Checkpoint on the first epoch and whenever the validation score
        # improves (lower is better). The two identical save branches of the
        # original are merged into one.
        if bestscore is None or valscore < bestscore:
            print("Save first model" if bestscore is None else "found better point")
            bestscore = valscore

            state = {
                'state_dict': model.state_dict(),
                'optimizer_dict': optimizer.state_dict(),
                "bestscore": bestscore,
            }
            torch.save(state, "model" + str(fold) + ".pth")

    bestscores.append(bestscore)

In [None]:
# Display the best validation score collected for each fold.
bestscores

In [None]:
# Average of the per-fold best validation scores.
np.mean(bestscores)