In [None]:
import sys
import os
import time
import random
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
from transformers import (AutoModel, AutoModelForMaskedLM, AutoTokenizer, 
    LineByLineTextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, get_linear_schedule_with_warmup)

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable results.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Exported for child processes; the hash seed of the running interpreter
    # was already fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can select different kernels from run to run,
    # so it has to be switched off as well for repeatable results.
    torch.backends.cudnn.benchmark = False

class CFG:
    """Static settings shared by the cells of this notebook."""

    # --- reproducibility / runtime ---
    random_seed = 2021
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # --- pretrained backbone ---
    # Alternative backbone kept for reference:
    #   '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased/'
    model_name_or_path = '../input/roberta-transformers-pytorch/roberta-base'
    model_output_dir = './'

    # --- tokenisation / batching ---
    # Excerpts are padded/truncated to max_seq_length tokens; the data loaders
    # in this notebook use twice batch_size because they only run inference.
    max_seq_length = 512
    seq_length = 100
    batch_size = 8

    # --- training-time options (not read in the visible part of this notebook) ---
    learning_rate = 2e-5
    use_lr_scheduler = True
    mid_eval = True
    mid_eval_step_num = 50
    clip_by_train_range = False
    
# Instantiate the config and seed all RNGs before any data is sampled or any model is built.
cfg = CFG()
seed_everything(cfg.random_seed)
# When True, the data-loading cell down-samples the training set to 300 rows for a fast smoke run.
# NOTE(review): presumably this should be False for a real submission — confirm.
QUICK_CHECK = True

# Wall-clock start; the submission cell uses it to report the total run time.
global_start_t = time.time()
print('ok')

In [None]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Load the competition train/test tables.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
if QUICK_CHECK:
    # Smoke-run mode: keep 300 random rows. No random_state is passed, so the draw
    # relies on the global NumPy seed set earlier by seed_everything().
    train_data = train_data.sample(300).reset_index(drop=True)

# Discretise the continuous target into floor(1 + log2(n)) equal-width bins;
# the bin labels are later used to stratify the cross-validation folds.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)
target = train_data['target'].to_numpy()
bins = train_data['bins'].to_numpy()
train_target_min, train_target_max = train_data['target'].min(), train_data['target'].max()

print(f'train_data.shape: {train_data.shape}, test_data.shape: {test_data.shape}')
# Last expression of the cell: displays the observed target range.
train_target_min, train_target_max
# Recorded output, presumably from an earlier run: (-3.676267773, 1.7113898269999999)

In [None]:
# Show how many training rows fall into each target bin used for stratification.
train_data['bins'].value_counts()

In [None]:
# Preview the first rows of the test table.
test_data.head(10)

In [None]:
class CLRP_Dataset(Dataset):
    """Map-style dataset that tokenizes the ``excerpt`` column of a DataFrame.

    Every item is padded/truncated to ``max_seq_length`` tokens and carries the
    number of attended positions so that a collate function can trim the batch.
    """

    def __init__(self, data, tokenizer, max_seq_length):
        """Store the source frame, the tokenizer callable and the padded length."""
        self.df = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the excerpt at positional ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` as produced by the
            tokenizer, plus ``length``, the sum of the attention mask.
        """
        row = self.df.iloc[index]
        encoded = self.tokenizer(
            row['excerpt'],
            padding="max_length",
            max_length=self.max_seq_length,
            truncation=True,
        )
        mask = encoded['attention_mask']
        return {
            "input_ids": encoded['input_ids'],
            "attention_mask": mask,
            "length": sum(mask),
        }

    
def collate_fn_test(batch):
    """Collate tokenized samples into tensors trimmed to the longest sample.

    Args:
        batch: list of dicts with ``input_ids``, ``attention_mask``, ``length``
            and optionally ``token_type_ids``.

    Returns:
        dict of tensors keyed by ``input_ids`` and ``attention_mask`` (plus
        ``token_type_ids`` when the first sample provides it), each cut to the
        largest ``length`` found in the batch.
    """
    longest = max(sample['length'] for sample in batch)

    fields = ['input_ids', 'attention_mask']
    # Segment ids are only forwarded when the samples actually carry them.
    if 'token_type_ids' in batch[0]:
        fields.append('token_type_ids')

    return {
        name: torch.tensor([sample[name][:longest] for sample in batch])
        for name in fields
    }
    
# Load the tokenizer that belongs to the configured backbone and expose it on cfg as well.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name_or_path)
cfg.tokenizer = tokenizer

# Test-set dataset/loader. Inference only: no shuffling, twice the configured batch size.
# NOTE(review): ds_test/dl_test are not used by the later visible cells — get_embeddings()
# builds its own loader.
ds_test = CLRP_Dataset(test_data, tokenizer, cfg.max_seq_length)
dl_test = DataLoader(ds_test, batch_size=2*cfg.batch_size, shuffle=False, collate_fn=collate_fn_test, num_workers=0)
print('len of ds_test: ', len(ds_test), 'len of dl_test: ', len(dl_test))

print('ok')

In [None]:
class CLRP_model_V1(nn.Module):
    """Transformer backbone with a two-layer regression head on pooled tokens.

    The last hidden states are mean- and max-pooled over the sequence axis,
    concatenated, and passed through ``fc1 -> ReLU -> fc2`` to produce one
    score per input sequence.

    Args:
        pretrained_model_path: Path or name handed to ``AutoModel.from_pretrained``.
        embedding_dim: Width of the hidden layer of the regression head.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        # Read the width from the loaded config instead of guessing it from the
        # path: a directory name that merely contains "large" must not change
        # the head size.
        hidden_size = self.bert.config.hidden_size
        self.drop_out = nn.Dropout(0.1)
        # Mean- and max-pooled vectors are concatenated, hence twice the width.
        self.fc1 = nn.Linear(hidden_size * 2, embedding_dim)
        self.activation1 = nn.ReLU()
        self.fc2 = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per sequence in the batch."""
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)

        last_hidden_state = self.drop_out(output['last_hidden_state'])
        # NOTE(review): pooling runs over every position of the sequence axis;
        # attention_mask is not applied here, so padded positions contribute.
        seq_avg = torch.mean(last_hidden_state, dim=1)
        seq_max = torch.max(last_hidden_state, dim=1)[0]
        concat_out = torch.cat((seq_avg, seq_max), dim=1)
        preds = self.fc2(self.activation1(self.fc1(concat_out)))
        # Critical: drop the trailing size-1 dimension so predictions are
        # (batch,) rather than (batch, 1). The second squeeze additionally
        # collapses a batch of one to a 0-d tensor.
        preds = preds.squeeze(-1).squeeze(-1)

        return preds
    
class CLRP_model_V2(nn.Module):
    """Transformer backbone with a single linear layer on the pooled output.

    Args:
        pretrained_model_path: Path or name handed to ``AutoModel.from_pretrained``.
        embedding_dim: Unused by this variant; kept so both model variants
            share the same constructor signature.
    """

    def __init__(self, pretrained_model_path, embedding_dim=256):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        # Read the width from the loaded config instead of guessing it from the
        # path: a directory name that merely contains "large" must not change
        # the head size.
        hidden_size = self.bert.config.hidden_size
        self.drop_out = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size, 1)

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None):
        """Return one prediction per sequence in the batch."""
        output = self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True)

        pooled_output = self.drop_out(output['pooler_output'])
        preds = self.fc1(pooled_output)
        # Critical: drop the trailing size-1 dimension so predictions are
        # (batch,) rather than (batch, 1). The second squeeze additionally
        # collapses a batch of one to a 0-d tensor.
        preds = preds.squeeze(-1).squeeze(-1)

        return preds
    
# Select which model variant the rest of the notebook instantiates.
CLRP_model = CLRP_model_V2

# Build the model once only to report its parameter counts; it is deleted again below.
model = CLRP_model(cfg.model_name_or_path)
model.to(cfg.device)

# Optional: load previously fine-tuned weights (disabled).
# model_path = '../input/clrp-pytorch-roberta-base-my-first-model-train/best_model.pth'
# model.load_state_dict(torch.load(model_path))

model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ', 
      model_trainable_param_num)
# Drop the reference; get_embeddings() creates its own model per checkpoint.
del model
            
print('ok')

In [None]:
def get_embeddings(df, model_path=cfg.model_name_or_path):
    """Compute backbone pooled-output embeddings for every excerpt in ``df``.

    A fresh ``CLRP_model`` is built, the state dict stored at ``model_path`` is
    loaded into it, and ``df`` is run through the backbone (``model.bert``) in
    evaluation mode without gradients.

    Args:
        df: DataFrame with an ``excerpt`` column (consumed by ``CLRP_Dataset``).
        model_path: File passed to ``torch.load``; expected to hold a state
            dict for ``CLRP_model``.
            NOTE(review): the default is the pretrained-backbone path from the
            config, not a checkpoint file; every call in this notebook passes
            an explicit checkpoint path.

    Returns:
        ``np.ndarray`` holding one ``pooler_output`` vector per row of ``df``,
        in row order.
    """
    print('get into get_embeddings()')

    # Keep the network local: the notebook-level name `model` is neither
    # overwritten nor deleted by this function.
    net = CLRP_model(cfg.model_name_or_path)
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    net.load_state_dict(torch.load(model_path, map_location=cfg.device))
    net.to(cfg.device)
    net.eval()

    ds = CLRP_Dataset(df, tokenizer, cfg.max_seq_length)
    dl = DataLoader(ds, batch_size=2*cfg.batch_size, shuffle=False,
                    collate_fn=collate_fn_test, num_workers=0)

    embeddings = []
    with torch.no_grad():
        for inputs in dl:
            inputs = {key: val.reshape(val.shape[0], -1).to(cfg.device) for key, val in inputs.items()}
            outputs = net.bert(**inputs, return_dict=True)
            # Extending with a 2-D array appends one row vector per sample.
            embeddings.extend(outputs['pooler_output'].cpu().numpy())
    del net
    return np.array(embeddings)

def get_preds_svm(X, y, X_test, bins=bins, nfolds=5, C=10, kernel='rbf', gamma='auto'):
    """Cross-validate an SVR on ``X``/``y`` and average its test predictions.

    One ``SVR`` is fitted per stratified fold; each fitted model is scored on
    its validation split and also predicts ``X_test``. The test predictions of
    all folds are averaged.

    Args:
        X: Training feature matrix, indexable by integer index arrays.
        y: Training targets aligned with ``X``.
        X_test: Test feature matrix.
        bins: Discrete labels aligned with ``X`` used to stratify the folds.
            The default is bound once, when this function is defined, to the
            notebook-level ``bins`` array; re-run the defining cell if the
            training data changes.
        nfolds: Number of cross-validation folds.
        C, kernel, gamma: Forwarded to ``sklearn.svm.SVR``.

    Returns:
        Tuple ``(mean_valid_rmse, test_preds, params)``: the mean validation
        RMSE over the folds, the fold-averaged test predictions, and a dict of
        the SVR hyper-parameters used.
    """
    print('*****'*10 + ' in get_preds_svm() ' + '*****'*10)
    print(f'C: {C}, kernel: {kernel}, gamma: {gamma}')
    scores = []
    preds = np.zeros((X_test.shape[0]))

    # The number of splits must match the divisor used for averaging below.
    kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=42)
    for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):
        model = SVR(C=C, kernel=kernel, gamma=gamma)
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train, y_train)
        prediction = model.predict(X_valid)
        score = rmse_score(y_valid, prediction)
        print(f'Fold {k}, rmse_score: {score:.7f}')
        scores.append(score)
        preds += model.predict(X_test)

    mean_valid_rmse = np.mean(scores)
    params = {'C': C, 'kernel': kernel, 'gamma': gamma}
    print(f'mean_valid_rmse: {mean_valid_rmse:.7f}')
    print('*****'*22)
    return mean_valid_rmse, preds / nfolds, params

# Marker printed once the helper definitions of this cell have been executed.
print('ok')

In [None]:
# Extract train/test embeddings with each of the five fold checkpoints.
# One loop replaces five copy-pasted stanzas; the call order (train, then test,
# then the shape report, fold by fold) and the printed text are unchanged.
fold_ckpt_template = '../input/clrp-roberta-base-my-first-model-5fold-cv/best_model_fold{}.pth'

fold_train_embeddings, fold_test_embeddings = [], []
for fold in range(5):
    fold_ckpt_path = fold_ckpt_template.format(fold)
    fold_train = get_embeddings(train_data, fold_ckpt_path)
    fold_test = get_embeddings(test_data, fold_ckpt_path)
    print(f'train_embeddings{fold + 1}.shape: {fold_train.shape}  test_embeddings{fold + 1}.shape: {fold_test.shape}')
    fold_train_embeddings.append(fold_train)
    fold_test_embeddings.append(fold_test)

# Keep the numbered names that the following cell stacks together.
(train_embeddings1, train_embeddings2, train_embeddings3,
 train_embeddings4, train_embeddings5) = fold_train_embeddings
(test_embeddings1, test_embeddings2, test_embeddings3,
 test_embeddings4, test_embeddings5) = fold_test_embeddings

In [None]:
# Concatenate the per-fold embeddings column-wise so each sample gets one long feature vector.
train_embeddings = np.hstack([train_embeddings1, train_embeddings2, train_embeddings3, train_embeddings4, train_embeddings5])
test_embeddings = np.hstack([test_embeddings1, test_embeddings2, test_embeddings3, test_embeddings4, test_embeddings5])

# Last expression of the cell: displays both shapes.
train_embeddings.shape, test_embeddings.shape

In [None]:
# Grid search over SVR hyper-parameters; keeps the test predictions of the
# setting with the lowest mean cross-validated RMSE.
best_mean_valid_rmse, best_svm_preds = 1e10, None
best_params = None

for C in (1, 1e1, 1e2, 1e3):
    for kernel in ('poly', 'rbf', 'sigmoid'):
        # Only gamma='auto' is searched; the loop keeps the grid easy to extend.
        for gamma in ['auto']:
            mean_valid_rmse, svm_preds, params = get_preds_svm(train_embeddings, target, 
                                                                test_embeddings, C=C, kernel=kernel, 
                                                                gamma=gamma)
            # Strict comparison: on ties the earlier setting is kept.
            if mean_valid_rmse < best_mean_valid_rmse:
                best_mean_valid_rmse = mean_valid_rmse
                best_svm_preds = svm_preds
                best_params = params
                print(f'get new best_mean_valid_rmse: {mean_valid_rmse:.7f}, best_params: {best_params}')
                
print(f'final best_mean_valid_rmse: {best_mean_valid_rmse:.7f}, best_params: {best_params}')
# Recorded results, presumably from earlier runs:
# final best_mean_valid_rmse: 0.3033951, best_params: {'C': 10.0, 'kernel': 'rbf', 'gamma': 'auto'}
# final best_mean_valid_rmse: 0.2524015, best_params: {'C': 10.0, 'kernel': 'sigmoid', 'gamma': 'auto'}

In [None]:
# Build and write the submission file from the best grid-search predictions.
if best_svm_preds is None:
    # Happens when no grid-search setting produced a finite validation score.
    raise RuntimeError('no SVR predictions available: the grid search did not select any setting')

# Take an explicit copy so that adding a column does not operate on a slice of
# test_data (the SettingWithCopy pattern, hidden here by the global warnings filter).
submission_df = test_data[['id']].copy()
submission_df['target'] = best_svm_preds
submission_df.to_csv('./submission.csv', index=False)

print('total cost time: ', time.time() - global_start_t)

In [None]:
# Preview the first rows of the submission that was just written.
submission_df.head(10)